Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks  103
-rw-r--r--  kernel/Makefile  85
-rw-r--r--  kernel/acct.c  10
-rw-r--r--  kernel/audit.c  144
-rw-r--r--  kernel/audit.h  11
-rw-r--r--  kernel/audit_watch.c  5
-rw-r--r--  kernel/auditfilter.c  202
-rw-r--r--  kernel/auditsc.c  612
-rw-r--r--  kernel/cgroup.c  1088
-rw-r--r--  kernel/cgroup_freezer.c  506
-rw-r--r--  kernel/compat.c  17
-rw-r--r--  kernel/context_tracking.c  83
-rw-r--r--  kernel/cpu.c  38
-rw-r--r--  kernel/cpuset.c  122
-rw-r--r--  kernel/cred.c  164
-rw-r--r--  kernel/debug/debug_core.c  32
-rw-r--r--  kernel/debug/kdb/kdb_bt.c  2
-rw-r--r--  kernel/debug/kdb/kdb_io.c  33
-rw-r--r--  kernel/debug/kdb/kdb_main.c  33
-rw-r--r--  kernel/events/callchain.c  38
-rw-r--r--  kernel/events/core.c  326
-rw-r--r--  kernel/events/hw_breakpoint.c  12
-rw-r--r--  kernel/events/internal.h  82
-rw-r--r--  kernel/events/ring_buffer.c  10
-rw-r--r--  kernel/events/uprobes.c  588
-rw-r--r--  kernel/exit.c  206
-rw-r--r--  kernel/fork.c  203
-rw-r--r--  kernel/freezer.c  11
-rw-r--r--  kernel/futex.c  59
-rw-r--r--  kernel/irq/chip.c  2
-rw-r--r--  kernel/irq/dummychip.c  2
-rw-r--r--  kernel/irq/irqdomain.c  33
-rw-r--r--  kernel/irq/manage.c  43
-rw-r--r--  kernel/irq/resend.c  8
-rw-r--r--  kernel/jump_label.c  1
-rw-r--r--  kernel/kcmp.c  1
-rw-r--r--  kernel/kexec.c  1
-rw-r--r--  kernel/kmod.c  13
-rw-r--r--  kernel/kprobes.c  247
-rw-r--r--  kernel/ksysfs.c  23
-rw-r--r--  kernel/kthread.c  188
-rw-r--r--  kernel/lockdep.c  39
-rw-r--r--  kernel/lockdep_proc.c  2
-rw-r--r--  kernel/modsign_certificate.S  19
-rw-r--r--  kernel/modsign_pubkey.c  104
-rw-r--r--  kernel/module-internal.h  14
-rw-r--r--  kernel/module.c  578
-rw-r--r--  kernel/module_signing.c  249
-rw-r--r--  kernel/nsproxy.c  36
-rw-r--r--  kernel/padata.c  5
-rw-r--r--  kernel/pid.c  67
-rw-r--r--  kernel/pid_namespace.c  146
-rw-r--r--  kernel/posix-cpu-timers.c  27
-rw-r--r--  kernel/power/Kconfig  4
-rw-r--r--  kernel/power/main.c  2
-rw-r--r--  kernel/power/poweroff.c  2
-rw-r--r--  kernel/power/process.c  15
-rw-r--r--  kernel/power/qos.c  66
-rw-r--r--  kernel/power/swap.c  2
-rw-r--r--  kernel/printk.c  53
-rw-r--r--  kernel/profile.c  7
-rw-r--r--  kernel/ptrace.c  16
-rw-r--r--  kernel/rcu.h  2
-rw-r--r--  kernel/rcupdate.c  7
-rw-r--r--  kernel/rcutiny.c  35
-rw-r--r--  kernel/rcutiny_plugin.h  15
-rw-r--r--  kernel/rcutorture.c  213
-rw-r--r--  kernel/rcutree.c  1122
-rw-r--r--  kernel/rcutree.h  117
-rw-r--r--  kernel/rcutree_plugin.h  1012
-rw-r--r--  kernel/rcutree_trace.c  340
-rw-r--r--  kernel/res_counter.c  42
-rw-r--r--  kernel/resource.c  50
-rw-r--r--  kernel/sched/Makefile  2
-rw-r--r--  kernel/sched/core.c  901
-rw-r--r--  kernel/sched/cputime.c  589
-rw-r--r--  kernel/sched/debug.c  36
-rw-r--r--  kernel/sched/fair.c  1211
-rw-r--r--  kernel/sched/features.h  26
-rw-r--r--  kernel/sched/rt.c  5
-rw-r--r--  kernel/sched/sched.h  139
-rw-r--r--  kernel/seccomp.c  13
-rw-r--r--  kernel/signal.c  130
-rw-r--r--  kernel/smpboot.c  233
-rw-r--r--  kernel/smpboot.h  4
-rw-r--r--  kernel/softirq.c  117
-rw-r--r--  kernel/srcu.c  20
-rw-r--r--  kernel/sys.c  35
-rw-r--r--  kernel/sys_ni.c  1
-rw-r--r--  kernel/sysctl.c  70
-rw-r--r--  kernel/sysctl_binary.c  2
-rw-r--r--  kernel/task_work.c  111
-rw-r--r--  kernel/taskstats.c  39
-rw-r--r--  kernel/time.c  2
-rw-r--r--  kernel/time/Kconfig  4
-rw-r--r--  kernel/time/Makefile  2
-rw-r--r--  kernel/time/alarmtimer.c  118
-rw-r--r--  kernel/time/clockevents.c  24
-rw-r--r--  kernel/time/jiffies.c  40
-rw-r--r--  kernel/time/tick-common.c  8
-rw-r--r--  kernel/time/tick-internal.h  1
-rw-r--r--  kernel/time/tick-sched.c  143
-rw-r--r--  kernel/time/timecompare.c  193
-rw-r--r--  kernel/time/timekeeping.c  183
-rw-r--r--  kernel/timer.c  118
-rw-r--r--  kernel/trace/Kconfig  11
-rw-r--r--  kernel/trace/Makefile  8
-rw-r--r--  kernel/trace/ftrace.c  336
-rw-r--r--  kernel/trace/ring_buffer.c  73
-rw-r--r--  kernel/trace/trace.c  494
-rw-r--r--  kernel/trace/trace.h  23
-rw-r--r--  kernel/trace/trace_branch.c  4
-rw-r--r--  kernel/trace/trace_event_perf.c  3
-rw-r--r--  kernel/trace/trace_events.c  167
-rw-r--r--  kernel/trace/trace_events_filter.c  6
-rw-r--r--  kernel/trace/trace_functions.c  36
-rw-r--r--  kernel/trace/trace_functions_graph.c  11
-rw-r--r--  kernel/trace/trace_irqsoff.c  21
-rw-r--r--  kernel/trace/trace_kprobe.c  10
-rw-r--r--  kernel/trace/trace_output.c  78
-rw-r--r--  kernel/trace/trace_probe.c  14
-rw-r--r--  kernel/trace/trace_sched_switch.c  4
-rw-r--r--  kernel/trace/trace_sched_wakeup.c  17
-rw-r--r--  kernel/trace/trace_selftest.c  305
-rw-r--r--  kernel/trace/trace_stack.c  8
-rw-r--r--  kernel/trace/trace_syscalls.c  63
-rw-r--r--  kernel/trace/trace_uprobe.c  12
-rw-r--r--  kernel/tsacct.c  12
-rw-r--r--  kernel/user.c  10
-rw-r--r--  kernel/user_namespace.c  275
-rw-r--r--  kernel/utsname.c  34
-rw-r--r--  kernel/wait.c  2
-rw-r--r--  kernel/watchdog.c  276
-rw-r--r--  kernel/workqueue.c  1235
134 files changed, 11155 insertions, 6708 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 2251882daf53..44511d100eaa 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -87,6 +87,9 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ
 config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 	bool
 
+config UNINLINE_SPIN_UNLOCK
+	bool
+
 #
 # lock_* functions are inlined when:
 # - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
@@ -103,100 +106,120 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
 #
 
+if !DEBUG_SPINLOCK
+
 config INLINE_SPIN_TRYLOCK
-	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
+	def_bool y
+	depends on ARCH_INLINE_SPIN_TRYLOCK
 
 config INLINE_SPIN_TRYLOCK_BH
-	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
+	def_bool y
+	depends on ARCH_INLINE_SPIN_TRYLOCK_BH
 
 config INLINE_SPIN_LOCK
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
 
 config INLINE_SPIN_LOCK_BH
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
-		 ARCH_INLINE_SPIN_LOCK_BH
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH
 
 config INLINE_SPIN_LOCK_IRQ
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
-		 ARCH_INLINE_SPIN_LOCK_IRQ
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ
 
 config INLINE_SPIN_LOCK_IRQSAVE
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
-		 ARCH_INLINE_SPIN_LOCK_IRQSAVE
-
-config UNINLINE_SPIN_UNLOCK
-	bool
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE
 
 config INLINE_SPIN_UNLOCK_BH
-	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
+	def_bool y
+	depends on ARCH_INLINE_SPIN_UNLOCK_BH
 
 config INLINE_SPIN_UNLOCK_IRQ
-	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
+	def_bool y
+	depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH
 
 config INLINE_SPIN_UNLOCK_IRQRESTORE
-	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+	def_bool y
+	depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
 
 
 config INLINE_READ_TRYLOCK
-	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
+	def_bool y
+	depends on ARCH_INLINE_READ_TRYLOCK
 
 config INLINE_READ_LOCK
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
 
 config INLINE_READ_LOCK_BH
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
-		 ARCH_INLINE_READ_LOCK_BH
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH
 
 config INLINE_READ_LOCK_IRQ
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
-		 ARCH_INLINE_READ_LOCK_IRQ
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ
 
 config INLINE_READ_LOCK_IRQSAVE
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
-		 ARCH_INLINE_READ_LOCK_IRQSAVE
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE
 
 config INLINE_READ_UNLOCK
-	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
+	def_bool y
+	depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
 
 config INLINE_READ_UNLOCK_BH
-	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
+	def_bool y
+	depends on ARCH_INLINE_READ_UNLOCK_BH
 
 config INLINE_READ_UNLOCK_IRQ
-	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
+	def_bool y
+	depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH
 
 config INLINE_READ_UNLOCK_IRQRESTORE
-	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+	def_bool y
+	depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE
 
 
 config INLINE_WRITE_TRYLOCK
-	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
+	def_bool y
+	depends on ARCH_INLINE_WRITE_TRYLOCK
 
 config INLINE_WRITE_LOCK
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
 
 config INLINE_WRITE_LOCK_BH
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
-		 ARCH_INLINE_WRITE_LOCK_BH
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH
 
 config INLINE_WRITE_LOCK_IRQ
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
-		 ARCH_INLINE_WRITE_LOCK_IRQ
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ
 
 config INLINE_WRITE_LOCK_IRQSAVE
-	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
-		 ARCH_INLINE_WRITE_LOCK_IRQSAVE
+	def_bool y
+	depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE
 
 config INLINE_WRITE_UNLOCK
-	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
+	def_bool y
+	depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
 
 config INLINE_WRITE_UNLOCK_BH
-	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
+	def_bool y
+	depends on ARCH_INLINE_WRITE_UNLOCK_BH
 
 config INLINE_WRITE_UNLOCK_IRQ
-	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
+	def_bool y
+	depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH
 
 config INLINE_WRITE_UNLOCK_IRQRESTORE
-	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+	def_bool y
+	depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+
+endif
 
 config MUTEX_SPIN_ON_OWNER
-	def_bool SMP && !DEBUG_MUTEXES
+	def_bool y
+	depends on SMP && !DEBUG_MUTEXES
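Editor's note: the hunk above rewrites every "def_bool <expr>" as "def_bool y" plus a "depends on" line, with the shared !DEBUG_SPINLOCK term factored into one if/endif block. As a rough illustration only (the macro names and the header/source split are assumptions for this note, not taken from the patch), a generated CONFIG_INLINE_* symbol is typically consumed like this in C:

/*
 * Illustrative sketch, not the kernel's actual headers: a CONFIG_INLINE_*
 * symbol usually decides whether a lock API call collapses to the arch
 * inline helper at the call site or goes through one shared out-of-line copy.
 */
#ifdef CONFIG_INLINE_SPIN_TRYLOCK
#define _raw_spin_trylock(lock)	__raw_spin_trylock(lock)	/* inlined */
#else
int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock);		/* out of line */
#endif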
diff --git a/kernel/Makefile b/kernel/Makefile
index c0cc67ad764c..6c072b6da239 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o cred.o \
-	    async.o range.o groups.o lglock.o
+	    async.o range.o groups.o lglock.o smpboot.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
@@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += smp.o
-obj-$(CONFIG_SMP) += smpboot.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
@@ -55,6 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
@@ -98,7 +98,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
-obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_TRACE_CLOCK) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
@@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
@@ -131,3 +132,81 @@ quiet_cmd_timeconst = TIMEC $@
 targets += timeconst.h
 $(obj)/timeconst.h: $(src)/timeconst.pl FORCE
 	$(call if_changed,timeconst)
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+#
+# Pull the signing certificate and any extra certificates into the kernel
+#
+
+quiet_cmd_touch = TOUCH $@
+      cmd_touch = touch $@
+
+extra_certificates:
+	$(call cmd,touch)
+
+kernel/modsign_certificate.o: signing_key.x509 extra_certificates
+
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+sign_key_with_hash :=
+ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
+sign_key_with_hash := -sha1
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
+sign_key_with_hash := -sha224
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
+sign_key_with_hash := -sha256
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
+sign_key_with_hash := -sha384
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
+sign_key_with_hash := -sha512
+endif
+ifeq ($(sign_key_with_hash),)
+$(error Could not determine digest type to use from kernel config)
+endif
+
+signing_key.priv signing_key.x509: x509.genkey
+	@echo "###"
+	@echo "### Now generating an X.509 key pair to be used for signing modules."
+	@echo "###"
+	@echo "### If this takes a long time, you might wish to run rngd in the"
+	@echo "### background to keep the supply of entropy topped up. It"
+	@echo "### needs to be run as root, and uses a hardware random"
+	@echo "### number generator if one is available."
+	@echo "###"
+	openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
+		-x509 -config x509.genkey \
+		-outform DER -out signing_key.x509 \
+		-keyout signing_key.priv
+	@echo "###"
+	@echo "### Key pair generated."
+	@echo "###"
+
+x509.genkey:
+	@echo Generating X.509 key generation config
+	@echo >x509.genkey "[ req ]"
+	@echo >>x509.genkey "default_bits = 4096"
+	@echo >>x509.genkey "distinguished_name = req_distinguished_name"
+	@echo >>x509.genkey "prompt = no"
+	@echo >>x509.genkey "string_mask = utf8only"
+	@echo >>x509.genkey "x509_extensions = myexts"
+	@echo >>x509.genkey
+	@echo >>x509.genkey "[ req_distinguished_name ]"
+	@echo >>x509.genkey "O = Magrathea"
+	@echo >>x509.genkey "CN = Glacier signing key"
+	@echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
+	@echo >>x509.genkey
+	@echo >>x509.genkey "[ myexts ]"
+	@echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
+	@echo >>x509.genkey "keyUsage=digitalSignature"
+	@echo >>x509.genkey "subjectKeyIdentifier=hash"
+	@echo >>x509.genkey "authorityKeyIdentifier=keyid"
+endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 02e6167a53b0..051e071a06e7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 	}
 }
 
-static int acct_on(char *name)
+static int acct_on(struct filename *pathname)
 {
 	struct file *file;
 	struct vfsmount *mnt;
@@ -201,7 +201,7 @@ static int acct_on(char *name)
 	struct bsd_acct_struct *acct = NULL;
 
 	/* Difference from BSD - they don't do O_APPEND */
-	file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+	file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
@@ -260,7 +260,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		return -EPERM;
 
 	if (name) {
-		char *tmp = getname(name);
+		struct filename *tmp = getname(name);
 		if (IS_ERR(tmp))
 			return (PTR_ERR(tmp));
 		error = acct_on(tmp);
@@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	do_div(elapsed, AHZ);
 	ac.ac_btime = get_seconds() - elapsed;
 	/* we really need to bite the bullet and change layout */
-	ac.ac_uid = orig_cred->uid;
-	ac.ac_gid = orig_cred->gid;
+	ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
+	ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
 #if ACCT_VERSION==2
 	ac.ac_ahz = AHZ;
 #endif
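Editor's note: the acct.c hunks follow the struct filename API, where getname() now returns a struct filename * that is resolved once and then handed to file_open_name() instead of re-walking a char * path. A minimal kernel-style sketch of that calling pattern, mirroring the hunks above with error paths trimmed (the putname() placement is an assumption for illustration, not copied from this patch):

/* Sketch of the struct filename flow used above; not the full syscall body. */
SYSCALL_DEFINE1(acct, const char __user *, name)
{
	int error = 0;

	if (name) {
		struct filename *tmp = getname(name);	/* copy the path in once */
		if (IS_ERR(tmp))
			return PTR_ERR(tmp);
		error = acct_on(tmp);			/* opens via file_open_name(tmp, ...) */
		putname(tmp);				/* release the filename object */
	}
	return error;
}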
diff --git a/kernel/audit.c b/kernel/audit.c
index ea3b7b6191c7..40414e9143db 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,6 +61,7 @@
 #include <linux/netlink.h>
 #include <linux/freezer.h>
 #include <linux/tty.h>
+#include <linux/pid_namespace.h>
 
 #include "audit.h"
 
@@ -87,11 +88,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
 
 /*
  * If audit records are to be written to the netlink socket, audit_pid
- * contains the pid of the auditd process and audit_nlk_pid contains
- * the pid to use to send netlink messages to that process.
+ * contains the pid of the auditd process and audit_nlk_portid contains
+ * the portid to use to send netlink messages to that process.
  */
 int audit_pid;
-static int audit_nlk_pid;
+static int audit_nlk_portid;
 
 /* If audit_rate_limit is non-zero, limit the rate of sending audit records
  * to that number per second. This prevents DoS attacks, but results in
@@ -104,7 +105,7 @@ static int audit_backlog_wait_time = 60 * HZ;
 static int audit_backlog_wait_overflow = 0;
 
 /* The identity of the user shutting down the audit system. */
-uid_t audit_sig_uid = -1;
+kuid_t audit_sig_uid = INVALID_UID;
 pid_t audit_sig_pid = -1;
 u32 audit_sig_sid = 0;
 
@@ -264,7 +265,7 @@ void audit_log_lost(const char *message)
 }
 
 static int audit_log_config_change(char *function_name, int new, int old,
-				   uid_t loginuid, u32 sessionid, u32 sid,
+				   kuid_t loginuid, u32 sessionid, u32 sid,
 				   int allow_changes)
 {
 	struct audit_buffer *ab;
@@ -272,7 +273,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
 	audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
-			 old, loginuid, sessionid);
+			 old, from_kuid(&init_user_ns, loginuid), sessionid);
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
@@ -292,7 +293,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
 }
 
 static int audit_do_config_change(char *function_name, int *to_change,
-				  int new, uid_t loginuid, u32 sessionid,
+				  int new, kuid_t loginuid, u32 sessionid,
 				  u32 sid)
 {
 	int allow_changes, rc = 0, old = *to_change;
@@ -319,21 +320,21 @@ static int audit_do_config_change(char *function_name, int *to_change,
 	return rc;
 }
 
-static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
+static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid,
 				u32 sid)
 {
 	return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
 				      limit, loginuid, sessionid, sid);
 }
 
-static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
+static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid,
 				   u32 sid)
 {
 	return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
 				      limit, loginuid, sessionid, sid);
 }
 
-static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid)
 {
 	int rc;
 	if (state < AUDIT_OFF || state > AUDIT_LOCKED)
@@ -348,7 +349,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
 	return rc;
 }
 
-static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid)
 {
 	if (state != AUDIT_FAIL_SILENT
 	    && state != AUDIT_FAIL_PRINTK
@@ -401,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 	int err;
 	/* take a reference in case we can't send it and we want to hold it */
 	skb_get(skb);
-	err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
+	err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
 	if (err < 0) {
 		BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
 		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
@@ -467,24 +468,6 @@ static int kauditd_thread(void *dummy)
 	return 0;
 }
 
-static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
-{
-	struct task_struct *tsk;
-	int err;
-
-	rcu_read_lock();
-	tsk = find_task_by_vpid(pid);
-	if (!tsk) {
-		rcu_read_unlock();
-		return -ESRCH;
-	}
-	get_task_struct(tsk);
-	rcu_read_unlock();
-	err = tty_audit_push_task(tsk, loginuid, sessionid);
-	put_task_struct(tsk);
-	return err;
-}
-
 int audit_send_list(void *_dest)
 {
 	struct audit_netlink_list *dest = _dest;
@@ -588,6 +571,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 {
 	int err = 0;
 
+	/* Only support the initial namespaces for now. */
+	if ((current_user_ns() != &init_user_ns) ||
+	    (task_active_pid_ns(current) != &init_pid_ns))
+		return -EPERM;
+
 	switch (msg_type) {
 	case AUDIT_GET:
 	case AUDIT_LIST:
@@ -619,8 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 }
 
 static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
-				     u32 pid, u32 uid, uid_t auid, u32 ses,
-				     u32 sid)
+				     kuid_t auid, u32 ses, u32 sid)
 {
 	int rc = 0;
 	char *ctx = NULL;
@@ -633,7 +620,9 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
 
 	*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
 	audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
-			 pid, uid, auid, ses);
+			 task_tgid_vnr(current),
+			 from_kuid(&init_user_ns, current_uid()),
+			 from_kuid(&init_user_ns, auid), ses);
 	if (sid) {
 		rc = security_secid_to_secctx(sid, &ctx, &len);
 		if (rc)
@@ -649,13 +638,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
 
 static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
-	u32 uid, pid, seq, sid;
+	u32 seq, sid;
 	void *data;
 	struct audit_status *status_get, status_set;
 	int err;
 	struct audit_buffer *ab;
 	u16 msg_type = nlh->nlmsg_type;
-	uid_t loginuid; /* loginuid of sender */
+	kuid_t loginuid; /* loginuid of sender */
 	u32 sessionid;
 	struct audit_sig_info *sig_data;
 	char *ctx = NULL;
@@ -675,8 +664,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return err;
 	}
 
-	pid = NETLINK_CREDS(skb)->pid;
-	uid = NETLINK_CREDS(skb)->uid;
 	loginuid = audit_get_loginuid(current);
 	sessionid = audit_get_sessionid(current);
 	security_task_getsecid(current, &sid);
@@ -692,7 +679,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		status_set.backlog_limit = audit_backlog_limit;
 		status_set.lost = atomic_read(&audit_lost);
 		status_set.backlog = skb_queue_len(&audit_skb_queue);
-		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
+		audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
				 &status_set, sizeof(status_set));
 		break;
 	case AUDIT_SET:
@@ -720,7 +707,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
					    sessionid, sid, 1);
 
 			audit_pid = new_pid;
-			audit_nlk_pid = NETLINK_CB(skb).pid;
+			audit_nlk_portid = NETLINK_CB(skb).portid;
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
 			err = audit_set_rate_limit(status_get->rate_limit,
@@ -738,16 +725,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (!audit_enabled && msg_type != AUDIT_USER_AVC)
 			return 0;
 
-		err = audit_filter_user(&NETLINK_CB(skb));
+		err = audit_filter_user();
 		if (err == 1) {
 			err = 0;
 			if (msg_type == AUDIT_USER_TTY) {
-				err = audit_prepare_user_tty(pid, loginuid,
+				err = tty_audit_push_task(current, loginuid,
							  sessionid);
 				if (err)
 					break;
 			}
-			audit_log_common_recv_msg(&ab, msg_type, pid, uid,
-						  loginuid, sessionid, sid);
+			audit_log_common_recv_msg(&ab, msg_type,
						  loginuid, sessionid, sid);
 
 			if (msg_type != AUDIT_USER_TTY)
@@ -763,7 +750,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				size--;
 			audit_log_n_untrustedstring(ab, data, size);
 		}
-		audit_set_pid(ab, pid);
+		audit_set_pid(ab, NETLINK_CB(skb).portid);
 		audit_log_end(ab);
 	}
 	break;
@@ -772,8 +759,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (nlmsg_len(nlh) < sizeof(struct audit_rule))
 			return -EINVAL;
 		if (audit_enabled == AUDIT_LOCKED) {
-			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-						  uid, loginuid, sessionid, sid);
+			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
						  loginuid, sessionid, sid);
 
 			audit_log_format(ab, " audit_enabled=%d res=0",
					 audit_enabled);
@@ -782,8 +769,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 		/* fallthrough */
 	case AUDIT_LIST:
-		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
-					   uid, seq, data, nlmsg_len(nlh),
+		err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
+					   seq, data, nlmsg_len(nlh),
					   loginuid, sessionid, sid);
 		break;
 	case AUDIT_ADD_RULE:
@@ -791,8 +778,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
 			return -EINVAL;
 		if (audit_enabled == AUDIT_LOCKED) {
-			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-						  uid, loginuid, sessionid, sid);
+			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
						  loginuid, sessionid, sid);
 
 			audit_log_format(ab, " audit_enabled=%d res=0",
					 audit_enabled);
@@ -801,15 +788,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 		/* fallthrough */
 	case AUDIT_LIST_RULES:
-		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
-					   uid, seq, data, nlmsg_len(nlh),
+		err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
+					   seq, data, nlmsg_len(nlh),
					   loginuid, sessionid, sid);
 		break;
 	case AUDIT_TRIM:
 		audit_trim_trees();
 
-		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-					  uid, loginuid, sessionid, sid);
+		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
					  loginuid, sessionid, sid);
 
 		audit_log_format(ab, " op=trim res=1");
 		audit_log_end(ab);
@@ -840,8 +827,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		/* OK, here comes... */
 		err = audit_tag_tree(old, new);
 
-		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-					  uid, loginuid, sessionid, sid);
+		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
					  loginuid, sessionid, sid);
 
 		audit_log_format(ab, " op=make_equiv old=");
 		audit_log_untrustedstring(ab, old);
@@ -866,53 +853,41 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			security_release_secctx(ctx, len);
 			return -ENOMEM;
 		}
-		sig_data->uid = audit_sig_uid;
+		sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
 		sig_data->pid = audit_sig_pid;
 		if (audit_sig_sid) {
 			memcpy(sig_data->ctx, ctx, len);
 			security_release_secctx(ctx, len);
 		}
-		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
+		audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO,
				 0, 0, sig_data, sizeof(*sig_data) + len);
 		kfree(sig_data);
 		break;
 	case AUDIT_TTY_GET: {
 		struct audit_tty_status s;
-		struct task_struct *tsk;
-		unsigned long flags;
-
-		rcu_read_lock();
-		tsk = find_task_by_vpid(pid);
-		if (tsk && lock_task_sighand(tsk, &flags)) {
-			s.enabled = tsk->signal->audit_tty != 0;
-			unlock_task_sighand(tsk, &flags);
-		} else
-			err = -ESRCH;
-		rcu_read_unlock();
-
-		if (!err)
-			audit_send_reply(NETLINK_CB(skb).pid, seq,
-					 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+		struct task_struct *tsk = current;
+
+		spin_lock_irq(&tsk->sighand->siglock);
+		s.enabled = tsk->signal->audit_tty != 0;
+		spin_unlock_irq(&tsk->sighand->siglock);
+
+		audit_send_reply(NETLINK_CB(skb).portid, seq,
				 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
 		break;
 	}
 	case AUDIT_TTY_SET: {
 		struct audit_tty_status *s;
-		struct task_struct *tsk;
-		unsigned long flags;
+		struct task_struct *tsk = current;
 
 		if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
 			return -EINVAL;
 		s = data;
 		if (s->enabled != 0 && s->enabled != 1)
 			return -EINVAL;
-		rcu_read_lock();
-		tsk = find_task_by_vpid(pid);
-		if (tsk && lock_task_sighand(tsk, &flags)) {
-			tsk->signal->audit_tty = s->enabled != 0;
-			unlock_task_sighand(tsk, &flags);
-		} else
-			err = -ESRCH;
-		rcu_read_unlock();
+
+		spin_lock_irq(&tsk->sighand->siglock);
+		tsk->signal->audit_tty = s->enabled != 0;
+		spin_unlock_irq(&tsk->sighand->siglock);
 		break;
 	}
 	default:
@@ -971,8 +946,7 @@ static int __init audit_init(void)
 
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
	       audit_default ? "enabled" : "disabled");
-	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT,
-					   THIS_MODULE, &cfg);
+	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 	else
@@ -1466,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link)
 
 	ab = audit_log_start(current->audit_context, GFP_KERNEL,
			     AUDIT_ANOM_LINK);
+	if (!ab)
+		return;
 	audit_log_format(ab, "op=%s action=denied", operation);
 	audit_log_format(ab, " pid=%d comm=", current->pid);
 	audit_log_untrustedstring(ab, current->comm);
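Editor's note: most of the audit.c churn is the kuid_t conversion pattern: internal state such as audit_sig_uid and loginuid becomes kuid_t, values coming from userspace are mapped in with make_kuid() and checked with uid_valid() (see the auditfilter.c hunks below), and anything written to the log is mapped back with from_kuid(&init_user_ns, ...). A condensed sketch of that round trip, using the same helpers as the hunks above (the wrapper function itself is hypothetical):

/* Condensed illustration of the kuid_t round trip; the wrapper is made up. */
static int example_set_and_log_uid(struct audit_buffer *ab, uid_t val)
{
	kuid_t kuid = make_kuid(current_user_ns(), val);	/* userspace id -> kernel id */

	if (!uid_valid(kuid))					/* reject unmappable ids */
		return -EINVAL;

	audit_log_format(ab, "auid=%u",
			 from_kuid(&init_user_ns, kuid));	/* kernel id -> logged number */
	return 0;
}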
diff --git a/kernel/audit.h b/kernel/audit.h
index 816766803371..d51cba868e1b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -74,10 +74,15 @@ static inline int audit_hash_ino(u32 ino)
 	return (ino & (AUDIT_INODE_BUCKETS-1));
 }
 
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
 extern int audit_match_class(int class, unsigned syscall);
 extern int audit_comparator(const u32 left, const u32 op, const u32 right);
-extern int audit_compare_dname_path(const char *dname, const char *path,
-				    int *dirlen);
+extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
+extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
+extern int parent_len(const char *path);
+extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
 extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
					 int done, int multi,
					 const void *payload, int size);
@@ -144,7 +149,7 @@ extern void audit_kill_trees(struct list_head *);
 extern char *audit_unpack_string(void **, size_t *, size_t);
 
 extern pid_t audit_sig_pid;
-extern uid_t audit_sig_uid;
+extern kuid_t audit_sig_uid;
 extern u32 audit_sig_sid;
 
 #ifdef CONFIG_AUDITSYSCALL
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index a66affc1c12c..4a599f699adc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -241,7 +241,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
 	struct audit_buffer *ab;
 	ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
 	audit_log_format(ab, "auid=%u ses=%u op=",
-			 audit_get_loginuid(current),
+			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
			 audit_get_sessionid(current));
 	audit_log_string(ab, op);
 	audit_log_format(ab, " path=");
@@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent,
 	/* Run all of the watches on this parent looking for the one that
	 * matches the given dname */
 	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
-		if (audit_compare_dname_path(dname, owatch->path, NULL))
+		if (audit_compare_dname_path(dname, owatch->path,
+					     AUDIT_NAME_FULL))
 			continue;
 
 		/* If the update involves invalidating rules, do the inode-based
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a6c3f1abd206..7f19f23d38a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -342,6 +342,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 
 		f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
 		f->val = rule->values[i];
+		f->uid = INVALID_UID;
+		f->gid = INVALID_GID;
 
 		err = -EINVAL;
 		if (f->op == Audit_bad)
@@ -350,16 +352,32 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		switch(f->type) {
 		default:
 			goto exit_free;
-		case AUDIT_PID:
 		case AUDIT_UID:
 		case AUDIT_EUID:
 		case AUDIT_SUID:
 		case AUDIT_FSUID:
+		case AUDIT_LOGINUID:
+			/* bit ops not implemented for uid comparisons */
+			if (f->op == Audit_bitmask || f->op == Audit_bittest)
+				goto exit_free;
+
+			f->uid = make_kuid(current_user_ns(), f->val);
+			if (!uid_valid(f->uid))
+				goto exit_free;
+			break;
 		case AUDIT_GID:
 		case AUDIT_EGID:
 		case AUDIT_SGID:
 		case AUDIT_FSGID:
-		case AUDIT_LOGINUID:
+			/* bit ops not implemented for gid comparisons */
+			if (f->op == Audit_bitmask || f->op == Audit_bittest)
+				goto exit_free;
+
+			f->gid = make_kgid(current_user_ns(), f->val);
+			if (!gid_valid(f->gid))
+				goto exit_free;
+			break;
+		case AUDIT_PID:
 		case AUDIT_PERS:
 		case AUDIT_MSGTYPE:
 		case AUDIT_PPID:
@@ -437,19 +455,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 
 		f->type = data->fields[i];
 		f->val = data->values[i];
+		f->uid = INVALID_UID;
+		f->gid = INVALID_GID;
 		f->lsm_str = NULL;
 		f->lsm_rule = NULL;
 		switch(f->type) {
-		case AUDIT_PID:
 		case AUDIT_UID:
 		case AUDIT_EUID:
 		case AUDIT_SUID:
 		case AUDIT_FSUID:
+		case AUDIT_LOGINUID:
+		case AUDIT_OBJ_UID:
+			/* bit ops not implemented for uid comparisons */
+			if (f->op == Audit_bitmask || f->op == Audit_bittest)
+				goto exit_free;
+
+			f->uid = make_kuid(current_user_ns(), f->val);
+			if (!uid_valid(f->uid))
+				goto exit_free;
+			break;
 		case AUDIT_GID:
 		case AUDIT_EGID:
 		case AUDIT_SGID:
 		case AUDIT_FSGID:
-		case AUDIT_LOGINUID:
+		case AUDIT_OBJ_GID:
+			/* bit ops not implemented for gid comparisons */
+			if (f->op == Audit_bitmask || f->op == Audit_bittest)
+				goto exit_free;
+
+			f->gid = make_kgid(current_user_ns(), f->val);
+			if (!gid_valid(f->gid))
+				goto exit_free;
+			break;
+		case AUDIT_PID:
 		case AUDIT_PERS:
 		case AUDIT_MSGTYPE:
 		case AUDIT_PPID:
@@ -461,8 +499,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		case AUDIT_ARG1:
 		case AUDIT_ARG2:
 		case AUDIT_ARG3:
-		case AUDIT_OBJ_UID:
-		case AUDIT_OBJ_GID:
 			break;
 		case AUDIT_ARCH:
 			entry->rule.arch_f = f;
@@ -707,6 +743,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 			if (strcmp(a->filterkey, b->filterkey))
 				return 1;
 			break;
+		case AUDIT_UID:
+		case AUDIT_EUID:
+		case AUDIT_SUID:
+		case AUDIT_FSUID:
+		case AUDIT_LOGINUID:
+		case AUDIT_OBJ_UID:
+			if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
+				return 1;
+			break;
+		case AUDIT_GID:
+		case AUDIT_EGID:
+		case AUDIT_SGID:
+		case AUDIT_FSGID:
+		case AUDIT_OBJ_GID:
+			if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
+				return 1;
+			break;
 		default:
 			if (a->fields[i].val != b->fields[i].val)
 				return 1;
@@ -1056,7 +1109,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
 }
 
 /* Log rule additions and removals */
-static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
+static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
				  char *action, struct audit_krule *rule,
				  int res)
 {
@@ -1068,7 +1121,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
 	if (!ab)
 		return;
-	audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
+	audit_log_format(ab, "auid=%u ses=%u",
+			 from_kuid(&init_user_ns, loginuid), sessionid);
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
@@ -1098,8 +1152,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
 * @sessionid: sessionid for netlink audit message
 * @sid: SE Linux Security ID of sender
 */
-int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
-			 size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
+int audit_receive_filter(int type, int pid, int seq, void *data,
+			 size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid)
 {
 	struct task_struct *tsk;
 	struct audit_netlink_list *dest;
@@ -1198,46 +1252,110 @@ int audit_comparator(u32 left, u32 op, u32 right)
 	}
 }
 
-/* Compare given dentry name with last component in given path,
- * return of 0 indicates a match. */
-int audit_compare_dname_path(const char *dname, const char *path,
-			     int *dirlen)
+int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
 {
-	int dlen, plen;
-	const char *p;
+	switch (op) {
+	case Audit_equal:
+		return uid_eq(left, right);
+	case Audit_not_equal:
+		return !uid_eq(left, right);
+	case Audit_lt:
+		return uid_lt(left, right);
+	case Audit_le:
+		return uid_lte(left, right);
+	case Audit_gt:
+		return uid_gt(left, right);
+	case Audit_ge:
+		return uid_gte(left, right);
+	case Audit_bitmask:
+	case Audit_bittest:
+	default:
+		BUG();
+		return 0;
+	}
+}
 
-	if (!dname || !path)
-		return 1;
+int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
+{
+	switch (op) {
+	case Audit_equal:
+		return gid_eq(left, right);
+	case Audit_not_equal:
+		return !gid_eq(left, right);
+	case Audit_lt:
+		return gid_lt(left, right);
+	case Audit_le:
+		return gid_lte(left, right);
+	case Audit_gt:
+		return gid_gt(left, right);
+	case Audit_ge:
+		return gid_gte(left, right);
+	case Audit_bitmask:
+	case Audit_bittest:
+	default:
+		BUG();
+		return 0;
+	}
+}
+
+/**
+ * parent_len - find the length of the parent portion of a pathname
+ * @path: pathname of which to determine length
+ */
+int parent_len(const char *path)
+{
+	int plen;
+	const char *p;
 
-	dlen = strlen(dname);
 	plen = strlen(path);
-	if (plen < dlen)
-		return 1;
+
+	if (plen == 0)
+		return plen;
 
 	/* disregard trailing slashes */
 	p = path + plen - 1;
 	while ((*p == '/') && (p > path))
 		p--;
 
-	/* find last path component */
-	p = p - dlen + 1;
-	if (p < path)
+	/* walk backward until we find the next slash or hit beginning */
+	while ((*p != '/') && (p > path))
+		p--;
+
+	/* did we find a slash? Then increment to include it in path */
+	if (*p == '/')
+		p++;
+
+	return p - path;
+}
+
+/**
+ * audit_compare_dname_path - compare given dentry name with last component in
+ *			      given path. Return of 0 indicates a match.
+ * @dname:	dentry name that we're comparing
+ * @path:	full pathname that we're comparing
+ * @parentlen:	length of the parent if known. Passing in AUDIT_NAME_FULL
+ *		here indicates that we must compute this value.
+ */
+int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
+{
+	int dlen, pathlen;
+	const char *p;
+
+	dlen = strlen(dname);
+	pathlen = strlen(path);
+	if (pathlen < dlen)
 		return 1;
-	else if (p > path) {
-		if (*--p != '/')
-			return 1;
-		else
-			p++;
-	}
 
-	/* return length of path's directory component */
-	if (dirlen)
-		*dirlen = p - path;
+	parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
+	if (pathlen - parentlen != dlen)
+		return 1;
+
+	p = path + parentlen;
+
 	return strncmp(p, dname, dlen);
 }
 
-static int audit_filter_user_rules(struct netlink_skb_parms *cb,
-				   struct audit_krule *rule,
+static int audit_filter_user_rules(struct audit_krule *rule,
				   enum audit_state *state)
 {
 	int i;
@@ -1249,17 +1367,17 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
 
 		switch (f->type) {
 		case AUDIT_PID:
-			result = audit_comparator(cb->creds.pid, f->op, f->val);
+			result = audit_comparator(task_pid_vnr(current), f->op, f->val);
 			break;
 		case AUDIT_UID:
-			result = audit_comparator(cb->creds.uid, f->op, f->val);
+			result = audit_uid_comparator(current_uid(), f->op, f->uid);
 			break;
 		case AUDIT_GID:
-			result = audit_comparator(cb->creds.gid, f->op, f->val);
+			result = audit_gid_comparator(current_gid(), f->op, f->gid);
 			break;
 		case AUDIT_LOGINUID:
-			result = audit_comparator(audit_get_loginuid(current),
-						  f->op, f->val);
+			result = audit_uid_comparator(audit_get_loginuid(current),
						      f->op, f->uid);
 			break;
 		case AUDIT_SUBJ_USER:
 		case AUDIT_SUBJ_ROLE:
@@ -1287,7 +1405,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
 	return 1;
 }
 
-int audit_filter_user(struct netlink_skb_parms *cb)
+int audit_filter_user(void)
 {
 	enum audit_state state = AUDIT_DISABLED;
 	struct audit_entry *e;
@@ -1295,7 +1413,7 @@ int audit_filter_user(struct netlink_skb_parms *cb)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
-		if (audit_filter_user_rules(cb, &e->rule, &state)) {
+		if (audit_filter_user_rules(&e->rule, &state)) {
 			if (state == AUDIT_DISABLED)
 				ret = 0;
 			break;
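Editor's note: parent_len() and the reworked audit_compare_dname_path() are self-contained string helpers, so their behaviour can be checked outside the kernel. The standalone program below copies the two functions from the hunk above (with AUDIT_NAME_FULL defined as in kernel/audit.h) and exercises them on a sample path; it is a small test harness written for this review, not part of the patch.

#include <stdio.h>
#include <string.h>

#define AUDIT_NAME_FULL -1	/* as added to kernel/audit.h above */

/* copied from the auditfilter.c hunk above */
int parent_len(const char *path)
{
	int plen;
	const char *p;

	plen = strlen(path);
	if (plen == 0)
		return plen;

	/* disregard trailing slashes */
	p = path + plen - 1;
	while ((*p == '/') && (p > path))
		p--;

	/* walk backward until we find the next slash or hit beginning */
	while ((*p != '/') && (p > path))
		p--;

	/* did we find a slash? Then increment to include it in path */
	if (*p == '/')
		p++;

	return p - path;
}

/* copied from the auditfilter.c hunk above */
int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
{
	int dlen, pathlen;
	const char *p;

	dlen = strlen(dname);
	pathlen = strlen(path);
	if (pathlen < dlen)
		return 1;

	parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
	if (pathlen - parentlen != dlen)
		return 1;

	p = path + parentlen;

	return strncmp(p, dname, dlen);
}

int main(void)
{
	/* "/etc/passwd": parent is "/etc/" (5 chars); dname "passwd" matches */
	printf("parent_len(\"/etc/passwd\") = %d\n", parent_len("/etc/passwd"));
	printf("match passwd: %d\n",
	       audit_compare_dname_path("passwd", "/etc/passwd", AUDIT_NAME_FULL));
	printf("match shadow: %d\n",
	       audit_compare_dname_path("shadow", "/etc/passwd", AUDIT_NAME_FULL));
	return 0;
}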
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4b96415527b8..e37e6a12c5e3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -81,9 +81,6 @@
81 * a name dynamically and also add those to the list anchored by names_list. */ 81 * a name dynamically and also add those to the list anchored by names_list. */
82#define AUDIT_NAMES 5 82#define AUDIT_NAMES 5
83 83
84/* Indicates that audit should log the full pathname. */
85#define AUDIT_NAME_FULL -1
86
87/* no execve audit message should be longer than this (userspace limits) */ 84/* no execve audit message should be longer than this (userspace limits) */
88#define MAX_EXECVE_AUDIT_LEN 7500 85#define MAX_EXECVE_AUDIT_LEN 7500
89 86
@@ -106,27 +103,29 @@ struct audit_cap_data {
106 * we don't let putname() free it (instead we free all of the saved 103 * we don't let putname() free it (instead we free all of the saved
107 * pointers at syscall exit time). 104 * pointers at syscall exit time).
108 * 105 *
109 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 106 * Further, in fs/namei.c:path_lookup() we store the inode and device.
107 */
110struct audit_names { 108struct audit_names {
111 struct list_head list; /* audit_context->names_list */ 109 struct list_head list; /* audit_context->names_list */
112 const char *name; 110 struct filename *name;
113 unsigned long ino; 111 unsigned long ino;
114 dev_t dev; 112 dev_t dev;
115 umode_t mode; 113 umode_t mode;
116 uid_t uid; 114 kuid_t uid;
117 gid_t gid; 115 kgid_t gid;
118 dev_t rdev; 116 dev_t rdev;
119 u32 osid; 117 u32 osid;
120 struct audit_cap_data fcap; 118 struct audit_cap_data fcap;
121 unsigned int fcap_ver; 119 unsigned int fcap_ver;
122 int name_len; /* number of name's characters to log */ 120 int name_len; /* number of name's characters to log */
123 bool name_put; /* call __putname() for this name */ 121 unsigned char type; /* record type */
122 bool name_put; /* call __putname() for this name */
124 /* 123 /*
125 * This was an allocated audit_names and not from the array of 124 * This was an allocated audit_names and not from the array of
126 * names allocated in the task audit context. Thus this name 125 * names allocated in the task audit context. Thus this name
127 * should be freed on syscall exit 126 * should be freed on syscall exit
128 */ 127 */
129 bool should_free; 128 bool should_free;
130}; 129};
131 130
132struct audit_aux_data { 131struct audit_aux_data {
@@ -149,8 +148,8 @@ struct audit_aux_data_execve {
149struct audit_aux_data_pids { 148struct audit_aux_data_pids {
150 struct audit_aux_data d; 149 struct audit_aux_data d;
151 pid_t target_pid[AUDIT_AUX_PIDS]; 150 pid_t target_pid[AUDIT_AUX_PIDS];
152 uid_t target_auid[AUDIT_AUX_PIDS]; 151 kuid_t target_auid[AUDIT_AUX_PIDS];
153 uid_t target_uid[AUDIT_AUX_PIDS]; 152 kuid_t target_uid[AUDIT_AUX_PIDS];
154 unsigned int target_sessionid[AUDIT_AUX_PIDS]; 153 unsigned int target_sessionid[AUDIT_AUX_PIDS];
155 u32 target_sid[AUDIT_AUX_PIDS]; 154 u32 target_sid[AUDIT_AUX_PIDS];
156 char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; 155 char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
@@ -201,21 +200,20 @@ struct audit_context {
201 struct list_head names_list; /* anchor for struct audit_names->list */ 200 struct list_head names_list; /* anchor for struct audit_names->list */
202 char * filterkey; /* key for rule that triggered record */ 201 char * filterkey; /* key for rule that triggered record */
203 struct path pwd; 202 struct path pwd;
204 struct audit_context *previous; /* For nested syscalls */
205 struct audit_aux_data *aux; 203 struct audit_aux_data *aux;
206 struct audit_aux_data *aux_pids; 204 struct audit_aux_data *aux_pids;
207 struct sockaddr_storage *sockaddr; 205 struct sockaddr_storage *sockaddr;
208 size_t sockaddr_len; 206 size_t sockaddr_len;
209 /* Save things to print about task_struct */ 207 /* Save things to print about task_struct */
210 pid_t pid, ppid; 208 pid_t pid, ppid;
211 uid_t uid, euid, suid, fsuid; 209 kuid_t uid, euid, suid, fsuid;
212 gid_t gid, egid, sgid, fsgid; 210 kgid_t gid, egid, sgid, fsgid;
213 unsigned long personality; 211 unsigned long personality;
214 int arch; 212 int arch;
215 213
216 pid_t target_pid; 214 pid_t target_pid;
217 uid_t target_auid; 215 kuid_t target_auid;
218 uid_t target_uid; 216 kuid_t target_uid;
219 unsigned int target_sessionid; 217 unsigned int target_sessionid;
220 u32 target_sid; 218 u32 target_sid;
221 char target_comm[TASK_COMM_LEN]; 219 char target_comm[TASK_COMM_LEN];
@@ -231,8 +229,8 @@ struct audit_context {
231 long args[6]; 229 long args[6];
232 } socketcall; 230 } socketcall;
233 struct { 231 struct {
234 uid_t uid; 232 kuid_t uid;
235 gid_t gid; 233 kgid_t gid;
236 umode_t mode; 234 umode_t mode;
237 u32 osid; 235 u32 osid;
238 int has_perm; 236 int has_perm;
@@ -464,37 +462,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
464 return 0; 462 return 0;
465} 463}
466 464
467static int audit_compare_id(uid_t uid1, 465static int audit_compare_uid(kuid_t uid,
468 struct audit_names *name, 466 struct audit_names *name,
469 unsigned long name_offset, 467 struct audit_field *f,
470 struct audit_field *f, 468 struct audit_context *ctx)
471 struct audit_context *ctx)
472{ 469{
473 struct audit_names *n; 470 struct audit_names *n;
474 unsigned long addr;
475 uid_t uid2;
476 int rc; 471 int rc;
477 472
478 BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
479
480 if (name) { 473 if (name) {
481 addr = (unsigned long)name; 474 rc = audit_uid_comparator(uid, f->op, name->uid);
482 addr += name_offset;
483
484 uid2 = *(uid_t *)addr;
485 rc = audit_comparator(uid1, f->op, uid2);
486 if (rc) 475 if (rc)
487 return rc; 476 return rc;
488 } 477 }
489 478
490 if (ctx) { 479 if (ctx) {
491 list_for_each_entry(n, &ctx->names_list, list) { 480 list_for_each_entry(n, &ctx->names_list, list) {
492 addr = (unsigned long)n; 481 rc = audit_uid_comparator(uid, f->op, n->uid);
493 addr += name_offset; 482 if (rc)
494 483 return rc;
495 uid2 = *(uid_t *)addr; 484 }
485 }
486 return 0;
487}
496 488
497 rc = audit_comparator(uid1, f->op, uid2); 489static int audit_compare_gid(kgid_t gid,
490 struct audit_names *name,
491 struct audit_field *f,
492 struct audit_context *ctx)
493{
494 struct audit_names *n;
495 int rc;
496
497 if (name) {
498 rc = audit_gid_comparator(gid, f->op, name->gid);
499 if (rc)
500 return rc;
501 }
502
503 if (ctx) {
504 list_for_each_entry(n, &ctx->names_list, list) {
505 rc = audit_gid_comparator(gid, f->op, n->gid);
498 if (rc) 506 if (rc)
499 return rc; 507 return rc;
500 } 508 }
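
The rewritten helpers above replace one generic audit_compare_id(), which peeked into struct audit_names at a caller-supplied byte offset, with a typed helper per id kind. A rough sketch of that shape under simplified assumptions: an array instead of the kernel list, plain unsigned ids, and equality instead of the operator logic of audit_uid_comparator()/audit_gid_comparator().

/* Sketch of the refactor above: one typed helper per id kind, instead of a
 * generic helper that dereferences a caller-supplied byte offset. */
#include <stdio.h>
#include <stddef.h>

struct audit_names { unsigned int uid, gid; };

static int compare_uid(unsigned int uid, const struct audit_names *names, size_t count)
{
    for (size_t i = 0; i < count; i++)
        if (names[i].uid == uid)        /* kernel: audit_uid_comparator(uid, op, n->uid) */
            return 1;
    return 0;
}

static int compare_gid(unsigned int gid, const struct audit_names *names, size_t count)
{
    for (size_t i = 0; i < count; i++)
        if (names[i].gid == gid)        /* kernel: audit_gid_comparator(gid, op, n->gid) */
            return 1;
    return 0;
}

int main(void)
{
    struct audit_names names[] = { { 1000, 1000 }, { 0, 0 } };

    printf("uid 0 seen:  %d\n", compare_uid(0, names, 2));    /* 1 */
    printf("gid 42 seen: %d\n", compare_gid(42, names, 2));   /* 0 */
    return 0;
}

The old offsetof() trick depended on uid_t and gid_t sharing a representation, which is what the removed BUILD_BUG_ON asserted; once the fields are distinct kuid_t and kgid_t wrapper types, separate helpers are the simpler and safer shape.
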
@@ -511,80 +519,62 @@ static int audit_field_compare(struct task_struct *tsk,
511 switch (f->val) { 519 switch (f->val) {
512 /* process to file object comparisons */ 520 /* process to file object comparisons */
513 case AUDIT_COMPARE_UID_TO_OBJ_UID: 521 case AUDIT_COMPARE_UID_TO_OBJ_UID:
514 return audit_compare_id(cred->uid, 522 return audit_compare_uid(cred->uid, name, f, ctx);
515 name, offsetof(struct audit_names, uid),
516 f, ctx);
517 case AUDIT_COMPARE_GID_TO_OBJ_GID: 523 case AUDIT_COMPARE_GID_TO_OBJ_GID:
518 return audit_compare_id(cred->gid, 524 return audit_compare_gid(cred->gid, name, f, ctx);
519 name, offsetof(struct audit_names, gid),
520 f, ctx);
521 case AUDIT_COMPARE_EUID_TO_OBJ_UID: 525 case AUDIT_COMPARE_EUID_TO_OBJ_UID:
522 return audit_compare_id(cred->euid, 526 return audit_compare_uid(cred->euid, name, f, ctx);
523 name, offsetof(struct audit_names, uid),
524 f, ctx);
525 case AUDIT_COMPARE_EGID_TO_OBJ_GID: 527 case AUDIT_COMPARE_EGID_TO_OBJ_GID:
526 return audit_compare_id(cred->egid, 528 return audit_compare_gid(cred->egid, name, f, ctx);
527 name, offsetof(struct audit_names, gid),
528 f, ctx);
529 case AUDIT_COMPARE_AUID_TO_OBJ_UID: 529 case AUDIT_COMPARE_AUID_TO_OBJ_UID:
530 return audit_compare_id(tsk->loginuid, 530 return audit_compare_uid(tsk->loginuid, name, f, ctx);
531 name, offsetof(struct audit_names, uid),
532 f, ctx);
533 case AUDIT_COMPARE_SUID_TO_OBJ_UID: 531 case AUDIT_COMPARE_SUID_TO_OBJ_UID:
534 return audit_compare_id(cred->suid, 532 return audit_compare_uid(cred->suid, name, f, ctx);
535 name, offsetof(struct audit_names, uid),
536 f, ctx);
537 case AUDIT_COMPARE_SGID_TO_OBJ_GID: 533 case AUDIT_COMPARE_SGID_TO_OBJ_GID:
538 return audit_compare_id(cred->sgid, 534 return audit_compare_gid(cred->sgid, name, f, ctx);
539 name, offsetof(struct audit_names, gid),
540 f, ctx);
541 case AUDIT_COMPARE_FSUID_TO_OBJ_UID: 535 case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
542 return audit_compare_id(cred->fsuid, 536 return audit_compare_uid(cred->fsuid, name, f, ctx);
543 name, offsetof(struct audit_names, uid),
544 f, ctx);
545 case AUDIT_COMPARE_FSGID_TO_OBJ_GID: 537 case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
546 return audit_compare_id(cred->fsgid, 538 return audit_compare_gid(cred->fsgid, name, f, ctx);
547 name, offsetof(struct audit_names, gid),
548 f, ctx);
549 /* uid comparisons */ 539 /* uid comparisons */
550 case AUDIT_COMPARE_UID_TO_AUID: 540 case AUDIT_COMPARE_UID_TO_AUID:
551 return audit_comparator(cred->uid, f->op, tsk->loginuid); 541 return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
552 case AUDIT_COMPARE_UID_TO_EUID: 542 case AUDIT_COMPARE_UID_TO_EUID:
553 return audit_comparator(cred->uid, f->op, cred->euid); 543 return audit_uid_comparator(cred->uid, f->op, cred->euid);
554 case AUDIT_COMPARE_UID_TO_SUID: 544 case AUDIT_COMPARE_UID_TO_SUID:
555 return audit_comparator(cred->uid, f->op, cred->suid); 545 return audit_uid_comparator(cred->uid, f->op, cred->suid);
556 case AUDIT_COMPARE_UID_TO_FSUID: 546 case AUDIT_COMPARE_UID_TO_FSUID:
557 return audit_comparator(cred->uid, f->op, cred->fsuid); 547 return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
558 /* auid comparisons */ 548 /* auid comparisons */
559 case AUDIT_COMPARE_AUID_TO_EUID: 549 case AUDIT_COMPARE_AUID_TO_EUID:
560 return audit_comparator(tsk->loginuid, f->op, cred->euid); 550 return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
561 case AUDIT_COMPARE_AUID_TO_SUID: 551 case AUDIT_COMPARE_AUID_TO_SUID:
562 return audit_comparator(tsk->loginuid, f->op, cred->suid); 552 return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
563 case AUDIT_COMPARE_AUID_TO_FSUID: 553 case AUDIT_COMPARE_AUID_TO_FSUID:
564 return audit_comparator(tsk->loginuid, f->op, cred->fsuid); 554 return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
565 /* euid comparisons */ 555 /* euid comparisons */
566 case AUDIT_COMPARE_EUID_TO_SUID: 556 case AUDIT_COMPARE_EUID_TO_SUID:
567 return audit_comparator(cred->euid, f->op, cred->suid); 557 return audit_uid_comparator(cred->euid, f->op, cred->suid);
568 case AUDIT_COMPARE_EUID_TO_FSUID: 558 case AUDIT_COMPARE_EUID_TO_FSUID:
569 return audit_comparator(cred->euid, f->op, cred->fsuid); 559 return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
570 /* suid comparisons */ 560 /* suid comparisons */
571 case AUDIT_COMPARE_SUID_TO_FSUID: 561 case AUDIT_COMPARE_SUID_TO_FSUID:
572 return audit_comparator(cred->suid, f->op, cred->fsuid); 562 return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
573 /* gid comparisons */ 563 /* gid comparisons */
574 case AUDIT_COMPARE_GID_TO_EGID: 564 case AUDIT_COMPARE_GID_TO_EGID:
575 return audit_comparator(cred->gid, f->op, cred->egid); 565 return audit_gid_comparator(cred->gid, f->op, cred->egid);
576 case AUDIT_COMPARE_GID_TO_SGID: 566 case AUDIT_COMPARE_GID_TO_SGID:
577 return audit_comparator(cred->gid, f->op, cred->sgid); 567 return audit_gid_comparator(cred->gid, f->op, cred->sgid);
578 case AUDIT_COMPARE_GID_TO_FSGID: 568 case AUDIT_COMPARE_GID_TO_FSGID:
579 return audit_comparator(cred->gid, f->op, cred->fsgid); 569 return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
580 /* egid comparisons */ 570 /* egid comparisons */
581 case AUDIT_COMPARE_EGID_TO_SGID: 571 case AUDIT_COMPARE_EGID_TO_SGID:
582 return audit_comparator(cred->egid, f->op, cred->sgid); 572 return audit_gid_comparator(cred->egid, f->op, cred->sgid);
583 case AUDIT_COMPARE_EGID_TO_FSGID: 573 case AUDIT_COMPARE_EGID_TO_FSGID:
584 return audit_comparator(cred->egid, f->op, cred->fsgid); 574 return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
585 /* sgid comparison */ 575 /* sgid comparison */
586 case AUDIT_COMPARE_SGID_TO_FSGID: 576 case AUDIT_COMPARE_SGID_TO_FSGID:
587 return audit_comparator(cred->sgid, f->op, cred->fsgid); 577 return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
588 default: 578 default:
589 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); 579 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
590 return 0; 580 return 0;
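
Every case above funnels into an operator-aware comparator that evaluates the rule's f->op against two ids. A small standalone sketch of that dispatch; the enum values below are local to the sketch and are not the kernel's audit operator constants.

/* Sketch of an operator-dispatch comparator like the ones called above. */
#include <stdio.h>

enum cmp_op { OP_EQ, OP_NE, OP_LT, OP_LE, OP_GT, OP_GE };

static int id_comparator(unsigned int left, enum cmp_op op, unsigned int right)
{
    switch (op) {
    case OP_EQ: return left == right;
    case OP_NE: return left != right;
    case OP_LT: return left <  right;
    case OP_LE: return left <= right;
    case OP_GT: return left >  right;
    case OP_GE: return left >= right;
    }
    return 0;   /* unknown operator never matches */
}

int main(void)
{
    /* an "euid >= 1000" style rule applied to a task whose euid is 1000 */
    printf("match: %d\n", id_comparator(1000, OP_GE, 1000));   /* 1 */
    return 0;
}
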
@@ -630,28 +620,28 @@ static int audit_filter_rules(struct task_struct *tsk,
630 } 620 }
631 break; 621 break;
632 case AUDIT_UID: 622 case AUDIT_UID:
633 result = audit_comparator(cred->uid, f->op, f->val); 623 result = audit_uid_comparator(cred->uid, f->op, f->uid);
634 break; 624 break;
635 case AUDIT_EUID: 625 case AUDIT_EUID:
636 result = audit_comparator(cred->euid, f->op, f->val); 626 result = audit_uid_comparator(cred->euid, f->op, f->uid);
637 break; 627 break;
638 case AUDIT_SUID: 628 case AUDIT_SUID:
639 result = audit_comparator(cred->suid, f->op, f->val); 629 result = audit_uid_comparator(cred->suid, f->op, f->uid);
640 break; 630 break;
641 case AUDIT_FSUID: 631 case AUDIT_FSUID:
642 result = audit_comparator(cred->fsuid, f->op, f->val); 632 result = audit_uid_comparator(cred->fsuid, f->op, f->uid);
643 break; 633 break;
644 case AUDIT_GID: 634 case AUDIT_GID:
645 result = audit_comparator(cred->gid, f->op, f->val); 635 result = audit_gid_comparator(cred->gid, f->op, f->gid);
646 break; 636 break;
647 case AUDIT_EGID: 637 case AUDIT_EGID:
648 result = audit_comparator(cred->egid, f->op, f->val); 638 result = audit_gid_comparator(cred->egid, f->op, f->gid);
649 break; 639 break;
650 case AUDIT_SGID: 640 case AUDIT_SGID:
651 result = audit_comparator(cred->sgid, f->op, f->val); 641 result = audit_gid_comparator(cred->sgid, f->op, f->gid);
652 break; 642 break;
653 case AUDIT_FSGID: 643 case AUDIT_FSGID:
654 result = audit_comparator(cred->fsgid, f->op, f->val); 644 result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
655 break; 645 break;
656 case AUDIT_PERS: 646 case AUDIT_PERS:
657 result = audit_comparator(tsk->personality, f->op, f->val); 647 result = audit_comparator(tsk->personality, f->op, f->val);
@@ -717,10 +707,10 @@ static int audit_filter_rules(struct task_struct *tsk,
717 break; 707 break;
718 case AUDIT_OBJ_UID: 708 case AUDIT_OBJ_UID:
719 if (name) { 709 if (name) {
720 result = audit_comparator(name->uid, f->op, f->val); 710 result = audit_uid_comparator(name->uid, f->op, f->uid);
721 } else if (ctx) { 711 } else if (ctx) {
722 list_for_each_entry(n, &ctx->names_list, list) { 712 list_for_each_entry(n, &ctx->names_list, list) {
723 if (audit_comparator(n->uid, f->op, f->val)) { 713 if (audit_uid_comparator(n->uid, f->op, f->uid)) {
724 ++result; 714 ++result;
725 break; 715 break;
726 } 716 }
@@ -729,10 +719,10 @@ static int audit_filter_rules(struct task_struct *tsk,
729 break; 719 break;
730 case AUDIT_OBJ_GID: 720 case AUDIT_OBJ_GID:
731 if (name) { 721 if (name) {
732 result = audit_comparator(name->gid, f->op, f->val); 722 result = audit_gid_comparator(name->gid, f->op, f->gid);
733 } else if (ctx) { 723 } else if (ctx) {
734 list_for_each_entry(n, &ctx->names_list, list) { 724 list_for_each_entry(n, &ctx->names_list, list) {
735 if (audit_comparator(n->gid, f->op, f->val)) { 725 if (audit_gid_comparator(n->gid, f->op, f->gid)) {
736 ++result; 726 ++result;
737 break; 727 break;
738 } 728 }
@@ -750,7 +740,7 @@ static int audit_filter_rules(struct task_struct *tsk,
750 case AUDIT_LOGINUID: 740 case AUDIT_LOGINUID:
751 result = 0; 741 result = 0;
752 if (ctx) 742 if (ctx)
753 result = audit_comparator(tsk->loginuid, f->op, f->val); 743 result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
754 break; 744 break;
755 case AUDIT_SUBJ_USER: 745 case AUDIT_SUBJ_USER:
756 case AUDIT_SUBJ_ROLE: 746 case AUDIT_SUBJ_ROLE:
@@ -1006,7 +996,7 @@ static inline void audit_free_names(struct audit_context *context)
1006 context->ino_count); 996 context->ino_count);
1007 list_for_each_entry(n, &context->names_list, list) { 997 list_for_each_entry(n, &context->names_list, list) {
1008 printk(KERN_ERR "names[%d] = %p = %s\n", i, 998 printk(KERN_ERR "names[%d] = %p = %s\n", i,
1009 n->name, n->name ?: "(null)"); 999 n->name, n->name->name ?: "(null)");
1010 } 1000 }
1011 dump_stack(); 1001 dump_stack();
1012 return; 1002 return;
@@ -1100,29 +1090,13 @@ int audit_alloc(struct task_struct *tsk)
1100 1090
1101static inline void audit_free_context(struct audit_context *context) 1091static inline void audit_free_context(struct audit_context *context)
1102{ 1092{
1103 struct audit_context *previous; 1093 audit_free_names(context);
1104 int count = 0; 1094 unroll_tree_refs(context, NULL, 0);
1105 1095 free_tree_refs(context);
1106 do { 1096 audit_free_aux(context);
1107 previous = context->previous; 1097 kfree(context->filterkey);
1108 if (previous || (count && count < 10)) { 1098 kfree(context->sockaddr);
1109 ++count; 1099 kfree(context);
1110 printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
1111 " freeing multiple contexts (%d)\n",
1112 context->serial, context->major,
1113 context->name_count, count);
1114 }
1115 audit_free_names(context);
1116 unroll_tree_refs(context, NULL, 0);
1117 free_tree_refs(context);
1118 audit_free_aux(context);
1119 kfree(context->filterkey);
1120 kfree(context->sockaddr);
1121 kfree(context);
1122 context = previous;
1123 } while (context);
1124 if (count >= 10)
1125 printk(KERN_ERR "audit: freed %d contexts\n", count);
1126} 1100}
1127 1101
1128void audit_log_task_context(struct audit_buffer *ab) 1102void audit_log_task_context(struct audit_buffer *ab)
@@ -1154,13 +1128,43 @@ error_path:
1154 1128
1155EXPORT_SYMBOL(audit_log_task_context); 1129EXPORT_SYMBOL(audit_log_task_context);
1156 1130
1157static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 1131void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1158{ 1132{
1133 const struct cred *cred;
1159 char name[sizeof(tsk->comm)]; 1134 char name[sizeof(tsk->comm)];
1160 struct mm_struct *mm = tsk->mm; 1135 struct mm_struct *mm = tsk->mm;
1161 struct vm_area_struct *vma; 1136 char *tty;
1137
1138 if (!ab)
1139 return;
1162 1140
1163 /* tsk == current */ 1141 /* tsk == current */
1142 cred = current_cred();
1143
1144 spin_lock_irq(&tsk->sighand->siglock);
1145 if (tsk->signal && tsk->signal->tty)
1146 tty = tsk->signal->tty->name;
1147 else
1148 tty = "(none)";
1149 spin_unlock_irq(&tsk->sighand->siglock);
1150
1151
1152 audit_log_format(ab,
1153 " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
1154 " euid=%u suid=%u fsuid=%u"
1155 " egid=%u sgid=%u fsgid=%u ses=%u tty=%s",
1156 sys_getppid(),
1157 tsk->pid,
1158 from_kuid(&init_user_ns, tsk->loginuid),
1159 from_kuid(&init_user_ns, cred->uid),
1160 from_kgid(&init_user_ns, cred->gid),
1161 from_kuid(&init_user_ns, cred->euid),
1162 from_kuid(&init_user_ns, cred->suid),
1163 from_kuid(&init_user_ns, cred->fsuid),
1164 from_kgid(&init_user_ns, cred->egid),
1165 from_kgid(&init_user_ns, cred->sgid),
1166 from_kgid(&init_user_ns, cred->fsgid),
1167 tsk->sessionid, tty);
1164 1168
1165 get_task_comm(name, tsk); 1169 get_task_comm(name, tsk);
1166 audit_log_format(ab, " comm="); 1170 audit_log_format(ab, " comm=");
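
The block above emits each credential through from_kuid()/from_kgid() against &init_user_ns, so the record always carries ids as the initial user namespace sees them, then appends the session and tty. A compressed userspace sketch of building such a record; the identity conversion below stands in for the real namespace mapping, and the field layout is only an approximation of the audit record.

/* Sketch of assembling a fixed credential prefix the way the hunk above
 * does: convert each kernel-internal id for the init namespace, then append
 * the fields to one record buffer. */
#include <stdio.h>

typedef struct { unsigned int val; } kuid_t;

static unsigned int from_kuid_init_ns(kuid_t k)
{
    return k.val;    /* identity mapping stands in for from_kuid(&init_user_ns, k) */
}

int main(void)
{
    kuid_t auid = { 1000 }, uid = { 1000 }, euid = { 0 };
    unsigned int ses = 4;
    const char *tty = "pts0";
    char record[256];

    snprintf(record, sizeof(record),
             " auid=%u uid=%u euid=%u ses=%u tty=%s",
             from_kuid_init_ns(auid), from_kuid_init_ns(uid),
             from_kuid_init_ns(euid), ses, tty);
    puts(record);
    return 0;
}
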
@@ -1168,23 +1172,17 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
1168 1172
1169 if (mm) { 1173 if (mm) {
1170 down_read(&mm->mmap_sem); 1174 down_read(&mm->mmap_sem);
1171 vma = mm->mmap; 1175 if (mm->exe_file)
1172 while (vma) { 1176 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
1173 if ((vma->vm_flags & VM_EXECUTABLE) &&
1174 vma->vm_file) {
1175 audit_log_d_path(ab, " exe=",
1176 &vma->vm_file->f_path);
1177 break;
1178 }
1179 vma = vma->vm_next;
1180 }
1181 up_read(&mm->mmap_sem); 1177 up_read(&mm->mmap_sem);
1182 } 1178 }
1183 audit_log_task_context(ab); 1179 audit_log_task_context(ab);
1184} 1180}
1185 1181
1182EXPORT_SYMBOL(audit_log_task_info);
1183
1186static int audit_log_pid_context(struct audit_context *context, pid_t pid, 1184static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1187 uid_t auid, uid_t uid, unsigned int sessionid, 1185 kuid_t auid, kuid_t uid, unsigned int sessionid,
1188 u32 sid, char *comm) 1186 u32 sid, char *comm)
1189{ 1187{
1190 struct audit_buffer *ab; 1188 struct audit_buffer *ab;
@@ -1196,8 +1194,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1196 if (!ab) 1194 if (!ab)
1197 return rc; 1195 return rc;
1198 1196
1199 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, 1197 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
1200 uid, sessionid); 1198 from_kuid(&init_user_ns, auid),
1199 from_kuid(&init_user_ns, uid), sessionid);
1201 if (security_secid_to_secctx(sid, &ctx, &len)) { 1200 if (security_secid_to_secctx(sid, &ctx, &len)) {
1202 audit_log_format(ab, " obj=(none)"); 1201 audit_log_format(ab, " obj=(none)");
1203 rc = 1; 1202 rc = 1;
@@ -1447,7 +1446,9 @@ static void show_special(struct audit_context *context, int *call_panic)
1447 u32 osid = context->ipc.osid; 1446 u32 osid = context->ipc.osid;
1448 1447
1449 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", 1448 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
1450 context->ipc.uid, context->ipc.gid, context->ipc.mode); 1449 from_kuid(&init_user_ns, context->ipc.uid),
1450 from_kgid(&init_user_ns, context->ipc.gid),
1451 context->ipc.mode);
1451 if (osid) { 1452 if (osid) {
1452 char *ctx = NULL; 1453 char *ctx = NULL;
1453 u32 len; 1454 u32 len;
@@ -1536,7 +1537,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1536 case AUDIT_NAME_FULL: 1537 case AUDIT_NAME_FULL:
1537 /* log the full path */ 1538 /* log the full path */
1538 audit_log_format(ab, " name="); 1539 audit_log_format(ab, " name=");
1539 audit_log_untrustedstring(ab, n->name); 1540 audit_log_untrustedstring(ab, n->name->name);
1540 break; 1541 break;
1541 case 0: 1542 case 0:
1542 /* name was specified as a relative path and the 1543 /* name was specified as a relative path and the
@@ -1546,7 +1547,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1546 default: 1547 default:
1547 /* log the name's directory component */ 1548 /* log the name's directory component */
1548 audit_log_format(ab, " name="); 1549 audit_log_format(ab, " name=");
1549 audit_log_n_untrustedstring(ab, n->name, 1550 audit_log_n_untrustedstring(ab, n->name->name,
1550 n->name_len); 1551 n->name_len);
1551 } 1552 }
1552 } else 1553 } else
@@ -1560,8 +1561,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1560 MAJOR(n->dev), 1561 MAJOR(n->dev),
1561 MINOR(n->dev), 1562 MINOR(n->dev),
1562 n->mode, 1563 n->mode,
1563 n->uid, 1564 from_kuid(&init_user_ns, n->uid),
1564 n->gid, 1565 from_kgid(&init_user_ns, n->gid),
1565 MAJOR(n->rdev), 1566 MAJOR(n->rdev),
1566 MINOR(n->rdev)); 1567 MINOR(n->rdev));
1567 } 1568 }
@@ -1585,26 +1586,12 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1585 1586
1586static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1587static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1587{ 1588{
1588 const struct cred *cred;
1589 int i, call_panic = 0; 1589 int i, call_panic = 0;
1590 struct audit_buffer *ab; 1590 struct audit_buffer *ab;
1591 struct audit_aux_data *aux; 1591 struct audit_aux_data *aux;
1592 const char *tty;
1593 struct audit_names *n; 1592 struct audit_names *n;
1594 1593
1595 /* tsk == current */ 1594 /* tsk == current */
1596 context->pid = tsk->pid;
1597 if (!context->ppid)
1598 context->ppid = sys_getppid();
1599 cred = current_cred();
1600 context->uid = cred->uid;
1601 context->gid = cred->gid;
1602 context->euid = cred->euid;
1603 context->suid = cred->suid;
1604 context->fsuid = cred->fsuid;
1605 context->egid = cred->egid;
1606 context->sgid = cred->sgid;
1607 context->fsgid = cred->fsgid;
1608 context->personality = tsk->personality; 1595 context->personality = tsk->personality;
1609 1596
1610 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); 1597 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1619,32 +1606,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1619 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 1606 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
1620 context->return_code); 1607 context->return_code);
1621 1608
1622 spin_lock_irq(&tsk->sighand->siglock);
1623 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
1624 tty = tsk->signal->tty->name;
1625 else
1626 tty = "(none)";
1627 spin_unlock_irq(&tsk->sighand->siglock);
1628
1629 audit_log_format(ab, 1609 audit_log_format(ab,
1630 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 1610 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
1631 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 1611 context->argv[0],
1632 " euid=%u suid=%u fsuid=%u" 1612 context->argv[1],
1633 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", 1613 context->argv[2],
1634 context->argv[0], 1614 context->argv[3],
1635 context->argv[1], 1615 context->name_count);
1636 context->argv[2],
1637 context->argv[3],
1638 context->name_count,
1639 context->ppid,
1640 context->pid,
1641 tsk->loginuid,
1642 context->uid,
1643 context->gid,
1644 context->euid, context->suid, context->fsuid,
1645 context->egid, context->sgid, context->fsgid, tty,
1646 tsk->sessionid);
1647
1648 1616
1649 audit_log_task_info(ab, tsk); 1617 audit_log_task_info(ab, tsk);
1650 audit_log_key(ab, context->filterkey); 1618 audit_log_key(ab, context->filterkey);
@@ -1798,42 +1766,6 @@ void __audit_syscall_entry(int arch, int major,
1798 if (!context) 1766 if (!context)
1799 return; 1767 return;
1800 1768
1801 /*
1802 * This happens only on certain architectures that make system
1803 * calls in kernel_thread via the entry.S interface, instead of
1804 * with direct calls. (If you are porting to a new
1805 * architecture, hitting this condition can indicate that you
1806 * got the _exit/_leave calls backward in entry.S.)
1807 *
1808 * i386 no
1809 * x86_64 no
1810 * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
1811 *
1812 * This also happens with vm86 emulation in a non-nested manner
1813 * (entries without exits), so this case must be caught.
1814 */
1815 if (context->in_syscall) {
1816 struct audit_context *newctx;
1817
1818#if AUDIT_DEBUG
1819 printk(KERN_ERR
1820 "audit(:%d) pid=%d in syscall=%d;"
1821 " entering syscall=%d\n",
1822 context->serial, tsk->pid, context->major, major);
1823#endif
1824 newctx = audit_alloc_context(context->state);
1825 if (newctx) {
1826 newctx->previous = context;
1827 context = newctx;
1828 tsk->audit_context = newctx;
1829 } else {
1830 /* If we can't alloc a new context, the best we
1831 * can do is to leak memory (any pending putname
1832 * will be lost). The only other alternative is
1833 * to abandon auditing. */
1834 audit_zero_context(context, context->state);
1835 }
1836 }
1837 BUG_ON(context->in_syscall || context->name_count); 1769 BUG_ON(context->in_syscall || context->name_count);
1838 1770
1839 if (!audit_enabled) 1771 if (!audit_enabled)
@@ -1896,28 +1828,21 @@ void __audit_syscall_exit(int success, long return_code)
1896 if (!list_empty(&context->killed_trees)) 1828 if (!list_empty(&context->killed_trees))
1897 audit_kill_trees(&context->killed_trees); 1829 audit_kill_trees(&context->killed_trees);
1898 1830
1899 if (context->previous) { 1831 audit_free_names(context);
1900 struct audit_context *new_context = context->previous; 1832 unroll_tree_refs(context, NULL, 0);
1901 context->previous = NULL; 1833 audit_free_aux(context);
1902 audit_free_context(context); 1834 context->aux = NULL;
1903 tsk->audit_context = new_context; 1835 context->aux_pids = NULL;
1904 } else { 1836 context->target_pid = 0;
1905 audit_free_names(context); 1837 context->target_sid = 0;
1906 unroll_tree_refs(context, NULL, 0); 1838 context->sockaddr_len = 0;
1907 audit_free_aux(context); 1839 context->type = 0;
1908 context->aux = NULL; 1840 context->fds[0] = -1;
1909 context->aux_pids = NULL; 1841 if (context->state != AUDIT_RECORD_CONTEXT) {
1910 context->target_pid = 0; 1842 kfree(context->filterkey);
1911 context->target_sid = 0; 1843 context->filterkey = NULL;
1912 context->sockaddr_len = 0;
1913 context->type = 0;
1914 context->fds[0] = -1;
1915 if (context->state != AUDIT_RECORD_CONTEXT) {
1916 kfree(context->filterkey);
1917 context->filterkey = NULL;
1918 }
1919 tsk->audit_context = context;
1920 } 1844 }
1845 tsk->audit_context = context;
1921} 1846}
1922 1847
1923static inline void handle_one(const struct inode *inode) 1848static inline void handle_one(const struct inode *inode)
@@ -2009,7 +1934,8 @@ retry:
2009#endif 1934#endif
2010} 1935}
2011 1936
2012static struct audit_names *audit_alloc_name(struct audit_context *context) 1937static struct audit_names *audit_alloc_name(struct audit_context *context,
1938 unsigned char type)
2013{ 1939{
2014 struct audit_names *aname; 1940 struct audit_names *aname;
2015 1941
@@ -2024,6 +1950,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
2024 } 1950 }
2025 1951
2026 aname->ino = (unsigned long)-1; 1952 aname->ino = (unsigned long)-1;
1953 aname->type = type;
2027 list_add_tail(&aname->list, &context->names_list); 1954 list_add_tail(&aname->list, &context->names_list);
2028 1955
2029 context->name_count++; 1956 context->name_count++;
@@ -2034,13 +1961,36 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
2034} 1961}
2035 1962
2036/** 1963/**
1964 * audit_reusename - fill out filename with info from existing entry
1965 * @uptr: userland ptr to pathname
1966 *
1967 * Search the audit_names list for the current audit context. If there is an
1968 * existing entry with a matching "uptr" then return the filename
1969 * associated with that audit_name. If not, return NULL.
1970 */
1971struct filename *
1972__audit_reusename(const __user char *uptr)
1973{
1974 struct audit_context *context = current->audit_context;
1975 struct audit_names *n;
1976
1977 list_for_each_entry(n, &context->names_list, list) {
1978 if (!n->name)
1979 continue;
1980 if (n->name->uptr == uptr)
1981 return n->name;
1982 }
1983 return NULL;
1984}
1985
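
A userspace sketch of the reuse-by-pointer lookup that __audit_reusename() performs above, under simplified assumptions: an array instead of the kernel's names_list and bare pointers instead of the full struct filename.

/* If a name for this syscall was already copied in for the same userland
 * pointer, hand back the cached entry instead of copying it again. */
#include <stdio.h>
#include <stddef.h>

struct filename { const void *uptr; const char *name; };
struct audit_names { struct filename *name; };

static struct filename *audit_reusename(struct audit_names *names, size_t count,
                                        const void *uptr)
{
    for (size_t i = 0; i < count; i++) {
        if (!names[i].name)
            continue;
        if (names[i].name->uptr == uptr)   /* same userland pointer => same name */
            return names[i].name;
    }
    return NULL;
}

int main(void)
{
    const char user_path[] = "/etc/passwd";    /* stands in for a __user pointer */
    struct filename fn = { .uptr = user_path, .name = "/etc/passwd" };
    struct audit_names table[] = { { &fn }, { NULL } };

    struct filename *hit = audit_reusename(table, 2, user_path);
    printf("reused: %s\n", hit ? hit->name : "(miss)");
    return 0;
}
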
1986/**
2037 * audit_getname - add a name to the list 1987 * audit_getname - add a name to the list
2038 * @name: name to add 1988 * @name: name to add
2039 * 1989 *
2040 * Add a name to the list of audit names for this context. 1990 * Add a name to the list of audit names for this context.
2041 * Called from fs/namei.c:getname(). 1991 * Called from fs/namei.c:getname().
2042 */ 1992 */
2043void __audit_getname(const char *name) 1993void __audit_getname(struct filename *name)
2044{ 1994{
2045 struct audit_context *context = current->audit_context; 1995 struct audit_context *context = current->audit_context;
2046 struct audit_names *n; 1996 struct audit_names *n;
@@ -2054,13 +2004,19 @@ void __audit_getname(const char *name)
2054 return; 2004 return;
2055 } 2005 }
2056 2006
2057 n = audit_alloc_name(context); 2007#if AUDIT_DEBUG
2008 /* The filename _must_ have a populated ->name */
2009 BUG_ON(!name->name);
2010#endif
2011
2012 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
2058 if (!n) 2013 if (!n)
2059 return; 2014 return;
2060 2015
2061 n->name = name; 2016 n->name = name;
2062 n->name_len = AUDIT_NAME_FULL; 2017 n->name_len = AUDIT_NAME_FULL;
2063 n->name_put = true; 2018 n->name_put = true;
2019 name->aname = n;
2064 2020
2065 if (!context->pwd.dentry) 2021 if (!context->pwd.dentry)
2066 get_fs_pwd(current->fs, &context->pwd); 2022 get_fs_pwd(current->fs, &context->pwd);
@@ -2073,7 +2029,7 @@ void __audit_getname(const char *name)
2073 * then we delay the putname until syscall exit. 2029 * then we delay the putname until syscall exit.
2074 * Called from include/linux/fs.h:putname(). 2030 * Called from include/linux/fs.h:putname().
2075 */ 2031 */
2076void audit_putname(const char *name) 2032void audit_putname(struct filename *name)
2077{ 2033{
2078 struct audit_context *context = current->audit_context; 2034 struct audit_context *context = current->audit_context;
2079 2035
@@ -2088,7 +2044,7 @@ void audit_putname(const char *name)
2088 2044
2089 list_for_each_entry(n, &context->names_list, list) 2045 list_for_each_entry(n, &context->names_list, list)
2090 printk(KERN_ERR "name[%d] = %p = %s\n", i, 2046 printk(KERN_ERR "name[%d] = %p = %s\n", i,
2091 n->name, n->name ?: "(null)"); 2047 n->name, n->name->name ?: "(null)");
2092 } 2048 }
2093#endif 2049#endif
2094 __putname(name); 2050 __putname(name);
@@ -2102,8 +2058,8 @@ void audit_putname(const char *name)
2102 " put_count=%d\n", 2058 " put_count=%d\n",
2103 __FILE__, __LINE__, 2059 __FILE__, __LINE__,
2104 context->serial, context->major, 2060 context->serial, context->major,
2105 context->in_syscall, name, context->name_count, 2061 context->in_syscall, name->name,
2106 context->put_count); 2062 context->name_count, context->put_count);
2107 dump_stack(); 2063 dump_stack();
2108 } 2064 }
2109 } 2065 }
@@ -2146,13 +2102,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
2146} 2102}
2147 2103
2148/** 2104/**
2149 * audit_inode - store the inode and device from a lookup 2105 * __audit_inode - store the inode and device from a lookup
2150 * @name: name being audited 2106 * @name: name being audited
2151 * @dentry: dentry being audited 2107 * @dentry: dentry being audited
2152 * 2108 * @parent: does this dentry represent the parent?
2153 * Called from fs/namei.c:path_lookup().
2154 */ 2109 */
2155void __audit_inode(const char *name, const struct dentry *dentry) 2110void __audit_inode(struct filename *name, const struct dentry *dentry,
2111 unsigned int parent)
2156{ 2112{
2157 struct audit_context *context = current->audit_context; 2113 struct audit_context *context = current->audit_context;
2158 const struct inode *inode = dentry->d_inode; 2114 const struct inode *inode = dentry->d_inode;
@@ -2161,24 +2117,69 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2161 if (!context->in_syscall) 2117 if (!context->in_syscall)
2162 return; 2118 return;
2163 2119
2120 if (!name)
2121 goto out_alloc;
2122
2123#if AUDIT_DEBUG
2124 /* The struct filename _must_ have a populated ->name */
2125 BUG_ON(!name->name);
2126#endif
2127 /*
2128 * If we have a pointer to an audit_names entry already, then we can
2129 * just use it directly if the type is correct.
2130 */
2131 n = name->aname;
2132 if (n) {
2133 if (parent) {
2134 if (n->type == AUDIT_TYPE_PARENT ||
2135 n->type == AUDIT_TYPE_UNKNOWN)
2136 goto out;
2137 } else {
2138 if (n->type != AUDIT_TYPE_PARENT)
2139 goto out;
2140 }
2141 }
2142
2164 list_for_each_entry_reverse(n, &context->names_list, list) { 2143 list_for_each_entry_reverse(n, &context->names_list, list) {
2165 if (n->name && (n->name == name)) 2144 /* does the name pointer match? */
2166 goto out; 2145 if (!n->name || n->name->name != name->name)
2146 continue;
2147
2148 /* match the correct record type */
2149 if (parent) {
2150 if (n->type == AUDIT_TYPE_PARENT ||
2151 n->type == AUDIT_TYPE_UNKNOWN)
2152 goto out;
2153 } else {
2154 if (n->type != AUDIT_TYPE_PARENT)
2155 goto out;
2156 }
2167 } 2157 }
2168 2158
2169 /* unable to find the name from a previous getname() */ 2159out_alloc:
2170 n = audit_alloc_name(context); 2160 /* unable to find the name from a previous getname(). Allocate a new
2161 * anonymous entry.
2162 */
2163 n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
2171 if (!n) 2164 if (!n)
2172 return; 2165 return;
2173out: 2166out:
2167 if (parent) {
2168 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
2169 n->type = AUDIT_TYPE_PARENT;
2170 } else {
2171 n->name_len = AUDIT_NAME_FULL;
2172 n->type = AUDIT_TYPE_NORMAL;
2173 }
2174 handle_path(dentry); 2174 handle_path(dentry);
2175 audit_copy_inode(n, dentry, inode); 2175 audit_copy_inode(n, dentry, inode);
2176} 2176}
2177 2177
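
The matching logic added to __audit_inode() reduces to one predicate, applied both to the cached name->aname entry and to the list walk: when recording a parent directory, an entry of type PARENT or UNKNOWN can be promoted; otherwise only a non-PARENT entry qualifies. Restated on its own, with constant names local to the sketch:

/* The type-matching rule used twice in __audit_inode() above, extracted as a
 * predicate. */
#include <stdio.h>

enum name_type { TYPE_UNKNOWN, TYPE_NORMAL, TYPE_PARENT };

static int entry_matches(enum name_type entry, int want_parent)
{
    if (want_parent)
        return entry == TYPE_PARENT || entry == TYPE_UNKNOWN;
    return entry != TYPE_PARENT;
}

int main(void)
{
    printf("%d %d %d\n",
           entry_matches(TYPE_UNKNOWN, 1),   /* 1: may become the parent record */
           entry_matches(TYPE_PARENT, 0),    /* 0: a parent entry cannot serve as the object */
           entry_matches(TYPE_NORMAL, 0));   /* 1 */
    return 0;
}
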
2178/** 2178/**
2179 * audit_inode_child - collect inode info for created/removed objects 2179 * __audit_inode_child - collect inode info for created/removed objects
2180 * @dentry: dentry being audited
2181 * @parent: inode of dentry parent 2180 * @parent: inode of dentry parent
2181 * @dentry: dentry being audited
2182 * @type: AUDIT_TYPE_* value that we're looking for
2182 * 2183 *
2183 * For syscalls that create or remove filesystem objects, audit_inode 2184 * For syscalls that create or remove filesystem objects, audit_inode
2184 * can only collect information for the filesystem object's parent. 2185 * can only collect information for the filesystem object's parent.
@@ -2188,15 +2189,14 @@ out:
2188 * must be hooked prior, in order to capture the target inode during 2189 * must be hooked prior, in order to capture the target inode during
2189 * unsuccessful attempts. 2190 * unsuccessful attempts.
2190 */ 2191 */
2191void __audit_inode_child(const struct dentry *dentry, 2192void __audit_inode_child(const struct inode *parent,
2192 const struct inode *parent) 2193 const struct dentry *dentry,
2194 const unsigned char type)
2193{ 2195{
2194 struct audit_context *context = current->audit_context; 2196 struct audit_context *context = current->audit_context;
2195 const char *found_parent = NULL, *found_child = NULL;
2196 const struct inode *inode = dentry->d_inode; 2197 const struct inode *inode = dentry->d_inode;
2197 const char *dname = dentry->d_name.name; 2198 const char *dname = dentry->d_name.name;
2198 struct audit_names *n; 2199 struct audit_names *n, *found_parent = NULL, *found_child = NULL;
2199 int dirlen = 0;
2200 2200
2201 if (!context->in_syscall) 2201 if (!context->in_syscall)
2202 return; 2202 return;
@@ -2204,62 +2204,65 @@ void __audit_inode_child(const struct dentry *dentry,
2204 if (inode) 2204 if (inode)
2205 handle_one(inode); 2205 handle_one(inode);
2206 2206
2207 /* parent is more likely, look for it first */ 2207 /* look for a parent entry first */
2208 list_for_each_entry(n, &context->names_list, list) { 2208 list_for_each_entry(n, &context->names_list, list) {
2209 if (!n->name) 2209 if (!n->name || n->type != AUDIT_TYPE_PARENT)
2210 continue; 2210 continue;
2211 2211
2212 if (n->ino == parent->i_ino && 2212 if (n->ino == parent->i_ino &&
2213 !audit_compare_dname_path(dname, n->name, &dirlen)) { 2213 !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
2214 n->name_len = dirlen; /* update parent data in place */ 2214 found_parent = n;
2215 found_parent = n->name; 2215 break;
2216 goto add_names;
2217 } 2216 }
2218 } 2217 }
2219 2218
2220 /* no matching parent, look for matching child */ 2219 /* is there a matching child entry? */
2221 list_for_each_entry(n, &context->names_list, list) { 2220 list_for_each_entry(n, &context->names_list, list) {
2222 if (!n->name) 2221 /* can only match entries that have a name */
2222 if (!n->name || n->type != type)
2223 continue; 2223 continue;
2224 2224
2225 /* strcmp() is the more likely scenario */ 2225 /* if we found a parent, make sure this one is a child of it */
2226 if (!strcmp(dname, n->name) || 2226 if (found_parent && (n->name != found_parent->name))
2227 !audit_compare_dname_path(dname, n->name, &dirlen)) { 2227 continue;
2228 if (inode) 2228
2229 audit_copy_inode(n, NULL, inode); 2229 if (!strcmp(dname, n->name->name) ||
2230 else 2230 !audit_compare_dname_path(dname, n->name->name,
2231 n->ino = (unsigned long)-1; 2231 found_parent ?
2232 found_child = n->name; 2232 found_parent->name_len :
2233 goto add_names; 2233 AUDIT_NAME_FULL)) {
2234 found_child = n;
2235 break;
2234 } 2236 }
2235 } 2237 }
2236 2238
2237add_names:
2238 if (!found_parent) { 2239 if (!found_parent) {
2239 n = audit_alloc_name(context); 2240 /* create a new, "anonymous" parent record */
2241 n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
2240 if (!n) 2242 if (!n)
2241 return; 2243 return;
2242 audit_copy_inode(n, NULL, parent); 2244 audit_copy_inode(n, NULL, parent);
2243 } 2245 }
2244 2246
2245 if (!found_child) { 2247 if (!found_child) {
2246 n = audit_alloc_name(context); 2248 found_child = audit_alloc_name(context, type);
2247 if (!n) 2249 if (!found_child)
2248 return; 2250 return;
2249 2251
2250 /* Re-use the name belonging to the slot for a matching parent 2252 /* Re-use the name belonging to the slot for a matching parent
2251 * directory. All names for this context are relinquished in 2253 * directory. All names for this context are relinquished in
2252 * audit_free_names() */ 2254 * audit_free_names() */
2253 if (found_parent) { 2255 if (found_parent) {
2254 n->name = found_parent; 2256 found_child->name = found_parent->name;
2255 n->name_len = AUDIT_NAME_FULL; 2257 found_child->name_len = AUDIT_NAME_FULL;
2256 /* don't call __putname() */ 2258 /* don't call __putname() */
2257 n->name_put = false; 2259 found_child->name_put = false;
2258 } 2260 }
2259
2260 if (inode)
2261 audit_copy_inode(n, NULL, inode);
2262 } 2261 }
2262 if (inode)
2263 audit_copy_inode(found_child, dentry, inode);
2264 else
2265 found_child->ino = (unsigned long)-1;
2263} 2266}
2264EXPORT_SYMBOL_GPL(__audit_inode_child); 2267EXPORT_SYMBOL_GPL(__audit_inode_child);
2265 2268
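
A loose sketch of the name comparison that __audit_inode_child() relies on above: deciding whether a directory entry name matches the final component of a stored pathname, which is roughly the question audit_compare_dname_path() answers. last_component() below is a local helper, not a kernel API, and the parent-length handling of the real function is omitted.

/* Matching a dentry name ("passwd") against a stored pathname ("/etc/passwd"),
 * in the spirit of the audit_compare_dname_path() calls above. */
#include <stdio.h>
#include <string.h>

static const char *last_component(const char *path)
{
    const char *slash = strrchr(path, '/');
    return slash ? slash + 1 : path;
}

static int dname_matches_path(const char *dname, const char *path)
{
    return strcmp(dname, last_component(path)) == 0;
}

int main(void)
{
    printf("%d\n", dname_matches_path("passwd", "/etc/passwd"));   /* 1 */
    printf("%d\n", dname_matches_path("passwd", "/etc/shadow"));   /* 0 */
    return 0;
}
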
@@ -2299,14 +2302,14 @@ static atomic_t session_id = ATOMIC_INIT(0);
2299 * 2302 *
2300 * Called (set) from fs/proc/base.c::proc_loginuid_write(). 2303 * Called (set) from fs/proc/base.c::proc_loginuid_write().
2301 */ 2304 */
2302int audit_set_loginuid(uid_t loginuid) 2305int audit_set_loginuid(kuid_t loginuid)
2303{ 2306{
2304 struct task_struct *task = current; 2307 struct task_struct *task = current;
2305 struct audit_context *context = task->audit_context; 2308 struct audit_context *context = task->audit_context;
2306 unsigned int sessionid; 2309 unsigned int sessionid;
2307 2310
2308#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE 2311#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
2309 if (task->loginuid != -1) 2312 if (uid_valid(task->loginuid))
2310 return -EPERM; 2313 return -EPERM;
2311#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ 2314#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2312 if (!capable(CAP_AUDIT_CONTROL)) 2315 if (!capable(CAP_AUDIT_CONTROL))
@@ -2322,8 +2325,10 @@ int audit_set_loginuid(uid_t loginuid)
2322 audit_log_format(ab, "login pid=%d uid=%u " 2325 audit_log_format(ab, "login pid=%d uid=%u "
2323 "old auid=%u new auid=%u" 2326 "old auid=%u new auid=%u"
2324 " old ses=%u new ses=%u", 2327 " old ses=%u new ses=%u",
2325 task->pid, task_uid(task), 2328 task->pid,
2326 task->loginuid, loginuid, 2329 from_kuid(&init_user_ns, task_uid(task)),
2330 from_kuid(&init_user_ns, task->loginuid),
2331 from_kuid(&init_user_ns, loginuid),
2327 task->sessionid, sessionid); 2332 task->sessionid, sessionid);
2328 audit_log_end(ab); 2333 audit_log_end(ab);
2329 } 2334 }
@@ -2546,12 +2551,12 @@ int __audit_signal_info(int sig, struct task_struct *t)
2546 struct audit_aux_data_pids *axp; 2551 struct audit_aux_data_pids *axp;
2547 struct task_struct *tsk = current; 2552 struct task_struct *tsk = current;
2548 struct audit_context *ctx = tsk->audit_context; 2553 struct audit_context *ctx = tsk->audit_context;
2549 uid_t uid = current_uid(), t_uid = task_uid(t); 2554 kuid_t uid = current_uid(), t_uid = task_uid(t);
2550 2555
2551 if (audit_pid && t->tgid == audit_pid) { 2556 if (audit_pid && t->tgid == audit_pid) {
2552 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { 2557 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
2553 audit_sig_pid = tsk->pid; 2558 audit_sig_pid = tsk->pid;
2554 if (tsk->loginuid != -1) 2559 if (uid_valid(tsk->loginuid))
2555 audit_sig_uid = tsk->loginuid; 2560 audit_sig_uid = tsk->loginuid;
2556 else 2561 else
2557 audit_sig_uid = uid; 2562 audit_sig_uid = uid;
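
Both loginuid checks in this patch switch from comparing against a raw -1 to uid_valid(), because an opaque kuid_t has no raw value to compare; the unset state is an INVALID_UID sentinel. A toy model of that change; the sentinel below mirrors the idea, not the kernel's exact definition.

/* With an opaque kuid_t there is no -1 to test directly, so "is this set?"
 * becomes a helper on the wrapper type. */
#include <stdio.h>
#include <stdbool.h>

typedef struct { unsigned int val; } kuid_t;

#define INVALID_UID ((kuid_t){ .val = (unsigned int)-1 })

static bool uid_valid(kuid_t k)
{
    return k.val != (unsigned int)-1;
}

int main(void)
{
    kuid_t unset = INVALID_UID;
    kuid_t login = { 1000 };

    printf("unset valid: %d\n", uid_valid(unset));   /* 0 */
    printf("login valid: %d\n", uid_valid(login));   /* 1 */
    return 0;
}
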
@@ -2672,8 +2677,8 @@ void __audit_mmap_fd(int fd, int flags)
2672 2677
2673static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) 2678static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2674{ 2679{
2675 uid_t auid, uid; 2680 kuid_t auid, uid;
2676 gid_t gid; 2681 kgid_t gid;
2677 unsigned int sessionid; 2682 unsigned int sessionid;
2678 2683
2679 auid = audit_get_loginuid(current); 2684 auid = audit_get_loginuid(current);
@@ -2681,7 +2686,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2681 current_uid_gid(&uid, &gid); 2686 current_uid_gid(&uid, &gid);
2682 2687
2683 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2688 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2684 auid, uid, gid, sessionid); 2689 from_kuid(&init_user_ns, auid),
2690 from_kuid(&init_user_ns, uid),
2691 from_kgid(&init_user_ns, gid),
2692 sessionid);
2685 audit_log_task_context(ab); 2693 audit_log_task_context(ab);
2686 audit_log_format(ab, " pid=%d comm=", current->pid); 2694 audit_log_format(ab, " pid=%d comm=", current->pid);
2687 audit_log_untrustedstring(ab, current->comm); 2695 audit_log_untrustedstring(ab, current->comm);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 79818507e444..4855892798fd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -88,11 +88,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
88 88
89/* 89/*
90 * Generate an array of cgroup subsystem pointers. At boot time, this is 90 * Generate an array of cgroup subsystem pointers. At boot time, this is
91 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are 91 * populated with the built in subsystems, and modular subsystems are
92 * registered after that. The mutable section of this array is protected by 92 * registered after that. The mutable section of this array is protected by
93 * cgroup_mutex. 93 * cgroup_mutex.
94 */ 94 */
95#define SUBSYS(_x) &_x ## _subsys, 95#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
96#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
96static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 97static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
97#include <linux/cgroup_subsys.h> 98#include <linux/cgroup_subsys.h>
98}; 99};
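
The SUBSYS() change above moves the subsystem table from positional to designated initializers, so each pointer lands at its own *_subsys_id regardless of include order, with IS_SUBSYS_ENABLED() narrowing the same include to built-in entries at boot. A compact userspace demonstration of the list-macro plus designated-initializer pattern; the subsystem names below are invented stand-ins for linux/cgroup_subsys.h.

/* One list macro expanded twice: first to build an enum of ids, then to
 * build a table indexed by those ids using designated initializers. */
#include <stdio.h>

#define SUBSYS_LIST(x) x(cpu) x(memory) x(freezer)

/* First expansion: the ids. */
#define SUBSYS(name) name##_subsys_id,
enum { SUBSYS_LIST(SUBSYS) SUBSYS_COUNT };
#undef SUBSYS

struct subsys { const char *name; };

static struct subsys cpu_subsys = { "cpu" };
static struct subsys memory_subsys = { "memory" };
static struct subsys freezer_subsys = { "freezer" };

/* Second expansion: the table, each pointer placed at its own id. */
#define SUBSYS(name) [name##_subsys_id] = &name##_subsys,
static struct subsys *subsys[SUBSYS_COUNT] = { SUBSYS_LIST(SUBSYS) };
#undef SUBSYS

int main(void)
{
    for (int i = 0; i < SUBSYS_COUNT; i++)
        printf("%d: %s\n", i, subsys[i] ? subsys[i]->name : "(registered later)");
    return 0;
}
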
@@ -111,13 +112,13 @@ struct cgroupfs_root {
111 * The bitmask of subsystems intended to be attached to this 112 * The bitmask of subsystems intended to be attached to this
112 * hierarchy 113 * hierarchy
113 */ 114 */
114 unsigned long subsys_bits; 115 unsigned long subsys_mask;
115 116
116 /* Unique id for this hierarchy. */ 117 /* Unique id for this hierarchy. */
117 int hierarchy_id; 118 int hierarchy_id;
118 119
119 /* The bitmask of subsystems currently attached to this hierarchy */ 120 /* The bitmask of subsystems currently attached to this hierarchy */
120 unsigned long actual_subsys_bits; 121 unsigned long actual_subsys_mask;
121 122
122 /* A list running through the attached subsystems */ 123 /* A list running through the attached subsystems */
123 struct list_head subsys_list; 124 struct list_head subsys_list;
@@ -137,6 +138,9 @@ struct cgroupfs_root {
137 /* Hierarchy-specific flags */ 138 /* Hierarchy-specific flags */
138 unsigned long flags; 139 unsigned long flags;
139 140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
140 /* The path to use for release notifications. */ 144 /* The path to use for release notifications. */
141 char release_agent_path[PATH_MAX]; 145 char release_agent_path[PATH_MAX];
142 146
@@ -170,8 +174,8 @@ struct css_id {
170 * The css to which this ID points. This pointer is set to valid value 174 * The css to which this ID points. This pointer is set to valid value
171 * after cgroup is populated. If cgroup is removed, this will be NULL. 175 * after cgroup is populated. If cgroup is removed, this will be NULL.
172 * This pointer is expected to be RCU-safe because destroy() 176 * This pointer is expected to be RCU-safe because destroy()
173 * is called after synchronize_rcu(). But for safe use, css_is_removed() 177 * is called after synchronize_rcu(). But for safe use, css_tryget()
174 * css_tryget() should be used for avoiding race. 178 * should be used for avoiding race.
175 */ 179 */
176 struct cgroup_subsys_state __rcu *css; 180 struct cgroup_subsys_state __rcu *css;
177 /* 181 /*
@@ -241,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
241 */ 245 */
242static int need_forkexit_callback __read_mostly; 246static int need_forkexit_callback __read_mostly;
243 247
248static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add);
251
244#ifdef CONFIG_PROVE_LOCKING 252#ifdef CONFIG_PROVE_LOCKING
245int cgroup_lock_is_held(void) 253int cgroup_lock_is_held(void)
246{ 254{
@@ -276,7 +284,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
276 284
277/* bits in struct cgroupfs_root flags field */ 285/* bits in struct cgroupfs_root flags field */
278enum { 286enum {
279 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 287 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
288 ROOT_XATTR, /* supports extended attributes */
280}; 289};
281 290
282static int cgroup_is_releasable(const struct cgroup *cgrp) 291static int cgroup_is_releasable(const struct cgroup *cgrp)
@@ -292,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
292 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 301 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
293} 302}
294 303
295static int clone_children(const struct cgroup *cgrp)
296{
297 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
298}
299
300/* 304/*
301 * for_each_subsys() allows you to iterate on each subsystem attached to 305 * for_each_subsys() allows you to iterate on each subsystem attached to
302 * an active hierarchy 306 * an active hierarchy
@@ -556,7 +560,7 @@ static struct css_set *find_existing_css_set(
556 * won't change, so no need for locking. 560 * won't change, so no need for locking.
557 */ 561 */
558 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 562 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
559 if (root->subsys_bits & (1UL << i)) { 563 if (root->subsys_mask & (1UL << i)) {
560 /* Subsystem is in this hierarchy. So we want 564 /* Subsystem is in this hierarchy. So we want
561 * the subsystem state from the new 565 * the subsystem state from the new
562 * cgroup */ 566 * cgroup */
@@ -780,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
780 * The task_lock() exception 784 * The task_lock() exception
781 * 785 *
782 * The need for this exception arises from the action of 786 * The need for this exception arises from the action of
783 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with 787 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
784 * another. It does so using cgroup_mutex, however there are 788 * another. It does so using cgroup_mutex, however there are
785 * several performance critical places that need to reference 789 * several performance critical places that need to reference
786 * task->cgroup without the expense of grabbing a system global 790 * task->cgroup without the expense of grabbing a system global
787 * mutex. Therefore except as noted below, when dereferencing or, as 791 * mutex. Therefore except as noted below, when dereferencing or, as
788 * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use 792 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
789 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 793 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
790 * the task_struct routinely used for such matters. 794 * the task_struct routinely used for such matters.
791 * 795 *
@@ -824,7 +828,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
824static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 828static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
825static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); 829static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
826static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 830static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
827static int cgroup_populate_dir(struct cgroup *cgrp); 831static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
832 unsigned long subsys_mask);
828static const struct inode_operations cgroup_dir_inode_operations; 833static const struct inode_operations cgroup_dir_inode_operations;
829static const struct file_operations proc_cgroupstats_operations; 834static const struct file_operations proc_cgroupstats_operations;
830 835
@@ -851,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
851 return inode; 856 return inode;
852} 857}
853 858
854/*
855 * Call subsys's pre_destroy handler.
856 * This is called before css refcnt check.
857 */
858static int cgroup_call_pre_destroy(struct cgroup *cgrp)
859{
860 struct cgroup_subsys *ss;
861 int ret = 0;
862
863 for_each_subsys(cgrp->root, ss) {
864 if (!ss->pre_destroy)
865 continue;
866
867 ret = ss->pre_destroy(cgrp);
868 if (ret) {
869 /* ->pre_destroy() failure is being deprecated */
870 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
871 break;
872 }
873 }
874
875 return ret;
876}
877
878static void cgroup_diput(struct dentry *dentry, struct inode *inode) 859static void cgroup_diput(struct dentry *dentry, struct inode *inode)
879{ 860{
880 /* is dentry a directory ? if so, kfree() associated cgroup */ 861 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -895,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
895 * Release the subsystem state objects. 876 * Release the subsystem state objects.
896 */ 877 */
897 for_each_subsys(cgrp->root, ss) 878 for_each_subsys(cgrp->root, ss)
898 ss->destroy(cgrp); 879 ss->css_free(cgrp);
899 880
900 cgrp->root->number_of_cgroups--; 881 cgrp->root->number_of_cgroups--;
901 mutex_unlock(&cgroup_mutex); 882 mutex_unlock(&cgroup_mutex);
@@ -912,15 +893,20 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
912 */ 893 */
913 BUG_ON(!list_empty(&cgrp->pidlists)); 894 BUG_ON(!list_empty(&cgrp->pidlists));
914 895
896 simple_xattrs_free(&cgrp->xattrs);
897
898 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
915 kfree_rcu(cgrp, rcu_head); 899 kfree_rcu(cgrp, rcu_head);
916 } else { 900 } else {
917 struct cfent *cfe = __d_cfe(dentry); 901 struct cfent *cfe = __d_cfe(dentry);
918 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 902 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
903 struct cftype *cft = cfe->type;
919 904
920 WARN_ONCE(!list_empty(&cfe->node) && 905 WARN_ONCE(!list_empty(&cfe->node) &&
921 cgrp != &cgrp->root->top_cgroup, 906 cgrp != &cgrp->root->top_cgroup,
922 "cfe still linked for %s\n", cfe->type->name); 907 "cfe still linked for %s\n", cfe->type->name);
923 kfree(cfe); 908 kfree(cfe);
909 simple_xattrs_free(&cft->xattrs);
924 } 910 }
925 iput(inode); 911 iput(inode);
926} 912}
@@ -963,12 +949,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
963 return -ENOENT; 949 return -ENOENT;
964} 950}
965 951
966static void cgroup_clear_directory(struct dentry *dir) 952/**
953 * cgroup_clear_directory - selective removal of base and subsystem files
954 * @dir: directory containing the files
955 * @base_files: true if the base files should be removed
956 * @subsys_mask: mask of the subsystem ids whose files should be removed
957 */
958static void cgroup_clear_directory(struct dentry *dir, bool base_files,
959 unsigned long subsys_mask)
967{ 960{
968 struct cgroup *cgrp = __d_cgrp(dir); 961 struct cgroup *cgrp = __d_cgrp(dir);
962 struct cgroup_subsys *ss;
969 963
970 while (!list_empty(&cgrp->files)) 964 for_each_subsys(cgrp->root, ss) {
971 cgroup_rm_file(cgrp, NULL); 965 struct cftype_set *set;
966 if (!test_bit(ss->subsys_id, &subsys_mask))
967 continue;
968 list_for_each_entry(set, &ss->cftsets, node)
969 cgroup_addrm_files(cgrp, NULL, set->cfts, false);
970 }
971 if (base_files) {
972 while (!list_empty(&cgrp->files))
973 cgroup_rm_file(cgrp, NULL);
974 }
972} 975}
973 976
974/* 977/*
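
cgroup_clear_directory() now removes only the control files of subsystems whose id is set in subsys_mask, plus optionally the base files. A small sketch of driving work from such a bitmask; test_bit() here is a plain shift-and-mask stand-in for the kernel bitop, and the subsystem names are illustrative.

/* Mask-driven selective removal, as in cgroup_clear_directory() above. */
#include <stdio.h>

static int test_bit(int nr, const unsigned long *mask)
{
    return (*mask >> nr) & 1UL;
}

static const char *names[] = { "cpuset", "cpu", "cpuacct", "memory" };

static void clear_directory(unsigned long subsys_mask, int base_files)
{
    for (int id = 0; id < 4; id++) {
        if (!test_bit(id, &subsys_mask))
            continue;                          /* subsystem not selected: keep its files */
        printf("removing %s files\n", names[id]);
    }
    if (base_files)
        printf("removing base cgroup files\n");
}

int main(void)
{
    unsigned long mask = (1UL << 1) | (1UL << 3);   /* cpu + memory */
    clear_directory(mask, 0);
    return 0;
}
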
@@ -977,8 +980,9 @@ static void cgroup_clear_directory(struct dentry *dir)
977static void cgroup_d_remove_dir(struct dentry *dentry) 980static void cgroup_d_remove_dir(struct dentry *dentry)
978{ 981{
979 struct dentry *parent; 982 struct dentry *parent;
983 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
980 984
981 cgroup_clear_directory(dentry); 985 cgroup_clear_directory(dentry, true, root->subsys_mask);
982 986
983 parent = dentry->d_parent; 987 parent = dentry->d_parent;
984 spin_lock(&parent->d_lock); 988 spin_lock(&parent->d_lock);
@@ -990,54 +994,27 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
990} 994}
991 995
992/* 996/*
993 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
994 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
995 * reference to css->refcnt. In general, this refcnt is expected to goes down
996 * to zero, soon.
997 *
998 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
999 */
1000static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
1001
1002static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
1003{
1004 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
1005 wake_up_all(&cgroup_rmdir_waitq);
1006}
1007
1008void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
1009{
1010 css_get(css);
1011}
1012
1013void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1014{
1015 cgroup_wakeup_rmdir_waiter(css->cgroup);
1016 css_put(css);
1017}
1018
1019/*
1020 * Call with cgroup_mutex held. Drops reference counts on modules, including 997 * Call with cgroup_mutex held. Drops reference counts on modules, including
1021 * any duplicate ones that parse_cgroupfs_options took. If this function 998 * any duplicate ones that parse_cgroupfs_options took. If this function
1022 * returns an error, no reference counts are touched. 999 * returns an error, no reference counts are touched.
1023 */ 1000 */
1024static int rebind_subsystems(struct cgroupfs_root *root, 1001static int rebind_subsystems(struct cgroupfs_root *root,
1025 unsigned long final_bits) 1002 unsigned long final_subsys_mask)
1026{ 1003{
1027 unsigned long added_bits, removed_bits; 1004 unsigned long added_mask, removed_mask;
1028 struct cgroup *cgrp = &root->top_cgroup; 1005 struct cgroup *cgrp = &root->top_cgroup;
1029 int i; 1006 int i;
1030 1007
1031 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1008 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1032 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1009 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1033 1010
1034 removed_bits = root->actual_subsys_bits & ~final_bits; 1011 removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
1035 added_bits = final_bits & ~root->actual_subsys_bits; 1012 added_mask = final_subsys_mask & ~root->actual_subsys_mask;
1036 /* Check that any added subsystems are currently free */ 1013 /* Check that any added subsystems are currently free */
1037 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1014 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1038 unsigned long bit = 1UL << i; 1015 unsigned long bit = 1UL << i;
1039 struct cgroup_subsys *ss = subsys[i]; 1016 struct cgroup_subsys *ss = subsys[i];
1040 if (!(bit & added_bits)) 1017 if (!(bit & added_mask))
1041 continue; 1018 continue;
1042 /* 1019 /*
1043 * Nobody should tell us to do a subsys that doesn't exist: 1020 * Nobody should tell us to do a subsys that doesn't exist:
@@ -1062,7 +1039,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1062 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1039 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1063 struct cgroup_subsys *ss = subsys[i]; 1040 struct cgroup_subsys *ss = subsys[i];
1064 unsigned long bit = 1UL << i; 1041 unsigned long bit = 1UL << i;
1065 if (bit & added_bits) { 1042 if (bit & added_mask) {
1066 /* We're binding this subsystem to this hierarchy */ 1043 /* We're binding this subsystem to this hierarchy */
1067 BUG_ON(ss == NULL); 1044 BUG_ON(ss == NULL);
1068 BUG_ON(cgrp->subsys[i]); 1045 BUG_ON(cgrp->subsys[i]);
@@ -1075,7 +1052,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1075 if (ss->bind) 1052 if (ss->bind)
1076 ss->bind(cgrp); 1053 ss->bind(cgrp);
1077 /* refcount was already taken, and we're keeping it */ 1054 /* refcount was already taken, and we're keeping it */
1078 } else if (bit & removed_bits) { 1055 } else if (bit & removed_mask) {
1079 /* We're removing this subsystem */ 1056 /* We're removing this subsystem */
1080 BUG_ON(ss == NULL); 1057 BUG_ON(ss == NULL);
1081 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 1058 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
@@ -1088,7 +1065,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1088 list_move(&ss->sibling, &rootnode.subsys_list); 1065 list_move(&ss->sibling, &rootnode.subsys_list);
1089 /* subsystem is now free - drop reference on module */ 1066 /* subsystem is now free - drop reference on module */
1090 module_put(ss->module); 1067 module_put(ss->module);
1091 } else if (bit & final_bits) { 1068 } else if (bit & final_subsys_mask) {
1092 /* Subsystem state should already exist */ 1069 /* Subsystem state should already exist */
1093 BUG_ON(ss == NULL); 1070 BUG_ON(ss == NULL);
1094 BUG_ON(!cgrp->subsys[i]); 1071 BUG_ON(!cgrp->subsys[i]);
@@ -1105,7 +1082,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1105 BUG_ON(cgrp->subsys[i]); 1082 BUG_ON(cgrp->subsys[i]);
1106 } 1083 }
1107 } 1084 }
1108 root->subsys_bits = root->actual_subsys_bits = final_bits; 1085 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
1109 synchronize_rcu(); 1086 synchronize_rcu();
1110 1087
1111 return 0; 1088 return 0;
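
The added/removed mask computation at the top of rebind_subsystems() is plain set difference on bitmasks. A minimal userspace sketch of the same arithmetic, with made-up subsystem bit positions:

        #include <stdio.h>

        int main(void)
        {
                unsigned long actual = (1UL << 0) | (1UL << 2); /* currently bound: bits 0, 2 */
                unsigned long final  = (1UL << 0) | (1UL << 3); /* requested:       bits 0, 3 */

                unsigned long removed_mask = actual & ~final;   /* bit 2: unbind */
                unsigned long added_mask   = final & ~actual;   /* bit 3: bind */

                printf("added=%#lx removed=%#lx\n", added_mask, removed_mask);
                return 0;
        }
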
@@ -1121,9 +1098,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1121 seq_printf(seq, ",%s", ss->name); 1098 seq_printf(seq, ",%s", ss->name);
1122 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1099 if (test_bit(ROOT_NOPREFIX, &root->flags))
1123 seq_puts(seq, ",noprefix"); 1100 seq_puts(seq, ",noprefix");
1101 if (test_bit(ROOT_XATTR, &root->flags))
1102 seq_puts(seq, ",xattr");
1124 if (strlen(root->release_agent_path)) 1103 if (strlen(root->release_agent_path))
1125 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1104 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1126 if (clone_children(&root->top_cgroup)) 1105 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
1127 seq_puts(seq, ",clone_children"); 1106 seq_puts(seq, ",clone_children");
1128 if (strlen(root->name)) 1107 if (strlen(root->name))
1129 seq_printf(seq, ",name=%s", root->name); 1108 seq_printf(seq, ",name=%s", root->name);
@@ -1132,10 +1111,10 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1132} 1111}
1133 1112
1134struct cgroup_sb_opts { 1113struct cgroup_sb_opts {
1135 unsigned long subsys_bits; 1114 unsigned long subsys_mask;
1136 unsigned long flags; 1115 unsigned long flags;
1137 char *release_agent; 1116 char *release_agent;
1138 bool clone_children; 1117 bool cpuset_clone_children;
1139 char *name; 1118 char *name;
1140 /* User explicitly requested empty subsystem */ 1119 /* User explicitly requested empty subsystem */
1141 bool none; 1120 bool none;
@@ -1186,7 +1165,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1186 continue; 1165 continue;
1187 } 1166 }
1188 if (!strcmp(token, "clone_children")) { 1167 if (!strcmp(token, "clone_children")) {
1189 opts->clone_children = true; 1168 opts->cpuset_clone_children = true;
1169 continue;
1170 }
1171 if (!strcmp(token, "xattr")) {
1172 set_bit(ROOT_XATTR, &opts->flags);
1190 continue; 1173 continue;
1191 } 1174 }
1192 if (!strncmp(token, "release_agent=", 14)) { 1175 if (!strncmp(token, "release_agent=", 14)) {
@@ -1237,7 +1220,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1237 /* Mutually exclusive option 'all' + subsystem name */ 1220 /* Mutually exclusive option 'all' + subsystem name */
1238 if (all_ss) 1221 if (all_ss)
1239 return -EINVAL; 1222 return -EINVAL;
1240 set_bit(i, &opts->subsys_bits); 1223 set_bit(i, &opts->subsys_mask);
1241 one_ss = true; 1224 one_ss = true;
1242 1225
1243 break; 1226 break;
@@ -1258,7 +1241,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1258 continue; 1241 continue;
1259 if (ss->disabled) 1242 if (ss->disabled)
1260 continue; 1243 continue;
1261 set_bit(i, &opts->subsys_bits); 1244 set_bit(i, &opts->subsys_mask);
1262 } 1245 }
1263 } 1246 }
1264 1247
@@ -1270,19 +1253,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1270 * the cpuset subsystem. 1253 * the cpuset subsystem.
1271 */ 1254 */
1272 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1255 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
1273 (opts->subsys_bits & mask)) 1256 (opts->subsys_mask & mask))
1274 return -EINVAL; 1257 return -EINVAL;
1275 1258
1276 1259
1277 /* Can't specify "none" and some subsystems */ 1260 /* Can't specify "none" and some subsystems */
1278 if (opts->subsys_bits && opts->none) 1261 if (opts->subsys_mask && opts->none)
1279 return -EINVAL; 1262 return -EINVAL;
1280 1263
1281 /* 1264 /*
1282 * We either have to specify by name or by subsystems. (So all 1265 * We either have to specify by name or by subsystems. (So all
1283 * empty hierarchies must have a name). 1266 * empty hierarchies must have a name).
1284 */ 1267 */
1285 if (!opts->subsys_bits && !opts->name) 1268 if (!opts->subsys_mask && !opts->name)
1286 return -EINVAL; 1269 return -EINVAL;
1287 1270
1288 /* 1271 /*
@@ -1291,10 +1274,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1291 * take duplicate reference counts on a subsystem that's already used, 1274 * take duplicate reference counts on a subsystem that's already used,
1292 * but rebind_subsystems handles this case. 1275 * but rebind_subsystems handles this case.
1293 */ 1276 */
1294 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1277 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1295 unsigned long bit = 1UL << i; 1278 unsigned long bit = 1UL << i;
1296 1279
1297 if (!(bit & opts->subsys_bits)) 1280 if (!(bit & opts->subsys_mask))
1298 continue; 1281 continue;
1299 if (!try_module_get(subsys[i]->module)) { 1282 if (!try_module_get(subsys[i]->module)) {
1300 module_pin_failed = true; 1283 module_pin_failed = true;
@@ -1307,11 +1290,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1307 * raced with a module_delete call, and to the user this is 1290 * raced with a module_delete call, and to the user this is
1308 * essentially a "subsystem doesn't exist" case. 1291 * essentially a "subsystem doesn't exist" case.
1309 */ 1292 */
1310 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { 1293 for (i--; i >= 0; i--) {
1311 /* drop refcounts only on the ones we took */ 1294 /* drop refcounts only on the ones we took */
1312 unsigned long bit = 1UL << i; 1295 unsigned long bit = 1UL << i;
1313 1296
1314 if (!(bit & opts->subsys_bits)) 1297 if (!(bit & opts->subsys_mask))
1315 continue; 1298 continue;
1316 module_put(subsys[i]->module); 1299 module_put(subsys[i]->module);
1317 } 1300 }
@@ -1321,13 +1304,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1321 return 0; 1304 return 0;
1322} 1305}
1323 1306
1324static void drop_parsed_module_refcounts(unsigned long subsys_bits) 1307static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1325{ 1308{
1326 int i; 1309 int i;
1327 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1310 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1328 unsigned long bit = 1UL << i; 1311 unsigned long bit = 1UL << i;
1329 1312
1330 if (!(bit & subsys_bits)) 1313 if (!(bit & subsys_mask))
1331 continue; 1314 continue;
1332 module_put(subsys[i]->module); 1315 module_put(subsys[i]->module);
1333 } 1316 }
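
parse_cgroupfs_options() and drop_parsed_module_refcounts() implement a pin-then-unwind pattern over the bits of subsys_mask. A standalone sketch of that pattern, with a hypothetical pin()/unpin() pair standing in for try_module_get()/module_put():

        #include <stdbool.h>
        #include <stdio.h>

        #define NR_ITEMS 8

        static bool pin(int i)   { return i != 5; }     /* pretend item 5 is unavailable */
        static void unpin(int i) { printf("unpin %d\n", i); }

        static int pin_mask(unsigned long mask)
        {
                int i;

                for (i = 0; i < NR_ITEMS; i++) {
                        if (!(mask & (1UL << i)))
                                continue;
                        if (!pin(i))
                                goto unwind;
                }
                return 0;
        unwind:
                /* drop only the references taken so far, newest first */
                for (i--; i >= 0; i--)
                        if (mask & (1UL << i))
                                unpin(i);
                return -1;
        }

        int main(void)
        {
                return pin_mask((1UL << 1) | (1UL << 3) | (1UL << 5)) ? 1 : 0;
        }
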
@@ -1339,6 +1322,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1339 struct cgroupfs_root *root = sb->s_fs_info; 1322 struct cgroupfs_root *root = sb->s_fs_info;
1340 struct cgroup *cgrp = &root->top_cgroup; 1323 struct cgroup *cgrp = &root->top_cgroup;
1341 struct cgroup_sb_opts opts; 1324 struct cgroup_sb_opts opts;
1325 unsigned long added_mask, removed_mask;
1342 1326
1343 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1327 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1344 mutex_lock(&cgroup_mutex); 1328 mutex_lock(&cgroup_mutex);
@@ -1349,28 +1333,38 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1349 if (ret) 1333 if (ret)
1350 goto out_unlock; 1334 goto out_unlock;
1351 1335
1352	/* See feature-removal-schedule.txt */
1353	if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
1336	if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
1354 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1337 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1355 task_tgid_nr(current), current->comm); 1338 task_tgid_nr(current), current->comm);
1356 1339
1340 added_mask = opts.subsys_mask & ~root->subsys_mask;
1341 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1342
1357 /* Don't allow flags or name to change at remount */ 1343 /* Don't allow flags or name to change at remount */
1358 if (opts.flags != root->flags || 1344 if (opts.flags != root->flags ||
1359 (opts.name && strcmp(opts.name, root->name))) { 1345 (opts.name && strcmp(opts.name, root->name))) {
1360 ret = -EINVAL; 1346 ret = -EINVAL;
1361 drop_parsed_module_refcounts(opts.subsys_bits); 1347 drop_parsed_module_refcounts(opts.subsys_mask);
1362 goto out_unlock; 1348 goto out_unlock;
1363 } 1349 }
1364 1350
1365 ret = rebind_subsystems(root, opts.subsys_bits); 1351 /*
1352 * Clear out the files of subsystems that should be removed, do
1353 * this before rebind_subsystems, since rebind_subsystems may
1354 * change this hierarchy's subsys_list.
1355 */
1356 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1357
1358 ret = rebind_subsystems(root, opts.subsys_mask);
1366 if (ret) { 1359 if (ret) {
1367 drop_parsed_module_refcounts(opts.subsys_bits); 1360 /* rebind_subsystems failed, re-populate the removed files */
1361 cgroup_populate_dir(cgrp, false, removed_mask);
1362 drop_parsed_module_refcounts(opts.subsys_mask);
1368 goto out_unlock; 1363 goto out_unlock;
1369 } 1364 }
1370 1365
1371 /* clear out any existing files and repopulate subsystem files */ 1366 /* re-populate subsystem files */
1372 cgroup_clear_directory(cgrp->dentry); 1367 cgroup_populate_dir(cgrp, false, added_mask);
1373 cgroup_populate_dir(cgrp);
1374 1368
1375 if (opts.release_agent) 1369 if (opts.release_agent)
1376 strcpy(root->release_agent_path, opts.release_agent); 1370 strcpy(root->release_agent_path, opts.release_agent);
@@ -1396,11 +1390,13 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1396 INIT_LIST_HEAD(&cgrp->children); 1390 INIT_LIST_HEAD(&cgrp->children);
1397 INIT_LIST_HEAD(&cgrp->files); 1391 INIT_LIST_HEAD(&cgrp->files);
1398 INIT_LIST_HEAD(&cgrp->css_sets); 1392 INIT_LIST_HEAD(&cgrp->css_sets);
1393 INIT_LIST_HEAD(&cgrp->allcg_node);
1399 INIT_LIST_HEAD(&cgrp->release_list); 1394 INIT_LIST_HEAD(&cgrp->release_list);
1400 INIT_LIST_HEAD(&cgrp->pidlists); 1395 INIT_LIST_HEAD(&cgrp->pidlists);
1401 mutex_init(&cgrp->pidlist_mutex); 1396 mutex_init(&cgrp->pidlist_mutex);
1402 INIT_LIST_HEAD(&cgrp->event_list); 1397 INIT_LIST_HEAD(&cgrp->event_list);
1403 spin_lock_init(&cgrp->event_list_lock); 1398 spin_lock_init(&cgrp->event_list_lock);
1399 simple_xattrs_init(&cgrp->xattrs);
1404} 1400}
1405 1401
1406static void init_cgroup_root(struct cgroupfs_root *root) 1402static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1413,8 +1409,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1413 root->number_of_cgroups = 1; 1409 root->number_of_cgroups = 1;
1414 cgrp->root = root; 1410 cgrp->root = root;
1415 cgrp->top_cgroup = cgrp; 1411 cgrp->top_cgroup = cgrp;
1416 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1417 init_cgroup_housekeeping(cgrp); 1412 init_cgroup_housekeeping(cgrp);
1413 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1418} 1414}
1419 1415
1420static bool init_root_id(struct cgroupfs_root *root) 1416static bool init_root_id(struct cgroupfs_root *root)
@@ -1455,8 +1451,8 @@ static int cgroup_test_super(struct super_block *sb, void *data)
1455 * If we asked for subsystems (or explicitly for no 1451 * If we asked for subsystems (or explicitly for no
1456 * subsystems) then they must match 1452 * subsystems) then they must match
1457 */ 1453 */
1458 if ((opts->subsys_bits || opts->none) 1454 if ((opts->subsys_mask || opts->none)
1459 && (opts->subsys_bits != root->subsys_bits)) 1455 && (opts->subsys_mask != root->subsys_mask))
1460 return 0; 1456 return 0;
1461 1457
1462 return 1; 1458 return 1;
@@ -1466,7 +1462,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1466{ 1462{
1467 struct cgroupfs_root *root; 1463 struct cgroupfs_root *root;
1468 1464
1469 if (!opts->subsys_bits && !opts->none) 1465 if (!opts->subsys_mask && !opts->none)
1470 return NULL; 1466 return NULL;
1471 1467
1472 root = kzalloc(sizeof(*root), GFP_KERNEL); 1468 root = kzalloc(sizeof(*root), GFP_KERNEL);
@@ -1479,14 +1475,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1479 } 1475 }
1480 init_cgroup_root(root); 1476 init_cgroup_root(root);
1481 1477
1482 root->subsys_bits = opts->subsys_bits; 1478 root->subsys_mask = opts->subsys_mask;
1483 root->flags = opts->flags; 1479 root->flags = opts->flags;
1480 ida_init(&root->cgroup_ida);
1484 if (opts->release_agent) 1481 if (opts->release_agent)
1485 strcpy(root->release_agent_path, opts->release_agent); 1482 strcpy(root->release_agent_path, opts->release_agent);
1486 if (opts->name) 1483 if (opts->name)
1487 strcpy(root->name, opts->name); 1484 strcpy(root->name, opts->name);
1488 if (opts->clone_children) 1485 if (opts->cpuset_clone_children)
1489 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); 1486 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
1490 return root; 1487 return root;
1491} 1488}
1492 1489
@@ -1499,6 +1496,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
1499 spin_lock(&hierarchy_id_lock); 1496 spin_lock(&hierarchy_id_lock);
1500 ida_remove(&hierarchy_ida, root->hierarchy_id); 1497 ida_remove(&hierarchy_ida, root->hierarchy_id);
1501 spin_unlock(&hierarchy_id_lock); 1498 spin_unlock(&hierarchy_id_lock);
1499 ida_destroy(&root->cgroup_ida);
1502 kfree(root); 1500 kfree(root);
1503} 1501}
1504 1502
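
The root now carries a cgroup_ida that is set up in cgroup_root_from_opts() and torn down here. A minimal, hypothetical kernel-module sketch of that IDA lifecycle (ida_init / ida_simple_get / ida_simple_remove / ida_destroy):

        #include <linux/module.h>
        #include <linux/idr.h>

        static struct ida demo_ida;
        static int demo_id;

        static int __init ida_demo_init(void)
        {
                ida_init(&demo_ida);
                /* 0 is reserved (the root cgroup), so start at 1 like cgroup_create() */
                demo_id = ida_simple_get(&demo_ida, 1, 0, GFP_KERNEL);
                if (demo_id < 0)
                        return demo_id;
                pr_info("ida_demo: got id %d\n", demo_id);
                return 0;
        }

        static void __exit ida_demo_exit(void)
        {
                ida_simple_remove(&demo_ida, demo_id);
                ida_destroy(&demo_ida);
        }

        module_init(ida_demo_init);
        module_exit(ida_demo_exit);
        MODULE_LICENSE("GPL");
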
@@ -1511,7 +1509,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1511 if (!opts->new_root) 1509 if (!opts->new_root)
1512 return -EINVAL; 1510 return -EINVAL;
1513 1511
1514 BUG_ON(!opts->subsys_bits && !opts->none); 1512 BUG_ON(!opts->subsys_mask && !opts->none);
1515 1513
1516 ret = set_anon_super(sb, NULL); 1514 ret = set_anon_super(sb, NULL);
1517 if (ret) 1515 if (ret)
@@ -1629,7 +1627,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1629 if (ret) 1627 if (ret)
1630 goto unlock_drop; 1628 goto unlock_drop;
1631 1629
1632 ret = rebind_subsystems(root, root->subsys_bits); 1630 ret = rebind_subsystems(root, root->subsys_mask);
1633 if (ret == -EBUSY) { 1631 if (ret == -EBUSY) {
1634 free_cg_links(&tmp_cg_links); 1632 free_cg_links(&tmp_cg_links);
1635 goto unlock_drop; 1633 goto unlock_drop;
@@ -1664,12 +1662,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1664 1662
1665 free_cg_links(&tmp_cg_links); 1663 free_cg_links(&tmp_cg_links);
1666 1664
1667 BUG_ON(!list_empty(&root_cgrp->sibling));
1668 BUG_ON(!list_empty(&root_cgrp->children)); 1665 BUG_ON(!list_empty(&root_cgrp->children));
1669 BUG_ON(root->number_of_cgroups != 1); 1666 BUG_ON(root->number_of_cgroups != 1);
1670 1667
1671 cred = override_creds(&init_cred); 1668 cred = override_creds(&init_cred);
1672 cgroup_populate_dir(root_cgrp); 1669 cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
1673 revert_creds(cred); 1670 revert_creds(cred);
1674 mutex_unlock(&cgroup_root_mutex); 1671 mutex_unlock(&cgroup_root_mutex);
1675 mutex_unlock(&cgroup_mutex); 1672 mutex_unlock(&cgroup_mutex);
@@ -1681,7 +1678,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1681 */ 1678 */
1682 cgroup_drop_root(opts.new_root); 1679 cgroup_drop_root(opts.new_root);
1683 /* no subsys rebinding, so refcounts don't change */ 1680 /* no subsys rebinding, so refcounts don't change */
1684 drop_parsed_module_refcounts(opts.subsys_bits); 1681 drop_parsed_module_refcounts(opts.subsys_mask);
1685 } 1682 }
1686 1683
1687 kfree(opts.release_agent); 1684 kfree(opts.release_agent);
@@ -1695,7 +1692,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1695 drop_new_super: 1692 drop_new_super:
1696 deactivate_locked_super(sb); 1693 deactivate_locked_super(sb);
1697 drop_modules: 1694 drop_modules:
1698 drop_parsed_module_refcounts(opts.subsys_bits); 1695 drop_parsed_module_refcounts(opts.subsys_mask);
1699 out_err: 1696 out_err:
1700 kfree(opts.release_agent); 1697 kfree(opts.release_agent);
1701 kfree(opts.name); 1698 kfree(opts.name);
@@ -1713,7 +1710,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
1713 1710
1714 BUG_ON(root->number_of_cgroups != 1); 1711 BUG_ON(root->number_of_cgroups != 1);
1715 BUG_ON(!list_empty(&cgrp->children)); 1712 BUG_ON(!list_empty(&cgrp->children));
1716 BUG_ON(!list_empty(&cgrp->sibling));
1717 1713
1718 mutex_lock(&cgroup_mutex); 1714 mutex_lock(&cgroup_mutex);
1719 mutex_lock(&cgroup_root_mutex); 1715 mutex_lock(&cgroup_root_mutex);
@@ -1745,6 +1741,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
1745 mutex_unlock(&cgroup_root_mutex); 1741 mutex_unlock(&cgroup_root_mutex);
1746 mutex_unlock(&cgroup_mutex); 1742 mutex_unlock(&cgroup_mutex);
1747 1743
1744 simple_xattrs_free(&cgrp->xattrs);
1745
1748 kill_litter_super(sb); 1746 kill_litter_super(sb);
1749 cgroup_drop_root(root); 1747 cgroup_drop_root(root);
1750} 1748}
@@ -1769,9 +1767,11 @@ static struct kobject *cgroup_kobj;
1769 */ 1767 */
1770int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1768int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1771{ 1769{
1770 struct dentry *dentry = cgrp->dentry;
1772 char *start; 1771 char *start;
1773	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1774					      cgroup_lock_is_held());
1772
1773	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
1774			   "cgroup_path() called without proper locking");
1775 1775
1776 if (!dentry || cgrp == dummytop) { 1776 if (!dentry || cgrp == dummytop) {
1777 /* 1777 /*
@@ -1782,9 +1782,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1782 return 0; 1782 return 0;
1783 } 1783 }
1784 1784
1785	start = buf + buflen;
1785	start = buf + buflen - 1;
1786 1786
1787	*--start = '\0';
1787	*start = '\0';
1788 for (;;) { 1788 for (;;) {
1789 int len = dentry->d_name.len; 1789 int len = dentry->d_name.len;
1790 1790
@@ -1795,8 +1795,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1795 if (!cgrp) 1795 if (!cgrp)
1796 break; 1796 break;
1797 1797
1798 dentry = rcu_dereference_check(cgrp->dentry, 1798 dentry = cgrp->dentry;
1799 cgroup_lock_is_held());
1800 if (!cgrp->parent) 1799 if (!cgrp->parent)
1801 continue; 1800 continue;
1802 if (--start < buf) 1801 if (--start < buf)
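
cgroup_path() fills the buffer from the end, prepending one component per iteration; the hunk above also reserves the final byte for the terminating NUL up front instead of pre-decrementing. A userspace sketch of the same technique (component names and buffer size are arbitrary):

        #include <stdio.h>
        #include <string.h>

        static int join_reverse(char *buf, int buflen, const char *const names[], int n)
        {
                char *start = buf + buflen - 1;  /* reserve room for '\0' */
                int i;

                *start = '\0';
                for (i = n - 1; i >= 0; i--) {
                        int len = strlen(names[i]);

                        if (start - buf < len + 1)
                                return -1;       /* would overflow: -ENAMETOOLONG in the kernel */
                        start -= len;
                        memcpy(start, names[i], len);
                        *--start = '/';
                }
                memmove(buf, start, buf + buflen - start);
                return 0;
        }

        int main(void)
        {
                const char *names[] = { "parent", "child" };
                char buf[32];

                if (!join_reverse(buf, sizeof(buf), names, 2))
                        printf("%s\n", buf);     /* prints /parent/child */
                return 0;
        }
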
@@ -1891,9 +1890,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1891/* 1890/*
1892 * cgroup_task_migrate - move a task from one cgroup to another. 1891 * cgroup_task_migrate - move a task from one cgroup to another.
1893 * 1892 *
1894 * 'guarantee' is set if the caller promises that a new css_set for the task
1895 * will already exist. If not set, this function might sleep, and can fail with
1896 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1893 * Must be called with cgroup_mutex and threadgroup locked.
1897 */ 1894 */
1898static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1895static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1899 struct task_struct *tsk, struct css_set *newcg) 1896 struct task_struct *tsk, struct css_set *newcg)
@@ -1923,9 +1920,8 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1923 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1920 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1924 * it here; it will be freed under RCU. 1921 * it here; it will be freed under RCU.
1925 */ 1922 */
1926 put_css_set(oldcg);
1927
1928 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1923 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1924 put_css_set(oldcg);
1929} 1925}
1930 1926
1931/** 1927/**
@@ -1987,12 +1983,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1987 } 1983 }
1988 1984
1989 synchronize_rcu(); 1985 synchronize_rcu();
1990
1991 /*
1992 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1993 * is no longer empty.
1994 */
1995 cgroup_wakeup_rmdir_waiter(cgrp);
1996out: 1986out:
1997 if (retval) { 1987 if (retval) {
1998 for_each_subsys(root, ss) { 1988 for_each_subsys(root, ss) {
@@ -2162,7 +2152,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2162 * step 5: success! and cleanup 2152 * step 5: success! and cleanup
2163 */ 2153 */
2164 synchronize_rcu(); 2154 synchronize_rcu();
2165 cgroup_wakeup_rmdir_waiter(cgrp);
2166 retval = 0; 2155 retval = 0;
2167out_put_css_set_refs: 2156out_put_css_set_refs:
2168 if (retval) { 2157 if (retval) {
@@ -2551,6 +2540,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2551 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2540 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2552} 2541}
2553 2542
2543static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2544{
2545 if (S_ISDIR(dentry->d_inode->i_mode))
2546 return &__d_cgrp(dentry)->xattrs;
2547 else
2548 return &__d_cft(dentry)->xattrs;
2549}
2550
2551static inline int xattr_enabled(struct dentry *dentry)
2552{
2553 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2554 return test_bit(ROOT_XATTR, &root->flags);
2555}
2556
2557static bool is_valid_xattr(const char *name)
2558{
2559 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2560 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2561 return true;
2562 return false;
2563}
2564
2565static int cgroup_setxattr(struct dentry *dentry, const char *name,
2566 const void *val, size_t size, int flags)
2567{
2568 if (!xattr_enabled(dentry))
2569 return -EOPNOTSUPP;
2570 if (!is_valid_xattr(name))
2571 return -EINVAL;
2572 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2573}
2574
2575static int cgroup_removexattr(struct dentry *dentry, const char *name)
2576{
2577 if (!xattr_enabled(dentry))
2578 return -EOPNOTSUPP;
2579 if (!is_valid_xattr(name))
2580 return -EINVAL;
2581 return simple_xattr_remove(__d_xattrs(dentry), name);
2582}
2583
2584static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2585 void *buf, size_t size)
2586{
2587 if (!xattr_enabled(dentry))
2588 return -EOPNOTSUPP;
2589 if (!is_valid_xattr(name))
2590 return -EINVAL;
2591 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2592}
2593
2594static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2595{
2596 if (!xattr_enabled(dentry))
2597 return -EOPNOTSUPP;
2598 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2599}
2600
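
With the new handlers wired up and the hierarchy mounted with the xattr option, trusted.* and security.* attributes can be stored on cgroup files and directories. A userspace sketch; the mount point, group name and attribute are examples only, and trusted.* access needs CAP_SYS_ADMIN:

        #include <stdio.h>
        #include <string.h>
        #include <sys/types.h>
        #include <sys/xattr.h>

        int main(void)
        {
                const char *path = "/sys/fs/cgroup/memory/mygroup"; /* example path */
                const char *val = "backup-tier";
                char buf[64];
                ssize_t n;

                if (setxattr(path, "trusted.owner", val, strlen(val), 0))
                        perror("setxattr");

                n = getxattr(path, "trusted.owner", buf, sizeof(buf) - 1);
                if (n >= 0) {
                        buf[n] = '\0';
                        printf("trusted.owner = %s\n", buf);
                }
                return 0;
        }
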
2554static const struct file_operations cgroup_file_operations = { 2601static const struct file_operations cgroup_file_operations = {
2555 .read = cgroup_file_read, 2602 .read = cgroup_file_read,
2556 .write = cgroup_file_write, 2603 .write = cgroup_file_write,
@@ -2559,11 +2606,22 @@ static const struct file_operations cgroup_file_operations = {
2559 .release = cgroup_file_release, 2606 .release = cgroup_file_release,
2560}; 2607};
2561 2608
2609static const struct inode_operations cgroup_file_inode_operations = {
2610 .setxattr = cgroup_setxattr,
2611 .getxattr = cgroup_getxattr,
2612 .listxattr = cgroup_listxattr,
2613 .removexattr = cgroup_removexattr,
2614};
2615
2562static const struct inode_operations cgroup_dir_inode_operations = { 2616static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = cgroup_lookup, 2617 .lookup = cgroup_lookup,
2564 .mkdir = cgroup_mkdir, 2618 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir, 2619 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename, 2620 .rename = cgroup_rename,
2621 .setxattr = cgroup_setxattr,
2622 .getxattr = cgroup_getxattr,
2623 .listxattr = cgroup_listxattr,
2624 .removexattr = cgroup_removexattr,
2567}; 2625};
2568 2626
2569static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2627static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -2604,45 +2662,27 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2604 2662
2605 /* start off with i_nlink == 2 (for "." entry) */ 2663 /* start off with i_nlink == 2 (for "." entry) */
2606 inc_nlink(inode); 2664 inc_nlink(inode);
2665 inc_nlink(dentry->d_parent->d_inode);
2607 2666
2608 /* start with the directory inode held, so that we can 2667 /*
2609 * populate it without racing with another mkdir */ 2668 * Control reaches here with cgroup_mutex held.
2610 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2669 * @inode->i_mutex should nest outside cgroup_mutex but we
2670 * want to populate it immediately without releasing
2671 * cgroup_mutex. As @inode isn't visible to anyone else
2672 * yet, trylock will always succeed without affecting
2673 * lockdep checks.
2674 */
2675 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2611 } else if (S_ISREG(mode)) { 2676 } else if (S_ISREG(mode)) {
2612 inode->i_size = 0; 2677 inode->i_size = 0;
2613 inode->i_fop = &cgroup_file_operations; 2678 inode->i_fop = &cgroup_file_operations;
2679 inode->i_op = &cgroup_file_inode_operations;
2614 } 2680 }
2615 d_instantiate(dentry, inode); 2681 d_instantiate(dentry, inode);
2616 dget(dentry); /* Extra count - pin the dentry in core */ 2682 dget(dentry); /* Extra count - pin the dentry in core */
2617 return 0; 2683 return 0;
2618} 2684}
2619 2685
2620/*
2621 * cgroup_create_dir - create a directory for an object.
2622 * @cgrp: the cgroup we create the directory for. It must have a valid
2623 * ->parent field. And we are going to fill its ->dentry field.
2624 * @dentry: dentry of the new cgroup
2625 * @mode: mode to set on new directory.
2626 */
2627static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2628 umode_t mode)
2629{
2630 struct dentry *parent;
2631 int error = 0;
2632
2633 parent = cgrp->parent->dentry;
2634 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2635 if (!error) {
2636 dentry->d_fsdata = cgrp;
2637 inc_nlink(parent->d_inode);
2638 rcu_assign_pointer(cgrp->dentry, dentry);
2639 dget(dentry);
2640 }
2641 dput(dentry);
2642
2643 return error;
2644}
2645
2646/** 2686/**
2647 * cgroup_file_mode - deduce file mode of a control file 2687 * cgroup_file_mode - deduce file mode of a control file
2648 * @cft: the control file in question 2688 * @cft: the control file in question
@@ -2671,7 +2711,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2671} 2711}
2672 2712
2673static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2713static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2674 const struct cftype *cft) 2714 struct cftype *cft)
2675{ 2715{
2676 struct dentry *dir = cgrp->dentry; 2716 struct dentry *dir = cgrp->dentry;
2677 struct cgroup *parent = __d_cgrp(dir); 2717 struct cgroup *parent = __d_cgrp(dir);
@@ -2681,11 +2721,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2681 umode_t mode; 2721 umode_t mode;
2682 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2722 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2683 2723
2684	/* does @cft->flags tell us to skip creation on @cgrp? */
2685	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2686		return 0;
2687	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2688		return 0;
2724	simple_xattrs_init(&cft->xattrs);
2689 2725
2690 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2726 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2691 strcpy(name, subsys->name); 2727 strcpy(name, subsys->name);
@@ -2721,12 +2757,18 @@ out:
2721} 2757}
2722 2758
2723static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2759static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2724 const struct cftype cfts[], bool is_add) 2760 struct cftype cfts[], bool is_add)
2725{ 2761{
2726 const struct cftype *cft; 2762 struct cftype *cft;
2727 int err, ret = 0; 2763 int err, ret = 0;
2728 2764
2729 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2765 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2766 /* does cft->flags tell us to skip this file on @cgrp? */
2767 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2768 continue;
2769 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2770 continue;
2771
2730 if (is_add) 2772 if (is_add)
2731 err = cgroup_add_file(cgrp, subsys, cft); 2773 err = cgroup_add_file(cgrp, subsys, cft);
2732 else 2774 else
@@ -2757,7 +2799,7 @@ static void cgroup_cfts_prepare(void)
2757} 2799}
2758 2800
2759static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2801static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2760 const struct cftype *cfts, bool is_add) 2802 struct cftype *cfts, bool is_add)
2761 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) 2803 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2762{ 2804{
2763 LIST_HEAD(pending); 2805 LIST_HEAD(pending);
@@ -2808,7 +2850,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2808 * function currently returns 0 as long as @cfts registration is successful 2850 * function currently returns 0 as long as @cfts registration is successful
2809 * even if some file creation attempts on existing cgroups fail. 2851 * even if some file creation attempts on existing cgroups fail.
2810 */ 2852 */
2811int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) 2853int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2812{ 2854{
2813 struct cftype_set *set; 2855 struct cftype_set *set;
2814 2856
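
For reference, a hedged sketch of how a controller registers extra files through this interface after the constness change; the subsystem (foo_subsys), file name and handler below are hypothetical:

        static u64 foo_weight_read(struct cgroup *cgrp, struct cftype *cft)
        {
                return 10;      /* placeholder value */
        }

        static struct cftype foo_files[] = {
                {
                        .name = "weight",
                        .flags = CFTYPE_NOT_ON_ROOT,    /* skip the root cgroup */
                        .read_u64 = foo_weight_read,
                },
                { }     /* terminate */
        };

        static int __init foo_cgroup_files_init(void)
        {
                /* the cfts array must stay valid for as long as it is registered */
                return cgroup_add_cftypes(&foo_subsys, foo_files);
        }
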
@@ -2838,7 +2880,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2838 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2880 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2839 * registered with @ss. 2881 * registered with @ss.
2840 */ 2882 */
2841int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) 2883int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2842{ 2884{
2843 struct cftype_set *set; 2885 struct cftype_set *set;
2844 2886
@@ -2934,6 +2976,92 @@ static void cgroup_enable_task_cg_lists(void)
2934 write_unlock(&css_set_lock); 2976 write_unlock(&css_set_lock);
2935} 2977}
2936 2978
2979/**
2980 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2981 * @pos: the current position (%NULL to initiate traversal)
2982 * @cgroup: cgroup whose descendants to walk
2983 *
2984 * To be used by cgroup_for_each_descendant_pre(). Find the next
2985 * descendant to visit for pre-order traversal of @cgroup's descendants.
2986 */
2987struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2988 struct cgroup *cgroup)
2989{
2990 struct cgroup *next;
2991
2992 WARN_ON_ONCE(!rcu_read_lock_held());
2993
2994 /* if first iteration, pretend we just visited @cgroup */
2995 if (!pos) {
2996 if (list_empty(&cgroup->children))
2997 return NULL;
2998 pos = cgroup;
2999 }
3000
3001 /* visit the first child if exists */
3002 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
3003 if (next)
3004 return next;
3005
3006 /* no child, visit my or the closest ancestor's next sibling */
3007 do {
3008 next = list_entry_rcu(pos->sibling.next, struct cgroup,
3009 sibling);
3010 if (&next->sibling != &pos->parent->children)
3011 return next;
3012
3013 pos = pos->parent;
3014 } while (pos != cgroup);
3015
3016 return NULL;
3017}
3018EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3019
3020static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3021{
3022 struct cgroup *last;
3023
3024 do {
3025 last = pos;
3026 pos = list_first_or_null_rcu(&pos->children, struct cgroup,
3027 sibling);
3028 } while (pos);
3029
3030 return last;
3031}
3032
3033/**
3034 * cgroup_next_descendant_post - find the next descendant for post-order walk
3035 * @pos: the current position (%NULL to initiate traversal)
3036 * @cgroup: cgroup whose descendants to walk
3037 *
3038 * To be used by cgroup_for_each_descendant_post(). Find the next
3039 * descendant to visit for post-order traversal of @cgroup's descendants.
3040 */
3041struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3042 struct cgroup *cgroup)
3043{
3044 struct cgroup *next;
3045
3046 WARN_ON_ONCE(!rcu_read_lock_held());
3047
3048 /* if first iteration, visit the leftmost descendant */
3049 if (!pos) {
3050 next = cgroup_leftmost_descendant(cgroup);
3051 return next != cgroup ? next : NULL;
3052 }
3053
3054 /* if there's an unvisited sibling, visit its leftmost descendant */
3055 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3056 if (&next->sibling != &pos->parent->children)
3057 return cgroup_leftmost_descendant(next);
3058
3059 /* no sibling left, visit parent */
3060 next = pos->parent;
3061 return next != cgroup ? next : NULL;
3062}
3063EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3064
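
The exported helpers above back the cgroup_for_each_descendant_pre()/_post() iterators. A hedged kernel-side sketch of a caller; per the WARN_ON_ONCE() checks, the only locking requirement is rcu_read_lock():

        static int count_descendants(struct cgroup *root_cgrp)
        {
                struct cgroup *pos;
                int n = 0;

                rcu_read_lock();
                cgroup_for_each_descendant_pre(pos, root_cgrp)
                        n++;    /* parents are always visited before their children */
                rcu_read_unlock();

                return n;
        }
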
2937void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3065void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2938 __acquires(css_set_lock) 3066 __acquires(css_set_lock)
2939{ 3067{
@@ -3280,7 +3408,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3280{ 3408{
3281 struct cgroup_pidlist *l; 3409 struct cgroup_pidlist *l;
3282 /* don't need task_nsproxy() if we're looking at ourself */ 3410 /* don't need task_nsproxy() if we're looking at ourself */
3283 struct pid_namespace *ns = current->nsproxy->pid_ns; 3411 struct pid_namespace *ns = task_active_pid_ns(current);
3284 3412
3285 /* 3413 /*
3286 * We can't drop the pidlist_mutex before taking the l->mutex in case 3414 * We can't drop the pidlist_mutex before taking the l->mutex in case
@@ -3647,7 +3775,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3647 if (flags & POLLHUP) { 3775 if (flags & POLLHUP) {
3648 __remove_wait_queue(event->wqh, &event->wait); 3776 __remove_wait_queue(event->wqh, &event->wait);
3649 spin_lock(&cgrp->event_list_lock); 3777 spin_lock(&cgrp->event_list_lock);
3650 list_del(&event->list); 3778 list_del_init(&event->list);
3651 spin_unlock(&cgrp->event_list_lock); 3779 spin_unlock(&cgrp->event_list_lock);
3652 /* 3780 /*
3653 * We are in atomic context, but cgroup_event_remove() may 3781 * We are in atomic context, but cgroup_event_remove() may
@@ -3784,7 +3912,7 @@ fail:
3784static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3912static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3785 struct cftype *cft) 3913 struct cftype *cft)
3786{ 3914{
3787 return clone_children(cgrp); 3915 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3788} 3916}
3789 3917
3790static int cgroup_clone_children_write(struct cgroup *cgrp, 3918static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3792,9 +3920,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3792 u64 val) 3920 u64 val)
3793{ 3921{
3794 if (val) 3922 if (val)
3795 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3923 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3796 else 3924 else
3797 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3925 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3798 return 0; 3926 return 0;
3799} 3927}
3800 3928
@@ -3843,18 +3971,29 @@ static struct cftype files[] = {
3843 { } /* terminate */ 3971 { } /* terminate */
3844}; 3972};
3845 3973
3846static int cgroup_populate_dir(struct cgroup *cgrp)
3974/**
3975 * cgroup_populate_dir - selective creation of files in a directory
3976 * @cgrp: target cgroup
3977 * @base_files: true if the base files should be added
3978 * @subsys_mask: mask of the subsystem ids whose files should be added
3979 */
3980static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
3981 unsigned long subsys_mask)
3847{ 3982{
3848 int err; 3983 int err;
3849 struct cgroup_subsys *ss; 3984 struct cgroup_subsys *ss;
3850 3985
3851	err = cgroup_addrm_files(cgrp, NULL, files, true);
3852	if (err < 0)
3853		return err;
3986	if (base_files) {
3987		err = cgroup_addrm_files(cgrp, NULL, files, true);
3988		if (err < 0)
3989 return err;
3990 }
3854 3991
3855 /* process cftsets of each subsystem */ 3992 /* process cftsets of each subsystem */
3856 for_each_subsys(cgrp->root, ss) { 3993 for_each_subsys(cgrp->root, ss) {
3857 struct cftype_set *set; 3994 struct cftype_set *set;
3995 if (!test_bit(ss->subsys_id, &subsys_mask))
3996 continue;
3858 3997
3859 list_for_each_entry(set, &ss->cftsets, node) 3998 list_for_each_entry(set, &ss->cftsets, node)
3860 cgroup_addrm_files(cgrp, ss, set->cfts, true); 3999 cgroup_addrm_files(cgrp, ss, set->cfts, true);
@@ -3896,19 +4035,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
3896 css->flags = 0; 4035 css->flags = 0;
3897 css->id = NULL; 4036 css->id = NULL;
3898 if (cgrp == dummytop) 4037 if (cgrp == dummytop)
3899 set_bit(CSS_ROOT, &css->flags); 4038 css->flags |= CSS_ROOT;
3900 BUG_ON(cgrp->subsys[ss->subsys_id]); 4039 BUG_ON(cgrp->subsys[ss->subsys_id]);
3901 cgrp->subsys[ss->subsys_id] = css; 4040 cgrp->subsys[ss->subsys_id] = css;
3902 4041
3903 /* 4042 /*
3904 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry 4043 * css holds an extra ref to @cgrp->dentry which is put on the last
3905 * which is put on the last css_put(). dput() requires process 4044 * css_put(). dput() requires process context, which css_put() may
3906 * context, which css_put() may be called without. @css->dput_work 4045 * be called without. @css->dput_work will be used to invoke
3907 * will be used to invoke dput() asynchronously from css_put(). 4046 * dput() asynchronously from css_put().
3908 */ 4047 */
3909 INIT_WORK(&css->dput_work, css_dput_fn); 4048 INIT_WORK(&css->dput_work, css_dput_fn);
3910 if (ss->__DEPRECATED_clear_css_refs) 4049}
3911 set_bit(CSS_CLEAR_CSS_REFS, &css->flags); 4050
4051/* invoke ->css_online() on a new CSS and mark it online if successful */
4052static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4053{
4054 int ret = 0;
4055
4056 lockdep_assert_held(&cgroup_mutex);
4057
4058 if (ss->css_online)
4059 ret = ss->css_online(cgrp);
4060 if (!ret)
4061 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
4062 return ret;
4063}
4064
4065/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4066static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4067 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4068{
4069 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4070
4071 lockdep_assert_held(&cgroup_mutex);
4072
4073 if (!(css->flags & CSS_ONLINE))
4074 return;
4075
4076 /*
4077 * css_offline() should be called with cgroup_mutex unlocked. See
4078 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4079 * details. This temporary unlocking should go away once
4080 * cgroup_mutex is unexported from controllers.
4081 */
4082 if (ss->css_offline) {
4083 mutex_unlock(&cgroup_mutex);
4084 ss->css_offline(cgrp);
4085 mutex_lock(&cgroup_mutex);
4086 }
4087
4088 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
3912} 4089}
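
Taken together with the create/destroy renames later in the patch, a controller now fills in four lifecycle hooks. A hedged sketch; the "foo" subsystem, its state struct and foo_subsys_id are hypothetical:

        struct foo_state {
                struct cgroup_subsys_state css;
        };

        static struct cgroup_subsys_state *foo_css_alloc(struct cgroup *cgrp)
        {
                struct foo_state *fs = kzalloc(sizeof(*fs), GFP_KERNEL);

                return fs ? &fs->css : ERR_PTR(-ENOMEM);
        }

        static int foo_css_online(struct cgroup *cgrp)
        {
                return 0;       /* may fail; a failure unwinds the whole mkdir */
        }

        static void foo_css_offline(struct cgroup *cgrp)
        {
                /* start shutting down; the css may still hold references */
        }

        static void foo_css_free(struct cgroup *cgrp)
        {
                kfree(container_of(cgrp->subsys[foo_subsys_id],
                                   struct foo_state, css));
        }

        struct cgroup_subsys foo_subsys = {
                .name           = "foo",
                .subsys_id      = foo_subsys_id,
                .css_alloc      = foo_css_alloc,
                .css_online     = foo_css_online,
                .css_offline    = foo_css_offline,
                .css_free       = foo_css_free,
        };
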
3913 4090
3914/* 4091/*
@@ -3928,10 +4105,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3928 struct cgroup_subsys *ss; 4105 struct cgroup_subsys *ss;
3929 struct super_block *sb = root->sb; 4106 struct super_block *sb = root->sb;
3930 4107
4108 /* allocate the cgroup and its ID, 0 is reserved for the root */
3931 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4109 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3932 if (!cgrp) 4110 if (!cgrp)
3933 return -ENOMEM; 4111 return -ENOMEM;
3934 4112
4113 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4114 if (cgrp->id < 0)
4115 goto err_free_cgrp;
4116
4117 /*
4118 * Only live parents can have children. Note that the liveliness
4119 * check isn't strictly necessary because cgroup_mkdir() and
4120 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4121 * anyway so that locking is contained inside cgroup proper and we
4122 * don't get nasty surprises if we ever grow another caller.
4123 */
4124 if (!cgroup_lock_live_group(parent)) {
4125 err = -ENODEV;
4126 goto err_free_id;
4127 }
4128
3935 /* Grab a reference on the superblock so the hierarchy doesn't 4129 /* Grab a reference on the superblock so the hierarchy doesn't
3936 * get deleted on unmount if there are child cgroups. This 4130 * get deleted on unmount if there are child cgroups. This
3937 * can be done outside cgroup_mutex, since the sb can't 4131 * can be done outside cgroup_mutex, since the sb can't
@@ -3939,8 +4133,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3939 * fs */ 4133 * fs */
3940 atomic_inc(&sb->s_active); 4134 atomic_inc(&sb->s_active);
3941 4135
3942 mutex_lock(&cgroup_mutex);
3943
3944 init_cgroup_housekeeping(cgrp); 4136 init_cgroup_housekeeping(cgrp);
3945 4137
3946 cgrp->parent = parent; 4138 cgrp->parent = parent;
@@ -3950,71 +4142,90 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3950 if (notify_on_release(parent)) 4142 if (notify_on_release(parent))
3951 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4143 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3952 4144
3953 if (clone_children(parent)) 4145 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
3954 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 4146 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3955 4147
3956 for_each_subsys(root, ss) { 4148 for_each_subsys(root, ss) {
3957 struct cgroup_subsys_state *css = ss->create(cgrp); 4149 struct cgroup_subsys_state *css;
3958 4150
4151 css = ss->css_alloc(cgrp);
3959 if (IS_ERR(css)) { 4152 if (IS_ERR(css)) {
3960 err = PTR_ERR(css); 4153 err = PTR_ERR(css);
3961 goto err_destroy; 4154 goto err_free_all;
3962 } 4155 }
3963 init_cgroup_css(css, ss, cgrp); 4156 init_cgroup_css(css, ss, cgrp);
3964 if (ss->use_id) { 4157 if (ss->use_id) {
3965 err = alloc_css_id(ss, parent, cgrp); 4158 err = alloc_css_id(ss, parent, cgrp);
3966 if (err) 4159 if (err)
3967 goto err_destroy; 4160 goto err_free_all;
3968 } 4161 }
3969 /* At error, ->destroy() callback has to free assigned ID. */
3970 if (clone_children(parent) && ss->post_clone)
3971 ss->post_clone(cgrp);
3972 } 4162 }
3973 4163
3974 list_add(&cgrp->sibling, &cgrp->parent->children); 4164 /*
3975 root->number_of_cgroups++; 4165 * Create directory. cgroup_create_file() returns with the new
3976 4166 * directory locked on success so that it can be populated without
3977 err = cgroup_create_dir(cgrp, dentry, mode); 4167 * dropping cgroup_mutex.
4168 */
4169 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
3978 if (err < 0) 4170 if (err < 0)
3979 goto err_remove; 4171 goto err_free_all;
4172 lockdep_assert_held(&dentry->d_inode->i_mutex);
4173
4174 /* allocation complete, commit to creation */
4175 dentry->d_fsdata = cgrp;
4176 cgrp->dentry = dentry;
4177 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4178 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4179 root->number_of_cgroups++;
3980 4180
3981 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ 4181 /* each css holds a ref to the cgroup's dentry */
3982 for_each_subsys(root, ss) 4182 for_each_subsys(root, ss)
3983 if (!ss->__DEPRECATED_clear_css_refs) 4183 dget(dentry);
3984 dget(dentry);
3985 4184
3986 /* The cgroup directory was pre-locked for us */ 4185 /* creation succeeded, notify subsystems */
3987 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 4186 for_each_subsys(root, ss) {
4187 err = online_css(ss, cgrp);
4188 if (err)
4189 goto err_destroy;
3988 4190
3989 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 4191 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4192 parent->parent) {
4193 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4194 current->comm, current->pid, ss->name);
4195 if (!strcmp(ss->name, "memory"))
4196 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4197 ss->warned_broken_hierarchy = true;
4198 }
4199 }
3990 4200
3991 err = cgroup_populate_dir(cgrp); 4201 err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
3992 /* If err < 0, we have a half-filled directory - oh well ;) */ 4202 if (err)
4203 goto err_destroy;
3993 4204
3994 mutex_unlock(&cgroup_mutex); 4205 mutex_unlock(&cgroup_mutex);
3995 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4206 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
3996 4207
3997 return 0; 4208 return 0;
3998 4209
3999 err_remove:
4210err_free_all:
4000
4001 list_del(&cgrp->sibling);
4002 root->number_of_cgroups--;
4003
4004 err_destroy:
4005
4006 for_each_subsys(root, ss) { 4211 for_each_subsys(root, ss) {
4007 if (cgrp->subsys[ss->subsys_id]) 4212 if (cgrp->subsys[ss->subsys_id])
4008 ss->destroy(cgrp); 4213 ss->css_free(cgrp);
4009 } 4214 }
4010
4011 mutex_unlock(&cgroup_mutex); 4215 mutex_unlock(&cgroup_mutex);
4012
4013 /* Release the reference count that we took on the superblock */ 4216 /* Release the reference count that we took on the superblock */
4014 deactivate_super(sb); 4217 deactivate_super(sb);
4015
4218err_free_id:
4219 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4220err_free_cgrp:
4016 kfree(cgrp); 4221 kfree(cgrp);
4017 return err; 4222 return err;
4223
4224err_destroy:
4225 cgroup_destroy_locked(cgrp);
4226 mutex_unlock(&cgroup_mutex);
4227 mutex_unlock(&dentry->d_inode->i_mutex);
4228 return err;
4018} 4229}
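
The reworked error handling above follows the usual label-per-resource unwind ladder: everything acquired before the failure point is released in reverse order, and once the cgroup is committed, failures go through cgroup_destroy_locked() instead. A standalone sketch of the pattern with made-up resources:

        #include <stdlib.h>

        struct thing { int id; char *buf; };

        static int get_id(void)    { static int next = 1; return next++; }
        static void put_id(int id) { (void)id; }

        static struct thing *thing_create(void)
        {
                struct thing *t = malloc(sizeof(*t));

                if (!t)
                        return NULL;
                t->id = get_id();
                if (t->id < 0)
                        goto err_free_thing;
                t->buf = malloc(64);
                if (!t->buf)
                        goto err_free_id;
                return t;       /* committed: callers now use a destroy path */

        err_free_id:
                put_id(t->id);
        err_free_thing:
                free(t);
                return NULL;
        }

        int main(void)
        {
                struct thing *t = thing_create();

                if (t) {
                        free(t->buf);
                        free(t);
                }
                return 0;
        }
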
4019 4230
4020static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 4231static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -4066,153 +4277,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
4066 return 0; 4277 return 0;
4067} 4278}
4068 4279
4069/* 4280static int cgroup_destroy_locked(struct cgroup *cgrp)
4070 * Atomically mark all (or else none) of the cgroup's CSS objects as 4281 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4071 * CSS_REMOVED. Return true on success, or false if the cgroup has
4072 * busy subsystems. Call with cgroup_mutex held
4073 *
4074 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4075 * not, cgroup removal behaves differently.
4076 *
4077 * If clear is set, css refcnt for the subsystem should be zero before
4078 * cgroup removal can be committed. This is implemented by
4079 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4080 * called multiple times until all css refcnts reach zero and is allowed to
4081 * veto removal on any invocation. This behavior is deprecated and will be
4082 * removed as soon as the existing user (memcg) is updated.
4083 *
4084 * If clear is not set, each css holds an extra reference to the cgroup's
4085 * dentry and cgroup removal proceeds regardless of css refs.
4086 * ->pre_destroy() will be called at least once and is not allowed to fail.
4087 * On the last put of each css, whenever that may be, the extra dentry ref
4088 * is put so that dentry destruction happens only after all css's are
4089 * released.
4090 */
4091static int cgroup_clear_css_refs(struct cgroup *cgrp)
4092{ 4282{
4283 struct dentry *d = cgrp->dentry;
4284 struct cgroup *parent = cgrp->parent;
4285 DEFINE_WAIT(wait);
4286 struct cgroup_event *event, *tmp;
4093 struct cgroup_subsys *ss; 4287 struct cgroup_subsys *ss;
4094 unsigned long flags; 4288 LIST_HEAD(tmp_list);
4095 bool failed = false; 4289
4290 lockdep_assert_held(&d->d_inode->i_mutex);
4291 lockdep_assert_held(&cgroup_mutex);
4096 4292
4097 local_irq_save(flags); 4293 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
4294 return -EBUSY;
4098 4295
4099 /* 4296 /*
4100 * Block new css_tryget() by deactivating refcnt. If all refcnts 4297 * Block new css_tryget() by deactivating refcnt and mark @cgrp
4101 * for subsystems w/ clear_css_refs set were 1 at the moment of 4298 * removed. This makes future css_tryget() and child creation
4102 * deactivation, we succeeded. 4299 * attempts fail thus maintaining the removal conditions verified
4300 * above.
4103 */ 4301 */
4104 for_each_subsys(cgrp->root, ss) { 4302 for_each_subsys(cgrp->root, ss) {
4105 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4303 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4106 4304
4107 WARN_ON(atomic_read(&css->refcnt) < 0); 4305 WARN_ON(atomic_read(&css->refcnt) < 0);
4108 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4306 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
4109
4110 if (ss->__DEPRECATED_clear_css_refs)
4111 failed |= css_refcnt(css) != 1;
4112 } 4307 }
4308 set_bit(CGRP_REMOVED, &cgrp->flags);
4113 4309
4114	/*
4115	 * If succeeded, set REMOVED and put all the base refs; otherwise,
4116	 * restore refcnts to positive values. Either way, all in-progress
4117	 * css_tryget() will be released.
4118	 */
4310	/* tell subsystems to initiate destruction */
4311	for_each_subsys(cgrp->root, ss)
4312		offline_css(ss, cgrp);
4119 for_each_subsys(cgrp->root, ss) {
4120 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4121
4122 if (!failed) {
4123 set_bit(CSS_REMOVED, &css->flags);
4124 css_put(css);
4125 } else {
4126 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
4127 }
4128 }
4129
4130 local_irq_restore(flags);
4131 return !failed;
4132}
4133
4134static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4135{
4136 struct cgroup *cgrp = dentry->d_fsdata;
4137 struct dentry *d;
4138 struct cgroup *parent;
4139 DEFINE_WAIT(wait);
4140 struct cgroup_event *event, *tmp;
4141 int ret;
4142
4143 /* the vfs holds both inode->i_mutex already */
4144again:
4145 mutex_lock(&cgroup_mutex);
4146 if (atomic_read(&cgrp->count) != 0) {
4147 mutex_unlock(&cgroup_mutex);
4148 return -EBUSY;
4149 }
4150 if (!list_empty(&cgrp->children)) {
4151 mutex_unlock(&cgroup_mutex);
4152 return -EBUSY;
4153 }
4154 mutex_unlock(&cgroup_mutex);
4155 4313
4156 /* 4314 /*
4157 * In general, subsystem has no css->refcnt after pre_destroy(). But 4315 * Put all the base refs. Each css holds an extra reference to the
4158 * in racy cases, subsystem may have to get css->refcnt after 4316 * cgroup's dentry and cgroup removal proceeds regardless of css
4159 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes 4317 * refs. On the last put of each css, whenever that may be, the
4160 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue 4318 * extra dentry ref is put so that dentry destruction happens only
4161 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir 4319 * after all css's are released.
4162 * and subsystem's reference count handling. Please see css_get/put
4163 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4164 */ 4320 */
4165 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4321 for_each_subsys(cgrp->root, ss)
4166 4322 css_put(cgrp->subsys[ss->subsys_id]);
4167 /*
4168 * Call pre_destroy handlers of subsys. Notify subsystems
4169 * that rmdir() request comes.
4170 */
4171 ret = cgroup_call_pre_destroy(cgrp);
4172 if (ret) {
4173 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4174 return ret;
4175 }
4176
4177 mutex_lock(&cgroup_mutex);
4178 parent = cgrp->parent;
4179 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
4180 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4181 mutex_unlock(&cgroup_mutex);
4182 return -EBUSY;
4183 }
4184 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4185 if (!cgroup_clear_css_refs(cgrp)) {
4186 mutex_unlock(&cgroup_mutex);
4187 /*
4188 * Because someone may call cgroup_wakeup_rmdir_waiter() before
4189 * prepare_to_wait(), we need to check this flag.
4190 */
4191 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4192 schedule();
4193 finish_wait(&cgroup_rmdir_waitq, &wait);
4194 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4195 if (signal_pending(current))
4196 return -EINTR;
4197 goto again;
4198 }
4199 /* NO css_tryget() can success after here. */
4200 finish_wait(&cgroup_rmdir_waitq, &wait);
4201 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4202 4323
4203 raw_spin_lock(&release_list_lock); 4324 raw_spin_lock(&release_list_lock);
4204 set_bit(CGRP_REMOVED, &cgrp->flags);
4205 if (!list_empty(&cgrp->release_list)) 4325 if (!list_empty(&cgrp->release_list))
4206 list_del_init(&cgrp->release_list); 4326 list_del_init(&cgrp->release_list);
4207 raw_spin_unlock(&release_list_lock); 4327 raw_spin_unlock(&release_list_lock);
4208 4328
4209 /* delete this cgroup from parent->children */ 4329 /* delete this cgroup from parent->children */
4210 list_del_init(&cgrp->sibling); 4330 list_del_rcu(&cgrp->sibling);
4211
4212 list_del_init(&cgrp->allcg_node); 4331 list_del_init(&cgrp->allcg_node);
4213 4332
4214 d = dget(cgrp->dentry); 4333 dget(d);
4215
4216 cgroup_d_remove_dir(d); 4334 cgroup_d_remove_dir(d);
4217 dput(d); 4335 dput(d);
4218 4336
@@ -4222,21 +4340,35 @@ again:
4222 /* 4340 /*
4223 * Unregister events and notify userspace. 4341 * Unregister events and notify userspace.
4224 * Notify userspace about cgroup removing only after rmdir of cgroup 4342 * Notify userspace about cgroup removing only after rmdir of cgroup
4225 * directory to avoid race between userspace and kernelspace 4343 * directory to avoid race between userspace and kernelspace. Use
4344 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
4345 * cgroup_event_wake() is called with the wait queue head locked,
4346 * remove_wait_queue() cannot be called while holding event_list_lock.
4226 */ 4347 */
4227 spin_lock(&cgrp->event_list_lock); 4348 spin_lock(&cgrp->event_list_lock);
4228 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4349 list_splice_init(&cgrp->event_list, &tmp_list);
4229 list_del(&event->list); 4350 spin_unlock(&cgrp->event_list_lock);
4351 list_for_each_entry_safe(event, tmp, &tmp_list, list) {
4352 list_del_init(&event->list);
4230 remove_wait_queue(event->wqh, &event->wait); 4353 remove_wait_queue(event->wqh, &event->wait);
4231 eventfd_signal(event->eventfd, 1); 4354 eventfd_signal(event->eventfd, 1);
4232 schedule_work(&event->remove); 4355 schedule_work(&event->remove);
4233 } 4356 }
4234 spin_unlock(&cgrp->event_list_lock);
4235 4357
4236 mutex_unlock(&cgroup_mutex);
4237 return 0; 4358 return 0;
4238} 4359}
4239 4360
4361static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4362{
4363 int ret;
4364
4365 mutex_lock(&cgroup_mutex);
4366 ret = cgroup_destroy_locked(dentry->d_fsdata);
4367 mutex_unlock(&cgroup_mutex);
4368
4369 return ret;
4370}
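
The destroy path relies on biasing each css refcount with CSS_DEACT_BIAS so that every later css_tryget() sees a non-positive value and fails, while references already held can still be dropped normally. A userspace sketch of that idea, using illustrative constants rather than the kernel's implementation:

        #include <limits.h>
        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        #define DEACT_BIAS      (INT_MIN / 2)

        static atomic_int refcnt = 1;   /* base reference */

        static bool tryget(void)
        {
                int v = atomic_load(&refcnt);

                while (v > 0) {
                        if (atomic_compare_exchange_weak(&refcnt, &v, v + 1))
                                return true;    /* got a reference */
                }
                return false;                   /* deactivated or already dead */
        }

        int main(void)
        {
                printf("before deactivation: %d\n", tryget());  /* 1 */
                atomic_fetch_add(&refcnt, DEACT_BIAS);          /* block new users */
                printf("after deactivation: %d\n", tryget());   /* 0 */
                return 0;
        }
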
4371
4240static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4372static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4241{ 4373{
4242 INIT_LIST_HEAD(&ss->cftsets); 4374 INIT_LIST_HEAD(&ss->cftsets);
@@ -4257,13 +4389,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4257 4389
4258 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4390 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4259 4391
4392 mutex_lock(&cgroup_mutex);
4393
4260 /* init base cftset */ 4394 /* init base cftset */
4261 cgroup_init_cftsets(ss); 4395 cgroup_init_cftsets(ss);
4262 4396
4263 /* Create the top cgroup state for this subsystem */ 4397 /* Create the top cgroup state for this subsystem */
4264 list_add(&ss->sibling, &rootnode.subsys_list); 4398 list_add(&ss->sibling, &rootnode.subsys_list);
4265 ss->root = &rootnode; 4399 ss->root = &rootnode;
4266 css = ss->create(dummytop); 4400 css = ss->css_alloc(dummytop);
4267 /* We don't handle early failures gracefully */ 4401 /* We don't handle early failures gracefully */
4268 BUG_ON(IS_ERR(css)); 4402 BUG_ON(IS_ERR(css));
4269 init_cgroup_css(css, ss, dummytop); 4403 init_cgroup_css(css, ss, dummytop);
@@ -4272,7 +4406,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4272 * pointer to this state - since the subsystem is 4406 * pointer to this state - since the subsystem is
4273 * newly registered, all tasks and hence the 4407 * newly registered, all tasks and hence the
4274 * init_css_set is in the subsystem's top cgroup. */ 4408 * init_css_set is in the subsystem's top cgroup. */
4275 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 4409 init_css_set.subsys[ss->subsys_id] = css;
4276 4410
4277 need_forkexit_callback |= ss->fork || ss->exit; 4411 need_forkexit_callback |= ss->fork || ss->exit;
4278 4412
@@ -4282,6 +4416,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4282 BUG_ON(!list_empty(&init_task.tasks)); 4416 BUG_ON(!list_empty(&init_task.tasks));
4283 4417
4284 ss->active = 1; 4418 ss->active = 1;
4419 BUG_ON(online_css(ss, dummytop));
4420
4421 mutex_unlock(&cgroup_mutex);
4285 4422
4286 /* this function shouldn't be used with modular subsystems, since they 4423 /* this function shouldn't be used with modular subsystems, since they
4287 * need to register a subsys_id, among other things */ 4424 * need to register a subsys_id, among other things */
@@ -4299,12 +4436,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4299 */ 4436 */
4300int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4437int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4301{ 4438{
4302 int i;
4303 struct cgroup_subsys_state *css; 4439 struct cgroup_subsys_state *css;
4440 int i, ret;
4304 4441
4305 /* check name and function validity */ 4442 /* check name and function validity */
4306 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4443 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4307 ss->create == NULL || ss->destroy == NULL) 4444 ss->css_alloc == NULL || ss->css_free == NULL)
4308 return -EINVAL; 4445 return -EINVAL;
4309 4446
4310 /* 4447 /*
@@ -4321,8 +4458,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4321 * since cgroup_init_subsys will have already taken care of it. 4458 * since cgroup_init_subsys will have already taken care of it.
4322 */ 4459 */
4323 if (ss->module == NULL) { 4460 if (ss->module == NULL) {
4324 /* a few sanity checks */ 4461 /* a sanity check */
4325 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
4326 BUG_ON(subsys[ss->subsys_id] != ss); 4462 BUG_ON(subsys[ss->subsys_id] != ss);
4327 return 0; 4463 return 0;
4328 } 4464 }
@@ -4330,33 +4466,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4330 /* init base cftset */ 4466 /* init base cftset */
4331 cgroup_init_cftsets(ss); 4467 cgroup_init_cftsets(ss);
4332 4468
4333 /*
4334 * need to register a subsys id before anything else - for example,
4335 * init_cgroup_css needs it.
4336 */
4337 mutex_lock(&cgroup_mutex); 4469 mutex_lock(&cgroup_mutex);
4338 /* find the first empty slot in the array */ 4470 subsys[ss->subsys_id] = ss;
4339 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
4340 if (subsys[i] == NULL)
4341 break;
4342 }
4343 if (i == CGROUP_SUBSYS_COUNT) {
4344 /* maximum number of subsystems already registered! */
4345 mutex_unlock(&cgroup_mutex);
4346 return -EBUSY;
4347 }
4348 /* assign ourselves the subsys_id */
4349 ss->subsys_id = i;
4350 subsys[i] = ss;
4351 4471
4352 /* 4472 /*
4353 * no ss->create seems to need anything important in the ss struct, so 4473 * no ss->css_alloc seems to need anything important in the ss
4354 * this can happen first (i.e. before the rootnode attachment). 4474 * struct, so this can happen first (i.e. before the rootnode
4475 * attachment).
4355 */ 4476 */
4356 css = ss->create(dummytop); 4477 css = ss->css_alloc(dummytop);
4357 if (IS_ERR(css)) { 4478 if (IS_ERR(css)) {
4358 /* failure case - need to deassign the subsys[] slot. */ 4479 /* failure case - need to deassign the subsys[] slot. */
4359 subsys[i] = NULL; 4480 subsys[ss->subsys_id] = NULL;
4360 mutex_unlock(&cgroup_mutex); 4481 mutex_unlock(&cgroup_mutex);
4361 return PTR_ERR(css); 4482 return PTR_ERR(css);
4362 } 4483 }
@@ -4368,14 +4489,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4368 init_cgroup_css(css, ss, dummytop); 4489 init_cgroup_css(css, ss, dummytop);
4369 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4490 /* init_idr must be after init_cgroup_css because it sets css->id. */
4370 if (ss->use_id) { 4491 if (ss->use_id) {
4371 int ret = cgroup_init_idr(ss, css); 4492 ret = cgroup_init_idr(ss, css);
4372 if (ret) { 4493 if (ret)
4373 dummytop->subsys[ss->subsys_id] = NULL; 4494 goto err_unload;
4374 ss->destroy(dummytop);
4375 subsys[i] = NULL;
4376 mutex_unlock(&cgroup_mutex);
4377 return ret;
4378 }
4379 } 4495 }
4380 4496
4381 /* 4497 /*
@@ -4408,10 +4524,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4408 write_unlock(&css_set_lock); 4524 write_unlock(&css_set_lock);
4409 4525
4410 ss->active = 1; 4526 ss->active = 1;
4527 ret = online_css(ss, dummytop);
4528 if (ret)
4529 goto err_unload;
4411 4530
4412 /* success! */ 4531 /* success! */
4413 mutex_unlock(&cgroup_mutex); 4532 mutex_unlock(&cgroup_mutex);
4414 return 0; 4533 return 0;
4534
4535err_unload:
4536 mutex_unlock(&cgroup_mutex);
4537 /* @ss can't be mounted here as try_module_get() would fail */
4538 cgroup_unload_subsys(ss);
4539 return ret;
4415} 4540}
4416EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4541EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4417 4542
@@ -4438,8 +4563,16 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4438 BUG_ON(ss->root != &rootnode); 4563 BUG_ON(ss->root != &rootnode);
4439 4564
4440 mutex_lock(&cgroup_mutex); 4565 mutex_lock(&cgroup_mutex);
4566
4567 offline_css(ss, dummytop);
4568 ss->active = 0;
4569
4570 if (ss->use_id) {
4571 idr_remove_all(&ss->idr);
4572 idr_destroy(&ss->idr);
4573 }
4574
4441 /* deassign the subsys_id */ 4575 /* deassign the subsys_id */
4442 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
4443 subsys[ss->subsys_id] = NULL; 4576 subsys[ss->subsys_id] = NULL;
4444 4577
4445 /* remove subsystem from rootnode's list of subsystems */ 4578 /* remove subsystem from rootnode's list of subsystems */
@@ -4454,7 +4587,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4454 struct css_set *cg = link->cg; 4587 struct css_set *cg = link->cg;
4455 4588
4456 hlist_del(&cg->hlist); 4589 hlist_del(&cg->hlist);
4457 BUG_ON(!cg->subsys[ss->subsys_id]);
4458 cg->subsys[ss->subsys_id] = NULL; 4590 cg->subsys[ss->subsys_id] = NULL;
4459 hhead = css_set_hash(cg->subsys); 4591 hhead = css_set_hash(cg->subsys);
4460 hlist_add_head(&cg->hlist, hhead); 4592 hlist_add_head(&cg->hlist, hhead);
@@ -4462,12 +4594,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4462 write_unlock(&css_set_lock); 4594 write_unlock(&css_set_lock);
4463 4595
4464 /* 4596 /*
4465 * remove subsystem's css from the dummytop and free it - need to free 4597 * remove subsystem's css from the dummytop and free it - need to
4466 * before marking as null because ss->destroy needs the cgrp->subsys 4598 * free before marking as null because ss->css_free needs the
4467 * pointer to find their state. note that this also takes care of 4599 * cgrp->subsys pointer to find their state. note that this also
4468 * freeing the css_id. 4600 * takes care of freeing the css_id.
4469 */ 4601 */
4470 ss->destroy(dummytop); 4602 ss->css_free(dummytop);
4471 dummytop->subsys[ss->subsys_id] = NULL; 4603 dummytop->subsys[ss->subsys_id] = NULL;
4472 4604
4473 mutex_unlock(&cgroup_mutex); 4605 mutex_unlock(&cgroup_mutex);
@@ -4502,14 +4634,17 @@ int __init cgroup_init_early(void)
4502 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 4634 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4503 INIT_HLIST_HEAD(&css_set_table[i]); 4635 INIT_HLIST_HEAD(&css_set_table[i]);
4504 4636
4505 /* at bootup time, we don't worry about modular subsystems */ 4637 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4506 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4507 struct cgroup_subsys *ss = subsys[i]; 4638 struct cgroup_subsys *ss = subsys[i];
4508 4639
4640 /* at bootup time, we don't worry about modular subsystems */
4641 if (!ss || ss->module)
4642 continue;
4643
4509 BUG_ON(!ss->name); 4644 BUG_ON(!ss->name);
4510 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4645 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4511 BUG_ON(!ss->create); 4646 BUG_ON(!ss->css_alloc);
4512 BUG_ON(!ss->destroy); 4647 BUG_ON(!ss->css_free);
4513 if (ss->subsys_id != i) { 4648 if (ss->subsys_id != i) {
4514 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4649 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4515 ss->name, ss->subsys_id); 4650 ss->name, ss->subsys_id);
@@ -4538,9 +4673,12 @@ int __init cgroup_init(void)
4538 if (err) 4673 if (err)
4539 return err; 4674 return err;
4540 4675
4541 /* at bootup time, we don't worry about modular subsystems */ 4676 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4542 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4543 struct cgroup_subsys *ss = subsys[i]; 4677 struct cgroup_subsys *ss = subsys[i];
4678
4679 /* at bootup time, we don't worry about modular subsystems */
4680 if (!ss || ss->module)
4681 continue;
4544 if (!ss->early_init) 4682 if (!ss->early_init)
4545 cgroup_init_subsys(ss); 4683 cgroup_init_subsys(ss);
4546 if (ss->use_id) 4684 if (ss->use_id)
@@ -4695,70 +4833,37 @@ static const struct file_operations proc_cgroupstats_operations = {
4695 * 4833 *
4696 * A pointer to the shared css_set was automatically copied in 4834 * A pointer to the shared css_set was automatically copied in
4697 * fork.c by dup_task_struct(). However, we ignore that copy, since 4835 * fork.c by dup_task_struct(). However, we ignore that copy, since
4698 * it was not made under the protection of RCU, cgroup_mutex or 4836 * it was not made under the protection of RCU or cgroup_mutex, so
4699 * threadgroup_change_begin(), so it might no longer be a valid 4837 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
4700 * cgroup pointer. cgroup_attach_task() might have already changed 4838 * have already changed current->cgroups, allowing the previously
4701 * current->cgroups, allowing the previously referenced cgroup 4839 * referenced cgroup group to be removed and freed.
4702 * group to be removed and freed.
4703 *
4704 * Outside the pointer validity we also need to process the css_set
4705 * inheritance between threadgoup_change_begin() and
4706 * threadgoup_change_end(), this way there is no leak in any process
4707 * wide migration performed by cgroup_attach_proc() that could otherwise
4708 * miss a thread because it is too early or too late in the fork stage.
4709 * 4840 *
4710 * At the point that cgroup_fork() is called, 'current' is the parent 4841 * At the point that cgroup_fork() is called, 'current' is the parent
4711 * task, and the passed argument 'child' points to the child task. 4842 * task, and the passed argument 'child' points to the child task.
4712 */ 4843 */
4713void cgroup_fork(struct task_struct *child) 4844void cgroup_fork(struct task_struct *child)
4714{ 4845{
4715 /* 4846 task_lock(current);
4716 * We don't need to task_lock() current because current->cgroups
4717 * can't be changed concurrently here. The parent obviously hasn't
4718 * exited and called cgroup_exit(), and we are synchronized against
4719 * cgroup migration through threadgroup_change_begin().
4720 */
4721 child->cgroups = current->cgroups; 4847 child->cgroups = current->cgroups;
4722 get_css_set(child->cgroups); 4848 get_css_set(child->cgroups);
4849 task_unlock(current);
4723 INIT_LIST_HEAD(&child->cg_list); 4850 INIT_LIST_HEAD(&child->cg_list);
4724} 4851}
4725 4852
4726/** 4853/**
4727 * cgroup_fork_callbacks - run fork callbacks
4728 * @child: the new task
4729 *
4730 * Called on a new task very soon before adding it to the
4731 * tasklist. No need to take any locks since no-one can
4732 * be operating on this task.
4733 */
4734void cgroup_fork_callbacks(struct task_struct *child)
4735{
4736 if (need_forkexit_callback) {
4737 int i;
4738 /*
4739 * forkexit callbacks are only supported for builtin
4740 * subsystems, and the builtin section of the subsys array is
4741 * immutable, so we don't need to lock the subsys array here.
4742 */
4743 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4744 struct cgroup_subsys *ss = subsys[i];
4745 if (ss->fork)
4746 ss->fork(child);
4747 }
4748 }
4749}
4750
4751/**
4752 * cgroup_post_fork - called on a new task after adding it to the task list 4854 * cgroup_post_fork - called on a new task after adding it to the task list
4753 * @child: the task in question 4855 * @child: the task in question
4754 * 4856 *
4755 * Adds the task to the list running through its css_set if necessary. 4857 * Adds the task to the list running through its css_set if necessary and
4756 * Has to be after the task is visible on the task list in case we race 4858 * call the subsystem fork() callbacks. Has to be after the task is
4757 * with the first call to cgroup_iter_start() - to guarantee that the 4859 * visible on the task list in case we race with the first call to
4758 * new task ends up on its list. 4860 * cgroup_iter_start() - to guarantee that the new task ends up on its
4861 * list.
4759 */ 4862 */
4760void cgroup_post_fork(struct task_struct *child) 4863void cgroup_post_fork(struct task_struct *child)
4761{ 4864{
4865 int i;
4866
4762 /* 4867 /*
4763 * use_task_css_set_links is set to 1 before we walk the tasklist 4868 * use_task_css_set_links is set to 1 before we walk the tasklist
4764 * under the tasklist_lock and we read it here after we added the child 4869 * under the tasklist_lock and we read it here after we added the child
@@ -4772,22 +4877,36 @@ void cgroup_post_fork(struct task_struct *child)
4772 */ 4877 */
4773 if (use_task_css_set_links) { 4878 if (use_task_css_set_links) {
4774 write_lock(&css_set_lock); 4879 write_lock(&css_set_lock);
4775 if (list_empty(&child->cg_list)) { 4880 task_lock(child);
4881 if (list_empty(&child->cg_list))
4882 list_add(&child->cg_list, &child->cgroups->tasks);
4883 task_unlock(child);
4884 write_unlock(&css_set_lock);
4885 }
4886
4887 /*
4888 * Call ss->fork(). This must happen after @child is linked on
4889 * css_set; otherwise, @child might change state between ->fork()
4890 * and addition to css_set.
4891 */
4892 if (need_forkexit_callback) {
4893 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4894 struct cgroup_subsys *ss = subsys[i];
4895
4776 /* 4896 /*
4777 * It's safe to use child->cgroups without task_lock() 4897 * fork/exit callbacks are supported only for
4778 * here because we are protected through 4898 * builtin subsystems and we don't need further
4779 * threadgroup_change_begin() against concurrent 4899 * synchronization as they never go away.
4780 * css_set change in cgroup_task_migrate(). Also
4781 * the task can't exit at that point until
4782 * wake_up_new_task() is called, so we are protected
4783 * against cgroup_exit() setting child->cgroup to
4784 * init_css_set.
4785 */ 4900 */
4786 list_add(&child->cg_list, &child->cgroups->tasks); 4901 if (!ss || ss->module)
4902 continue;
4903
4904 if (ss->fork)
4905 ss->fork(child);
4787 } 4906 }
4788 write_unlock(&css_set_lock);
4789 } 4907 }
4790} 4908}
4909
4791/** 4910/**
4792 * cgroup_exit - detach cgroup from exiting task 4911 * cgroup_exit - detach cgroup from exiting task
4793 * @tsk: pointer to task_struct of exiting process 4912 * @tsk: pointer to task_struct of exiting process
@@ -4846,12 +4965,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4846 tsk->cgroups = &init_css_set; 4965 tsk->cgroups = &init_css_set;
4847 4966
4848 if (run_callbacks && need_forkexit_callback) { 4967 if (run_callbacks && need_forkexit_callback) {
4849 /* 4968 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4850 * modular subsystems can't use callbacks, so no need to lock
4851 * the subsys array
4852 */
4853 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4854 struct cgroup_subsys *ss = subsys[i]; 4969 struct cgroup_subsys *ss = subsys[i];
4970
4971 /* modular subsystems can't use callbacks */
4972 if (!ss || ss->module)
4973 continue;
4974
4855 if (ss->exit) { 4975 if (ss->exit) {
4856 struct cgroup *old_cgrp = 4976 struct cgroup *old_cgrp =
4857 rcu_dereference_raw(cg->subsys[i])->cgroup; 4977 rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -4919,15 +5039,17 @@ static void check_for_release(struct cgroup *cgrp)
4919/* Caller must verify that the css is not for root cgroup */ 5039/* Caller must verify that the css is not for root cgroup */
4920bool __css_tryget(struct cgroup_subsys_state *css) 5040bool __css_tryget(struct cgroup_subsys_state *css)
4921{ 5041{
4922 do { 5042 while (true) {
4923 int v = css_refcnt(css); 5043 int t, v;
4924 5044
4925 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) 5045 v = css_refcnt(css);
5046 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
5047 if (likely(t == v))
4926 return true; 5048 return true;
5049 else if (t < 0)
5050 return false;
4927 cpu_relax(); 5051 cpu_relax();
4928 } while (!test_bit(CSS_REMOVED, &css->flags)); 5052 }
4929
4930 return false;
4931} 5053}
4932EXPORT_SYMBOL_GPL(__css_tryget); 5054EXPORT_SYMBOL_GPL(__css_tryget);
4933 5055
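The rewritten __css_tryget() above is an optimistic reference grab: read the count, try to bump it with a compare-and-swap, and bail out once the count has gone negative because the css is being deactivated. A compilable userspace sketch of the same idea with C11 atomics; the struct and the negative-means-dying convention are illustrative, not the css API:

/* Userspace sketch of an optimistic "tryget": bump the count with a
 * compare-and-swap unless it has gone negative, which marks the object
 * as dying. Illustrative only; this is not the css refcount API. */
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_int refcnt;		/* < 0 once the object starts dying */
};

static bool obj_tryget(struct obj *o)
{
	int v = atomic_load(&o->refcnt);

	while (v >= 0) {
		/* try v -> v + 1; on failure the current value is reloaded into v */
		if (atomic_compare_exchange_weak(&o->refcnt, &v, v + 1))
			return true;
		sched_yield();		/* stand-in for cpu_relax() */
	}
	return false;			/* deactivation already started */
}

int main(void)
{
	struct obj o;

	atomic_init(&o.refcnt, 1);
	printf("tryget on live object:  %d\n", obj_tryget(&o));	/* 1 */
	atomic_store(&o.refcnt, -1);	/* simulate deactivation */
	printf("tryget on dying object: %d\n", obj_tryget(&o));	/* 0 */
	return 0;
}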
@@ -4946,11 +5068,9 @@ void __css_put(struct cgroup_subsys_state *css)
4946 set_bit(CGRP_RELEASABLE, &cgrp->flags); 5068 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4947 check_for_release(cgrp); 5069 check_for_release(cgrp);
4948 } 5070 }
4949 cgroup_wakeup_rmdir_waiter(cgrp);
4950 break; 5071 break;
4951 case 0: 5072 case 0:
4952 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) 5073 schedule_work(&css->dput_work);
4953 schedule_work(&css->dput_work);
4954 break; 5074 break;
4955 } 5075 }
4956 rcu_read_unlock(); 5076 rcu_read_unlock();
@@ -5037,13 +5157,17 @@ static int __init cgroup_disable(char *str)
5037 while ((token = strsep(&str, ",")) != NULL) { 5157 while ((token = strsep(&str, ",")) != NULL) {
5038 if (!*token) 5158 if (!*token)
5039 continue; 5159 continue;
5040 /* 5160 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
5041 * cgroup_disable, being at boot time, can't know about module
5042 * subsystems, so we don't worry about them.
5043 */
5044 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
5045 struct cgroup_subsys *ss = subsys[i]; 5161 struct cgroup_subsys *ss = subsys[i];
5046 5162
5163 /*
5164 * cgroup_disable, being at boot time, can't
5165 * know about module subsystems, so we don't
5166 * worry about them.
5167 */
5168 if (!ss || ss->module)
5169 continue;
5170
5047 if (!strcmp(token, ss->name)) { 5171 if (!strcmp(token, ss->name)) {
5048 ss->disabled = 1; 5172 ss->disabled = 1;
5049 printk(KERN_INFO "Disabling %s control group" 5173 printk(KERN_INFO "Disabling %s control group"
@@ -5332,7 +5456,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5332} 5456}
5333 5457
5334#ifdef CONFIG_CGROUP_DEBUG 5458#ifdef CONFIG_CGROUP_DEBUG
5335static struct cgroup_subsys_state *debug_create(struct cgroup *cont) 5459static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5336{ 5460{
5337 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5461 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5338 5462
@@ -5342,7 +5466,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5342 return css; 5466 return css;
5343} 5467}
5344 5468
5345static void debug_destroy(struct cgroup *cont) 5469static void debug_css_free(struct cgroup *cont)
5346{ 5470{
5347 kfree(cont->subsys[debug_subsys_id]); 5471 kfree(cont->subsys[debug_subsys_id]);
5348} 5472}
@@ -5471,8 +5595,8 @@ static struct cftype debug_files[] = {
5471 5595
5472struct cgroup_subsys debug_subsys = { 5596struct cgroup_subsys debug_subsys = {
5473 .name = "debug", 5597 .name = "debug",
5474 .create = debug_create, 5598 .css_alloc = debug_css_alloc,
5475 .destroy = debug_destroy, 5599 .css_free = debug_css_free,
5476 .subsys_id = debug_subsys_id, 5600 .subsys_id = debug_subsys_id,
5477 .base_cftypes = debug_files, 5601 .base_cftypes = debug_files,
5478}; 5602};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 3649fc6b3eaa..75dda1ea5026 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -22,24 +22,33 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24 24
25enum freezer_state { 25/*
26 CGROUP_THAWED = 0, 26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
27 CGROUP_FREEZING, 27 * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
28 CGROUP_FROZEN, 28 * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
29 * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
30 * its ancestors has FREEZING_SELF set.
31 */
32enum freezer_state_flags {
33 CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
34 CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
35 CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
36 CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
37
38 /* mask for all FREEZING flags */
39 CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
29}; 40};
30 41
31struct freezer { 42struct freezer {
32 struct cgroup_subsys_state css; 43 struct cgroup_subsys_state css;
33 enum freezer_state state; 44 unsigned int state;
34 spinlock_t lock; /* protects _writes_ to state */ 45 spinlock_t lock;
35}; 46};
36 47
37static inline struct freezer *cgroup_freezer( 48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
38 struct cgroup *cgroup)
39{ 49{
40 return container_of( 50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
41 cgroup_subsys_state(cgroup, freezer_subsys_id), 51 struct freezer, css);
42 struct freezer, css);
43} 52}
44 53
45static inline struct freezer *task_freezer(struct task_struct *task) 54static inline struct freezer *task_freezer(struct task_struct *task)
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 57 struct freezer, css);
49} 58}
50 59
60static struct freezer *parent_freezer(struct freezer *freezer)
61{
62 struct cgroup *pcg = freezer->css.cgroup->parent;
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67}
68
51bool cgroup_freezing(struct task_struct *task) 69bool cgroup_freezing(struct task_struct *task)
52{ 70{
53 enum freezer_state state;
54 bool ret; 71 bool ret;
55 72
56 rcu_read_lock(); 73 rcu_read_lock();
57 state = task_freezer(task)->state; 74 ret = task_freezer(task)->state & CGROUP_FREEZING;
58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
59 rcu_read_unlock(); 75 rcu_read_unlock();
60 76
61 return ret; 77 return ret;
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task)
65 * cgroups_write_string() limits the size of freezer state strings to 81 * cgroups_write_string() limits the size of freezer state strings to
66 * CGROUP_LOCAL_BUFFER_SIZE 82 * CGROUP_LOCAL_BUFFER_SIZE
67 */ 83 */
68static const char *freezer_state_strs[] = { 84static const char *freezer_state_strs(unsigned int state)
69 "THAWED", 85{
70 "FREEZING", 86 if (state & CGROUP_FROZEN)
71 "FROZEN", 87 return "FROZEN";
88 if (state & CGROUP_FREEZING)
89 return "FREEZING";
90 return "THAWED";
72}; 91};
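The old three-value state machine is replaced above by independent bit flags, and freezer_state_strs() reports them with a fixed precedence: FROZEN wins over FREEZING, and anything else reads as THAWED. A standalone sketch of that encoding and precedence, reusing the flag names purely for illustration:

/* Standalone illustration of the flag-based freezer state and the
 * reporting precedence used above; the values mirror the new enum. */
#include <stdio.h>

enum {
	CGROUP_FREEZER_ONLINE	= 1 << 0,
	CGROUP_FREEZING_SELF	= 1 << 1,
	CGROUP_FREEZING_PARENT	= 1 << 2,
	CGROUP_FROZEN		= 1 << 3,
	CGROUP_FREEZING		= CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
};

static const char *state_str(unsigned int state)
{
	if (state & CGROUP_FROZEN)
		return "FROZEN";
	if (state & CGROUP_FREEZING)
		return "FREEZING";
	return "THAWED";
}

int main(void)
{
	unsigned int state = CGROUP_FREEZER_ONLINE;

	printf("%s\n", state_str(state));	/* THAWED */
	state |= CGROUP_FREEZING_PARENT;	/* an ancestor started freezing */
	printf("%s\n", state_str(state));	/* FREEZING */
	state |= CGROUP_FROZEN;			/* everything finished freezing */
	printf("%s\n", state_str(state));	/* FROZEN */
	return 0;
}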
73 92
74/*
75 * State diagram
76 * Transitions are caused by userspace writes to the freezer.state file.
77 * The values in parenthesis are state labels. The rest are edge labels.
78 *
79 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
80 * ^ ^ | |
81 * | \_______THAWED_______/ |
82 * \__________________________THAWED____________/
83 */
84
85struct cgroup_subsys freezer_subsys; 93struct cgroup_subsys freezer_subsys;
86 94
87/* Locks taken and their ordering 95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
88 * ------------------------------
89 * cgroup_mutex (AKA cgroup_lock)
90 * freezer->lock
91 * css_set_lock
92 * task->alloc_lock (AKA task_lock)
93 * task->sighand->siglock
94 *
95 * cgroup code forces css_set_lock to be taken before task->alloc_lock
96 *
97 * freezer_create(), freezer_destroy():
98 * cgroup_mutex [ by cgroup core ]
99 *
100 * freezer_can_attach():
101 * cgroup_mutex (held by caller of can_attach)
102 *
103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
104 * freezer->lock
105 * sighand->siglock (if the cgroup is freezing)
106 *
107 * freezer_read():
108 * cgroup_mutex
109 * freezer->lock
110 * write_lock css_set_lock (cgroup iterator start)
111 * task->alloc_lock
112 * read_lock css_set_lock (cgroup iterator start)
113 *
114 * freezer_write() (freeze):
115 * cgroup_mutex
116 * freezer->lock
117 * write_lock css_set_lock (cgroup iterator start)
118 * task->alloc_lock
119 * read_lock css_set_lock (cgroup iterator start)
120 * sighand->siglock (fake signal delivery inside freeze_task())
121 *
122 * freezer_write() (unfreeze):
123 * cgroup_mutex
124 * freezer->lock
125 * write_lock css_set_lock (cgroup iterator start)
126 * task->alloc_lock
127 * read_lock css_set_lock (cgroup iterator start)
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock
130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132{ 96{
133 struct freezer *freezer; 97 struct freezer *freezer;
134 98
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
137 return ERR_PTR(-ENOMEM); 101 return ERR_PTR(-ENOMEM);
138 102
139 spin_lock_init(&freezer->lock); 103 spin_lock_init(&freezer->lock);
140 freezer->state = CGROUP_THAWED;
141 return &freezer->css; 104 return &freezer->css;
142} 105}
143 106
144static void freezer_destroy(struct cgroup *cgroup) 107/**
108 * freezer_css_online - commit creation of a freezer cgroup
109 * @cgroup: cgroup being created
110 *
111 * We're committing to creation of @cgroup. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our
113 * freezer->lock.
114 */
115static int freezer_css_online(struct cgroup *cgroup)
116{
117 struct freezer *freezer = cgroup_freezer(cgroup);
118 struct freezer *parent = parent_freezer(freezer);
119
120 /*
121 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details.
124 */
125 if (parent)
126 spin_lock_irq(&parent->lock);
127 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
128
129 freezer->state |= CGROUP_FREEZER_ONLINE;
130
131 if (parent && (parent->state & CGROUP_FREEZING)) {
132 freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
133 atomic_inc(&system_freezing_cnt);
134 }
135
136 spin_unlock(&freezer->lock);
137 if (parent)
138 spin_unlock_irq(&parent->lock);
139
140 return 0;
141}
142
143/**
144 * freezer_css_offline - initiate destruction of @cgroup
145 * @cgroup: cgroup being destroyed
146 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count
148 * if it was holding one.
149 */
150static void freezer_css_offline(struct cgroup *cgroup)
145{ 151{
146 struct freezer *freezer = cgroup_freezer(cgroup); 152 struct freezer *freezer = cgroup_freezer(cgroup);
147 153
148 if (freezer->state != CGROUP_THAWED) 154 spin_lock_irq(&freezer->lock);
155
156 if (freezer->state & CGROUP_FREEZING)
149 atomic_dec(&system_freezing_cnt); 157 atomic_dec(&system_freezing_cnt);
150 kfree(freezer); 158
159 freezer->state = 0;
160
161 spin_unlock_irq(&freezer->lock);
151} 162}
152 163
153/* task is frozen or will freeze immediately when next it gets woken */ 164static void freezer_css_free(struct cgroup *cgroup)
154static bool is_task_frozen_enough(struct task_struct *task)
155{ 165{
156 return frozen(task) || 166 kfree(cgroup_freezer(cgroup));
157 (task_is_stopped_or_traced(task) && freezing(task));
158} 167}
159 168
160/* 169/*
161 * The call to cgroup_lock() in the freezer.state write method prevents 170 * Tasks can be migrated into a different freezer anytime regardless of its
162 * a write to that file racing against an attach, and hence the 171 * current state. freezer_attach() is responsible for making new tasks
163 * can_attach() result will remain valid until the attach completes. 172 * conform to the current state.
173 *
174 * Freezer state changes and task migration are synchronized via
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks.
164 */ 177 */
165static int freezer_can_attach(struct cgroup *new_cgroup, 178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset)
166 struct cgroup_taskset *tset)
167{ 179{
168 struct freezer *freezer; 180 struct freezer *freezer = cgroup_freezer(new_cgrp);
169 struct task_struct *task; 181 struct task_struct *task;
182 bool clear_frozen = false;
183
184 spin_lock_irq(&freezer->lock);
170 185
171 /* 186 /*
172 * Anything frozen can't move or be moved to/from. 187 * Make the new tasks conform to the current state of @new_cgrp.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later.
191 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its
193 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
173 */ 195 */
174 cgroup_taskset_for_each(task, new_cgroup, tset) 196 cgroup_taskset_for_each(task, new_cgrp, tset) {
175 if (cgroup_freezing(task)) 197 if (!(freezer->state & CGROUP_FREEZING)) {
176 return -EBUSY; 198 __thaw_task(task);
199 } else {
200 freeze_task(task);
201 freezer->state &= ~CGROUP_FROZEN;
202 clear_frozen = true;
203 }
204 }
177 205
178 freezer = cgroup_freezer(new_cgroup); 206 spin_unlock_irq(&freezer->lock);
179 if (freezer->state != CGROUP_THAWED)
180 return -EBUSY;
181 207
182 return 0; 208 /*
209 * Propagate FROZEN clearing upwards. We may race with
210 * update_if_frozen(), but as long as both work bottom-up, either
211 * update_if_frozen() sees child's FROZEN cleared or we clear the
212 * parent's FROZEN later. No parent w/ !FROZEN children can be
213 * left FROZEN.
214 */
215 while (clear_frozen && (freezer = parent_freezer(freezer))) {
216 spin_lock_irq(&freezer->lock);
217 freezer->state &= ~CGROUP_FROZEN;
218 clear_frozen = freezer->state & CGROUP_FREEZING;
219 spin_unlock_irq(&freezer->lock);
220 }
183} 221}
184 222
185static void freezer_fork(struct task_struct *task) 223static void freezer_fork(struct task_struct *task)
186{ 224{
187 struct freezer *freezer; 225 struct freezer *freezer;
188 226
189 /*
190 * No lock is needed, since the task isn't on tasklist yet,
191 * so it can't be moved to another cgroup, which means the
192 * freezer won't be removed and will be valid during this
193 * function call. Nevertheless, apply RCU read-side critical
194 * section to suppress RCU lockdep false positives.
195 */
196 rcu_read_lock(); 227 rcu_read_lock();
197 freezer = task_freezer(task); 228 freezer = task_freezer(task);
198 rcu_read_unlock();
199 229
200 /* 230 /*
201 * The root cgroup is non-freezable, so we can skip the 231 * The root cgroup is non-freezable, so we can skip the
202 * following check. 232 * following check.
203 */ 233 */
204 if (!freezer->css.cgroup->parent) 234 if (!freezer->css.cgroup->parent)
205 return; 235 goto out;
206 236
207 spin_lock_irq(&freezer->lock); 237 spin_lock_irq(&freezer->lock);
208 BUG_ON(freezer->state == CGROUP_FROZEN); 238 if (freezer->state & CGROUP_FREEZING)
209
210 /* Locking avoids race with FREEZING -> THAWED transitions. */
211 if (freezer->state == CGROUP_FREEZING)
212 freeze_task(task); 239 freeze_task(task);
213 spin_unlock_irq(&freezer->lock); 240 spin_unlock_irq(&freezer->lock);
241out:
242 rcu_read_unlock();
214} 243}
215 244
216/* 245/**
217 * caller must hold freezer->lock 246 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest
248 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN,
251 * this function checks whether all tasks of this cgroup and the descendant
252 * cgroups finished freezing and, if so, sets FROZEN.
253 *
254 * The caller is responsible for grabbing RCU read lock and calling
255 * update_if_frozen() on all descendants prior to invoking this function.
256 *
257 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details.
218 */ 260 */
219static void update_if_frozen(struct cgroup *cgroup, 261static void update_if_frozen(struct cgroup *cgroup)
220 struct freezer *freezer)
221{ 262{
263 struct freezer *freezer = cgroup_freezer(cgroup);
264 struct cgroup *pos;
222 struct cgroup_iter it; 265 struct cgroup_iter it;
223 struct task_struct *task; 266 struct task_struct *task;
224 unsigned int nfrozen = 0, ntotal = 0;
225 enum freezer_state old_state = freezer->state;
226 267
227 cgroup_iter_start(cgroup, &it); 268 WARN_ON_ONCE(!rcu_read_lock_held());
228 while ((task = cgroup_iter_next(cgroup, &it))) { 269
229 ntotal++; 270 spin_lock_irq(&freezer->lock);
230 if (freezing(task) && is_task_frozen_enough(task)) 271
231 nfrozen++; 272 if (!(freezer->state & CGROUP_FREEZING) ||
273 (freezer->state & CGROUP_FROZEN))
274 goto out_unlock;
275
276 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) {
278 struct freezer *child = cgroup_freezer(pos);
279
280 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN))
282 goto out_unlock;
232 } 283 }
233 284
234 if (old_state == CGROUP_THAWED) { 285 /* are all tasks frozen? */
235 BUG_ON(nfrozen > 0); 286 cgroup_iter_start(cgroup, &it);
236 } else if (old_state == CGROUP_FREEZING) { 287
237 if (nfrozen == ntotal) 288 while ((task = cgroup_iter_next(cgroup, &it))) {
238 freezer->state = CGROUP_FROZEN; 289 if (freezing(task)) {
239 } else { /* old_state == CGROUP_FROZEN */ 290 /*
240 BUG_ON(nfrozen != ntotal); 291 * freezer_should_skip() indicates that the task
292 * should be skipped when determining freezing
293 * completion. Consider it frozen in addition to
294 * the usual frozen condition.
295 */
296 if (!frozen(task) && !freezer_should_skip(task))
297 goto out_iter_end;
298 }
241 } 299 }
242 300
301 freezer->state |= CGROUP_FROZEN;
302out_iter_end:
243 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
304out_unlock:
305 spin_unlock_irq(&freezer->lock);
244} 306}
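update_if_frozen() above implements the lazy FREEZING-to-FROZEN promotion described in its comment: a cgroup is marked FROZEN only once every online child is already FROZEN and every member task is frozen (or may be skipped). A compact userspace sketch of that bottom-up check over an explicit tree, with hypothetical structures standing in for cgroups and their task lists:

/* Sketch of the lazy FREEZING -> FROZEN promotion: a node becomes FROZEN
 * only when all of its children are FROZEN and all of its own member
 * tasks are frozen. Hypothetical structures, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define FREEZING	0x1
#define FROZEN		0x2

struct node {
	unsigned int state;
	bool all_tasks_frozen;		/* stand-in for the task iteration */
	struct node *children[4];
	int nr_children;
};

static void update_if_frozen(struct node *n)
{
	int i;

	if (!(n->state & FREEZING) || (n->state & FROZEN))
		return;

	for (i = 0; i < n->nr_children; i++)
		if (!(n->children[i]->state & FROZEN))
			return;

	if (n->all_tasks_frozen)
		n->state |= FROZEN;
}

int main(void)
{
	struct node child = { .state = FREEZING, .all_tasks_frozen = true };
	struct node parent = { .state = FREEZING, .all_tasks_frozen = true,
			       .children = { &child }, .nr_children = 1 };

	/* bottom-up: descendants are updated before their parent */
	update_if_frozen(&child);
	update_if_frozen(&parent);
	printf("parent frozen: %d\n", !!(parent.state & FROZEN));	/* 1 */
	return 0;
}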
245 307
246static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 308static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
247 struct seq_file *m) 309 struct seq_file *m)
248{ 310{
249 struct freezer *freezer; 311 struct cgroup *pos;
250 enum freezer_state state;
251 312
252 if (!cgroup_lock_live_group(cgroup)) 313 rcu_read_lock();
253 return -ENODEV;
254 314
255 freezer = cgroup_freezer(cgroup); 315 /* update states bottom-up */
256 spin_lock_irq(&freezer->lock); 316 cgroup_for_each_descendant_post(pos, cgroup)
257 state = freezer->state; 317 update_if_frozen(pos);
258 if (state == CGROUP_FREEZING) { 318 update_if_frozen(cgroup);
259 /* We change from FREEZING to FROZEN lazily if the cgroup was
260 * only partially frozen when we exitted write. */
261 update_if_frozen(cgroup, freezer);
262 state = freezer->state;
263 }
264 spin_unlock_irq(&freezer->lock);
265 cgroup_unlock();
266 319
267 seq_puts(m, freezer_state_strs[state]); 320 rcu_read_unlock();
321
322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state));
268 seq_putc(m, '\n'); 323 seq_putc(m, '\n');
269 return 0; 324 return 0;
270} 325}
271 326
272static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 327static void freeze_cgroup(struct freezer *freezer)
273{ 328{
329 struct cgroup *cgroup = freezer->css.cgroup;
274 struct cgroup_iter it; 330 struct cgroup_iter it;
275 struct task_struct *task; 331 struct task_struct *task;
276 unsigned int num_cant_freeze_now = 0;
277 332
278 cgroup_iter_start(cgroup, &it); 333 cgroup_iter_start(cgroup, &it);
279 while ((task = cgroup_iter_next(cgroup, &it))) { 334 while ((task = cgroup_iter_next(cgroup, &it)))
280 if (!freeze_task(task)) 335 freeze_task(task);
281 continue;
282 if (is_task_frozen_enough(task))
283 continue;
284 if (!freezing(task) && !freezer_should_skip(task))
285 num_cant_freeze_now++;
286 }
287 cgroup_iter_end(cgroup, &it); 336 cgroup_iter_end(cgroup, &it);
288
289 return num_cant_freeze_now ? -EBUSY : 0;
290} 337}
291 338
292static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 339static void unfreeze_cgroup(struct freezer *freezer)
293{ 340{
341 struct cgroup *cgroup = freezer->css.cgroup;
294 struct cgroup_iter it; 342 struct cgroup_iter it;
295 struct task_struct *task; 343 struct task_struct *task;
296 344
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
300 cgroup_iter_end(cgroup, &it); 348 cgroup_iter_end(cgroup, &it);
301} 349}
302 350
303static int freezer_change_state(struct cgroup *cgroup, 351/**
304 enum freezer_state goal_state) 352 * freezer_apply_state - apply state change to a single cgroup_freezer
353 * @freezer: freezer to apply state change to
354 * @freeze: whether to freeze or unfreeze
355 * @state: CGROUP_FREEZING_* flag to set or clear
356 *
357 * Set or clear @state on @cgroup according to @freeze, and perform
358 * freezing or thawing as necessary.
359 */
360static void freezer_apply_state(struct freezer *freezer, bool freeze,
361 unsigned int state)
305{ 362{
306 struct freezer *freezer; 363 /* also synchronizes against task migration, see freezer_attach() */
307 int retval = 0; 364 lockdep_assert_held(&freezer->lock);
308
309 freezer = cgroup_freezer(cgroup);
310 365
311 spin_lock_irq(&freezer->lock); 366 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
367 return;
312 368
313 update_if_frozen(cgroup, freezer); 369 if (freeze) {
314 370 if (!(freezer->state & CGROUP_FREEZING))
315 switch (goal_state) {
316 case CGROUP_THAWED:
317 if (freezer->state != CGROUP_THAWED)
318 atomic_dec(&system_freezing_cnt);
319 freezer->state = CGROUP_THAWED;
320 unfreeze_cgroup(cgroup, freezer);
321 break;
322 case CGROUP_FROZEN:
323 if (freezer->state == CGROUP_THAWED)
324 atomic_inc(&system_freezing_cnt); 371 atomic_inc(&system_freezing_cnt);
325 freezer->state = CGROUP_FREEZING; 372 freezer->state |= state;
326 retval = try_to_freeze_cgroup(cgroup, freezer); 373 freeze_cgroup(freezer);
327 break; 374 } else {
328 default: 375 bool was_freezing = freezer->state & CGROUP_FREEZING;
329 BUG(); 376
377 freezer->state &= ~state;
378
379 if (!(freezer->state & CGROUP_FREEZING)) {
380 if (was_freezing)
381 atomic_dec(&system_freezing_cnt);
382 freezer->state &= ~CGROUP_FROZEN;
383 unfreeze_cgroup(freezer);
384 }
330 } 385 }
386}
387
388/**
389 * freezer_change_state - change the freezing state of a cgroup_freezer
390 * @freezer: freezer of interest
391 * @freeze: whether to freeze or thaw
392 *
393 * Freeze or thaw @freezer according to @freeze. The operations are
394 * recursive - all descendants of @freezer will be affected.
395 */
396static void freezer_change_state(struct freezer *freezer, bool freeze)
397{
398 struct cgroup *pos;
331 399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
332 spin_unlock_irq(&freezer->lock); 403 spin_unlock_irq(&freezer->lock);
333 404
334 return retval; 405 /*
406 * Update all its descendants in pre-order traversal. Each
407 * descendant will try to inherit its parent's FREEZING state as
408 * CGROUP_FREEZING_PARENT.
409 */
410 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
412 struct freezer *pos_f = cgroup_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f);
414
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
 418 * synchronization, see freezer_css_online().
419 */
420 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
422 CGROUP_FREEZING_PARENT);
423 spin_unlock_irq(&pos_f->lock);
424 }
425 rcu_read_unlock();
335} 426}
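freezer_change_state() above applies the change to the target freezer first and then walks its descendants in pre-order so that each one can inherit the parent's FREEZING state as CGROUP_FREEZING_PARENT. A minimal recursive userspace sketch of that propagation; the tree layout and names are invented for illustration:

/* Sketch of the recursive FREEZING propagation: the target gets
 * FREEZING_SELF and every descendant inherits FREEZING_PARENT from its
 * parent in pre-order. Tree layout and names are invented. */
#include <stdbool.h>
#include <stdio.h>

#define FREEZING_SELF	0x1
#define FREEZING_PARENT	0x2
#define FREEZING	(FREEZING_SELF | FREEZING_PARENT)

struct node {
	const char *name;
	unsigned int state;
	struct node *children[4];
	int nr_children;
};

static void apply_state(struct node *n, bool freeze, unsigned int flag)
{
	if (freeze)
		n->state |= flag;
	else
		n->state &= ~flag;
}

static void propagate(struct node *n)		/* pre-order over descendants */
{
	int i;

	for (i = 0; i < n->nr_children; i++) {
		struct node *c = n->children[i];

		apply_state(c, n->state & FREEZING, FREEZING_PARENT);
		propagate(c);
	}
}

static void change_state(struct node *n, bool freeze)
{
	apply_state(n, freeze, FREEZING_SELF);
	propagate(n);
}

int main(void)
{
	struct node leaf = { .name = "leaf" };
	struct node mid  = { .name = "mid",  .children = { &leaf }, .nr_children = 1 };
	struct node root = { .name = "root", .children = { &mid },  .nr_children = 1 };

	change_state(&root, true);
	printf("%s=%x %s=%x %s=%x\n", root.name, root.state,
	       mid.name, mid.state, leaf.name, leaf.state);	/* root=1 mid=2 leaf=2 */
	return 0;
}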
336 427
337static int freezer_write(struct cgroup *cgroup, 428static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
338 struct cftype *cft,
339 const char *buffer) 429 const char *buffer)
340{ 430{
341 int retval; 431 bool freeze;
342 enum freezer_state goal_state;
343 432
344 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) 433 if (strcmp(buffer, freezer_state_strs(0)) == 0)
345 goal_state = CGROUP_THAWED; 434 freeze = false;
346 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) 435 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
347 goal_state = CGROUP_FROZEN; 436 freeze = true;
348 else 437 else
349 return -EINVAL; 438 return -EINVAL;
350 439
351 if (!cgroup_lock_live_group(cgroup)) 440 freezer_change_state(cgroup_freezer(cgroup), freeze);
352 return -ENODEV; 441 return 0;
353 retval = freezer_change_state(cgroup, goal_state); 442}
354 cgroup_unlock(); 443
355 return retval; 444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft)
445{
446 struct freezer *freezer = cgroup_freezer(cgroup);
447
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449}
450
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
452{
453 struct freezer *freezer = cgroup_freezer(cgroup);
454
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
356} 456}
357 457
358static struct cftype files[] = { 458static struct cftype files[] = {
@@ -362,15 +462,27 @@ static struct cftype files[] = {
362 .read_seq_string = freezer_read, 462 .read_seq_string = freezer_read,
363 .write_string = freezer_write, 463 .write_string = freezer_write,
364 }, 464 },
465 {
466 .name = "self_freezing",
467 .flags = CFTYPE_NOT_ON_ROOT,
468 .read_u64 = freezer_self_freezing_read,
469 },
470 {
471 .name = "parent_freezing",
472 .flags = CFTYPE_NOT_ON_ROOT,
473 .read_u64 = freezer_parent_freezing_read,
474 },
365 { } /* terminate */ 475 { } /* terminate */
366}; 476};
367 477
368struct cgroup_subsys freezer_subsys = { 478struct cgroup_subsys freezer_subsys = {
369 .name = "freezer", 479 .name = "freezer",
370 .create = freezer_create, 480 .css_alloc = freezer_css_alloc,
371 .destroy = freezer_destroy, 481 .css_online = freezer_css_online,
482 .css_offline = freezer_css_offline,
483 .css_free = freezer_css_free,
372 .subsys_id = freezer_subsys_id, 484 .subsys_id = freezer_subsys_id,
373 .can_attach = freezer_can_attach, 485 .attach = freezer_attach,
374 .fork = freezer_fork, 486 .fork = freezer_fork,
375 .base_cftypes = files, 487 .base_cftypes = files,
376}; 488};
diff --git a/kernel/compat.c b/kernel/compat.c
index c28a306ae05c..f6150e92dfc9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1215,6 +1215,23 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1215 return 0; 1215 return 0;
1216} 1216}
1217 1217
1218#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL
1219asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
1220 struct compat_timespec __user *interval)
1221{
1222 struct timespec t;
1223 int ret;
1224 mm_segment_t old_fs = get_fs();
1225
1226 set_fs(KERNEL_DS);
1227 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
1228 set_fs(old_fs);
1229 if (put_compat_timespec(&t, interval))
1230 return -EFAULT;
1231 return ret;
1232}
1233#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
1234
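The new compat wrapper above follows the usual pattern for 32-bit emulation: call the native syscall with a kernel-resident wide structure (under set_fs(KERNEL_DS)) and then copy the result out in the narrower compat layout. A userspace sketch of just the layout-conversion half; the struct and helper below are illustrative, not the kernel's compat definitions:

/* Sketch of converting a native timespec into a packed 32-bit "compat"
 * layout, the step put_compat_timespec() performs for the wrapper above.
 * Struct name and field widths here are illustrative, not the kernel's. */
#include <inttypes.h>
#include <stdio.h>
#include <time.h>

struct compat_timespec32 {
	int32_t tv_sec;
	int32_t tv_nsec;
};

static void fill_compat_timespec(const struct timespec *ts,
				 struct compat_timespec32 *cts)
{
	/* a real conversion would also range-check tv_sec for truncation */
	cts->tv_sec  = (int32_t)ts->tv_sec;
	cts->tv_nsec = (int32_t)ts->tv_nsec;
}

int main(void)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 10 * 1000 * 1000 };
	struct compat_timespec32 cts;

	fill_compat_timespec(&ts, &cts);
	printf("%" PRId32 ".%09" PRId32 "\n", cts.tv_sec, cts.tv_nsec);
	return 0;
}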
1218/* 1235/*
1219 * Allocate user-space memory for the duration of a single system call, 1236 * Allocate user-space memory for the duration of a single system call,
1220 * in order to marshall parameters inside a compat thunk. 1237 * in order to marshall parameters inside a compat thunk.
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
new file mode 100644
index 000000000000..e0e07fd55508
--- /dev/null
+++ b/kernel/context_tracking.c
@@ -0,0 +1,83 @@
1#include <linux/context_tracking.h>
2#include <linux/rcupdate.h>
3#include <linux/sched.h>
4#include <linux/percpu.h>
5#include <linux/hardirq.h>
6
7struct context_tracking {
8 /*
9 * When active is false, hooks are not set to
10 * minimize overhead: TIF flags are cleared
11 * and calls to user_enter/exit are ignored. This
12 * may be further optimized using static keys.
13 */
14 bool active;
15 enum {
16 IN_KERNEL = 0,
17 IN_USER,
18 } state;
19};
20
21static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
22#ifdef CONFIG_CONTEXT_TRACKING_FORCE
23 .active = true,
24#endif
25};
26
27void user_enter(void)
28{
29 unsigned long flags;
30
31 /*
 32 * Some contexts may involve an exception occurring in an irq,
33 * leading to that nesting:
34 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
35 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
36 * helpers are enough to protect RCU uses inside the exception. So
37 * just return immediately if we detect we are in an IRQ.
38 */
39 if (in_interrupt())
40 return;
41
42 WARN_ON_ONCE(!current->mm);
43
44 local_irq_save(flags);
45 if (__this_cpu_read(context_tracking.active) &&
46 __this_cpu_read(context_tracking.state) != IN_USER) {
47 __this_cpu_write(context_tracking.state, IN_USER);
48 rcu_user_enter();
49 }
50 local_irq_restore(flags);
51}
52
53void user_exit(void)
54{
55 unsigned long flags;
56
57 /*
 58 * Some contexts may involve an exception occurring in an irq,
59 * leading to that nesting:
60 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
61 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
62 * helpers are enough to protect RCU uses inside the exception. So
63 * just return immediately if we detect we are in an IRQ.
64 */
65 if (in_interrupt())
66 return;
67
68 local_irq_save(flags);
69 if (__this_cpu_read(context_tracking.state) == IN_USER) {
70 __this_cpu_write(context_tracking.state, IN_KERNEL);
71 rcu_user_exit();
72 }
73 local_irq_restore(flags);
74}
75
76void context_tracking_task_switch(struct task_struct *prev,
77 struct task_struct *next)
78{
79 if (__this_cpu_read(context_tracking.active)) {
80 clear_tsk_thread_flag(prev, TIF_NOHZ);
81 set_tsk_thread_flag(next, TIF_NOHZ);
82 }
83}
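The new context_tracking.c keeps a per-CPU flag recording whether the CPU is executing user code, so RCU can be told when it enters or leaves userspace, and both hooks return early from interrupt context because the rcu_irq_*() helpers already cover that nesting. A userspace analogue of the state machine; no RCU and no per-CPU data, and all names are illustrative:

/* Userspace analogue of the context-tracking state machine: a flag
 * flipped on user entry/exit, with calls from "interrupt" context
 * ignored. No RCU and no per-CPU data; names are illustrative. */
#include <stdbool.h>
#include <stdio.h>

static enum { IN_KERNEL = 0, IN_USER } state = IN_KERNEL;
static bool in_interrupt;		/* stand-in for in_interrupt() */

static void user_enter(void)
{
	if (in_interrupt)
		return;			/* irq nesting is handled elsewhere */
	if (state != IN_USER) {
		state = IN_USER;
		printf("-> user   (RCU may treat this CPU as quiescent)\n");
	}
}

static void user_exit(void)
{
	if (in_interrupt)
		return;
	if (state == IN_USER) {
		state = IN_KERNEL;
		printf("-> kernel (RCU must watch this CPU again)\n");
	}
}

int main(void)
{
	user_enter();
	user_enter();		/* second call is a no-op */
	in_interrupt = true;
	user_exit();		/* ignored from interrupt context */
	in_interrupt = false;
	user_exit();
	return 0;
}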
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 14d32588cccd..3046a503242c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,6 +80,10 @@ void put_online_cpus(void)
80 if (cpu_hotplug.active_writer == current) 80 if (cpu_hotplug.active_writer == current)
81 return; 81 return;
82 mutex_lock(&cpu_hotplug.lock); 82 mutex_lock(&cpu_hotplug.lock);
83
84 if (WARN_ON(!cpu_hotplug.refcount))
85 cpu_hotplug.refcount++; /* try to fix things up */
86
83 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) 87 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
84 wake_up_process(cpu_hotplug.active_writer); 88 wake_up_process(cpu_hotplug.active_writer);
85 mutex_unlock(&cpu_hotplug.lock); 89 mutex_unlock(&cpu_hotplug.lock);
@@ -280,12 +284,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
280 __func__, cpu); 284 __func__, cpu);
281 goto out_release; 285 goto out_release;
282 } 286 }
287 smpboot_park_threads(cpu);
283 288
284 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 289 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
285 if (err) { 290 if (err) {
286 /* CPU didn't die: tell everyone. Can't complain. */ 291 /* CPU didn't die: tell everyone. Can't complain. */
292 smpboot_unpark_threads(cpu);
287 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 293 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
288
289 goto out_release; 294 goto out_release;
290 } 295 }
291 BUG_ON(cpu_online(cpu)); 296 BUG_ON(cpu_online(cpu));
@@ -343,17 +348,23 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
343 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 348 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
344 struct task_struct *idle; 349 struct task_struct *idle;
345 350
346 if (cpu_online(cpu) || !cpu_present(cpu))
347 return -EINVAL;
348
349 cpu_hotplug_begin(); 351 cpu_hotplug_begin();
350 352
353 if (cpu_online(cpu) || !cpu_present(cpu)) {
354 ret = -EINVAL;
355 goto out;
356 }
357
351 idle = idle_thread_get(cpu); 358 idle = idle_thread_get(cpu);
352 if (IS_ERR(idle)) { 359 if (IS_ERR(idle)) {
353 ret = PTR_ERR(idle); 360 ret = PTR_ERR(idle);
354 goto out; 361 goto out;
355 } 362 }
356 363
364 ret = smpboot_create_threads(cpu);
365 if (ret)
366 goto out;
367
357 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 368 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
358 if (ret) { 369 if (ret) {
359 nr_calls--; 370 nr_calls--;
@@ -368,6 +379,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
368 goto out_notify; 379 goto out_notify;
369 BUG_ON(!cpu_online(cpu)); 380 BUG_ON(!cpu_online(cpu));
370 381
382 /* Wake the per cpu threads */
383 smpboot_unpark_threads(cpu);
384
371 /* Now call notifier in preparation. */ 385 /* Now call notifier in preparation. */
372 cpu_notify(CPU_ONLINE | mod, hcpu); 386 cpu_notify(CPU_ONLINE | mod, hcpu);
373 387
@@ -439,14 +453,6 @@ EXPORT_SYMBOL_GPL(cpu_up);
439#ifdef CONFIG_PM_SLEEP_SMP 453#ifdef CONFIG_PM_SLEEP_SMP
440static cpumask_var_t frozen_cpus; 454static cpumask_var_t frozen_cpus;
441 455
442void __weak arch_disable_nonboot_cpus_begin(void)
443{
444}
445
446void __weak arch_disable_nonboot_cpus_end(void)
447{
448}
449
450int disable_nonboot_cpus(void) 456int disable_nonboot_cpus(void)
451{ 457{
452 int cpu, first_cpu, error = 0; 458 int cpu, first_cpu, error = 0;
@@ -458,7 +464,6 @@ int disable_nonboot_cpus(void)
458 * with the userspace trying to use the CPU hotplug at the same time 464 * with the userspace trying to use the CPU hotplug at the same time
459 */ 465 */
460 cpumask_clear(frozen_cpus); 466 cpumask_clear(frozen_cpus);
461 arch_disable_nonboot_cpus_begin();
462 467
463 printk("Disabling non-boot CPUs ...\n"); 468 printk("Disabling non-boot CPUs ...\n");
464 for_each_online_cpu(cpu) { 469 for_each_online_cpu(cpu) {
@@ -474,8 +479,6 @@ int disable_nonboot_cpus(void)
474 } 479 }
475 } 480 }
476 481
477 arch_disable_nonboot_cpus_end();
478
479 if (!error) { 482 if (!error) {
480 BUG_ON(num_online_cpus() > 1); 483 BUG_ON(num_online_cpus() > 1);
481 /* Make sure the CPUs won't be enabled by someone else */ 484 /* Make sure the CPUs won't be enabled by someone else */
@@ -600,6 +603,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
600 603
601static int __init cpu_hotplug_pm_sync_init(void) 604static int __init cpu_hotplug_pm_sync_init(void)
602{ 605{
606 /*
607 * cpu_hotplug_pm_callback has higher priority than x86
 608 * bsp_pm_callback, which depends on cpu_hotplug_pm_callback
 609 * to disable cpu hotplug and avoid a cpu hotplug race.
610 */
603 pm_notifier(cpu_hotplug_pm_callback, 0); 611 pm_notifier(cpu_hotplug_pm_callback, 0);
604 return 0; 612 return 0;
605} 613}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c7153b6d7..7bb63eea6eb8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs,
302 * are online, with memory. If none are online with memory, walk 302 * are online, with memory. If none are online with memory, walk
303 * up the cpuset hierarchy until we find one that does have some 303 * up the cpuset hierarchy until we find one that does have some
304 * online mems. If we get all the way to the top and still haven't 304 * online mems. If we get all the way to the top and still haven't
305 * found any online mems, return node_states[N_HIGH_MEMORY]. 305 * found any online mems, return node_states[N_MEMORY].
306 * 306 *
307 * One way or another, we guarantee to return some non-empty subset 307 * One way or another, we guarantee to return some non-empty subset
308 * of node_states[N_HIGH_MEMORY]. 308 * of node_states[N_MEMORY].
309 * 309 *
310 * Call with callback_mutex held. 310 * Call with callback_mutex held.
311 */ 311 */
@@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs,
313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 314{
315 while (cs && !nodes_intersects(cs->mems_allowed, 315 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_HIGH_MEMORY])) 316 node_states[N_MEMORY]))
317 cs = cs->parent; 317 cs = cs->parent;
318 if (cs) 318 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 319 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_HIGH_MEMORY]); 320 node_states[N_MEMORY]);
321 else 321 else
322 *pmask = node_states[N_HIGH_MEMORY]; 322 *pmask = node_states[N_MEMORY];
323 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 323 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
324} 324}
325 325
326/* 326/*
@@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1100 return -ENOMEM; 1100 return -ENOMEM;
1101 1101
1102 /* 1102 /*
1103 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1103 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1104 * it's read-only 1104 * it's read-only
1105 */ 1105 */
1106 if (cs == &top_cpuset) { 1106 if (cs == &top_cpuset) {
@@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1122 goto done;
1123 1123
1124 if (!nodes_subset(trialcs->mems_allowed, 1124 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_HIGH_MEMORY])) { 1125 node_states[N_MEMORY])) {
1126 retval = -EINVAL; 1126 retval = -EINVAL;
1127 goto done; 1127 goto done;
1128 } 1128 }
@@ -1784,56 +1784,20 @@ static struct cftype files[] = {
1784}; 1784};
1785 1785
1786/* 1786/*
1787 * post_clone() is called during cgroup_create() when the 1787 * cpuset_css_alloc - allocate a cpuset css
1788 * clone_children mount argument was specified. The cgroup
1789 * can not yet have any tasks.
1790 *
1791 * Currently we refuse to set up the cgroup - thereby
1792 * refusing the task to be entered, and as a result refusing
1793 * the sys_unshare() or clone() which initiated it - if any
1794 * sibling cpusets have exclusive cpus or mem.
1795 *
1796 * If this becomes a problem for some users who wish to
1797 * allow that scenario, then cpuset_post_clone() could be
1798 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1799 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1800 * held.
1801 */
1802static void cpuset_post_clone(struct cgroup *cgroup)
1803{
1804 struct cgroup *parent, *child;
1805 struct cpuset *cs, *parent_cs;
1806
1807 parent = cgroup->parent;
1808 list_for_each_entry(child, &parent->children, sibling) {
1809 cs = cgroup_cs(child);
1810 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1811 return;
1812 }
1813 cs = cgroup_cs(cgroup);
1814 parent_cs = cgroup_cs(parent);
1815
1816 mutex_lock(&callback_mutex);
1817 cs->mems_allowed = parent_cs->mems_allowed;
1818 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1819 mutex_unlock(&callback_mutex);
1820 return;
1821}
1822
1823/*
1824 * cpuset_create - create a cpuset
1825 * cont: control group that the new cpuset will be part of 1788 * cont: control group that the new cpuset will be part of
1826 */ 1789 */
1827 1790
1828static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) 1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1829{ 1792{
1830 struct cpuset *cs; 1793 struct cgroup *parent_cg = cont->parent;
1831 struct cpuset *parent; 1794 struct cgroup *tmp_cg;
1795 struct cpuset *parent, *cs;
1832 1796
1833 if (!cont->parent) { 1797 if (!parent_cg)
1834 return &top_cpuset.css; 1798 return &top_cpuset.css;
1835 } 1799 parent = cgroup_cs(parent_cg);
1836 parent = cgroup_cs(cont->parent); 1800
1837 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1838 if (!cs) 1802 if (!cs)
1839 return ERR_PTR(-ENOMEM); 1803 return ERR_PTR(-ENOMEM);
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1855 1819
1856 cs->parent = parent; 1820 cs->parent = parent;
1857 number_of_cpusets++; 1821 number_of_cpusets++;
1858 return &cs->css ; 1822
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
1824 goto skip_clone;
1825
1826 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
1828 * set. This flag handling is implemented in cgroup core for
 1829 * historical reasons - the flag may be specified during mount.
1830 *
1831 * Currently, if any sibling cpusets have exclusive cpus or mem, we
1832 * refuse to clone the configuration - thereby refusing the task to
1833 * be entered, and as a result refusing the sys_unshare() or
1834 * clone() which initiated it. If this becomes a problem for some
1835 * users who wish to allow that scenario, then this could be
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup.
1838 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
1841
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
1843 goto skip_clone;
1844 }
1845
1846 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex);
1850skip_clone:
1851 return &cs->css;
1859} 1852}
1860 1853
1861/* 1854/*
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1864 * will call async_rebuild_sched_domains(). 1857 * will call async_rebuild_sched_domains().
1865 */ 1858 */
1866 1859
1867static void cpuset_destroy(struct cgroup *cont) 1860static void cpuset_css_free(struct cgroup *cont)
1868{ 1861{
1869 struct cpuset *cs = cgroup_cs(cont); 1862 struct cpuset *cs = cgroup_cs(cont);
1870 1863
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont)
1878 1871
1879struct cgroup_subsys cpuset_subsys = { 1872struct cgroup_subsys cpuset_subsys = {
1880 .name = "cpuset", 1873 .name = "cpuset",
1881 .create = cpuset_create, 1874 .css_alloc = cpuset_css_alloc,
1882 .destroy = cpuset_destroy, 1875 .css_free = cpuset_css_free,
1883 .can_attach = cpuset_can_attach, 1876 .can_attach = cpuset_can_attach,
1884 .attach = cpuset_attach, 1877 .attach = cpuset_attach,
1885 .post_clone = cpuset_post_clone,
1886 .subsys_id = cpuset_subsys_id, 1878 .subsys_id = cpuset_subsys_id,
1887 .base_cftypes = files, 1879 .base_cftypes = files,
1888 .early_init = 1, 1880 .early_init = 1,
@@ -2034,7 +2026,7 @@ static struct cpuset *cpuset_next(struct list_head *queue)
2034 * before dropping down to the next. It always processes a node before 2026 * before dropping down to the next. It always processes a node before
2035 * any of its children. 2027 * any of its children.
2036 * 2028 *
2037 * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY 2029 * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
2038 * if all present pages from a node are offlined. 2030 * if all present pages from a node are offlined.
2039 */ 2031 */
2040static void 2032static void
@@ -2073,7 +2065,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2073 2065
2074 /* Continue past cpusets with all mems online */ 2066 /* Continue past cpusets with all mems online */
2075 if (nodes_subset(cp->mems_allowed, 2067 if (nodes_subset(cp->mems_allowed,
2076 node_states[N_HIGH_MEMORY])) 2068 node_states[N_MEMORY]))
2077 continue; 2069 continue;
2078 2070
2079 oldmems = cp->mems_allowed; 2071 oldmems = cp->mems_allowed;
@@ -2081,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2081 /* Remove offline mems from this cpuset. */ 2073 /* Remove offline mems from this cpuset. */
2082 mutex_lock(&callback_mutex); 2074 mutex_lock(&callback_mutex);
2083 nodes_and(cp->mems_allowed, cp->mems_allowed, 2075 nodes_and(cp->mems_allowed, cp->mems_allowed,
2084 node_states[N_HIGH_MEMORY]); 2076 node_states[N_MEMORY]);
2085 mutex_unlock(&callback_mutex); 2077 mutex_unlock(&callback_mutex);
2086 2078
2087 /* Move tasks from the empty cpuset to a parent */ 2079 /* Move tasks from the empty cpuset to a parent */
@@ -2134,8 +2126,8 @@ void cpuset_update_active_cpus(bool cpu_online)
2134 2126
2135#ifdef CONFIG_MEMORY_HOTPLUG 2127#ifdef CONFIG_MEMORY_HOTPLUG
2136/* 2128/*
2137 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2129 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2138 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2130 * Call this routine anytime after node_states[N_MEMORY] changes.
2139 * See cpuset_update_active_cpus() for CPU hotplug handling. 2131 * See cpuset_update_active_cpus() for CPU hotplug handling.
2140 */ 2132 */
2141static int cpuset_track_online_nodes(struct notifier_block *self, 2133static int cpuset_track_online_nodes(struct notifier_block *self,
@@ -2148,7 +2140,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2148 case MEM_ONLINE: 2140 case MEM_ONLINE:
2149 oldmems = top_cpuset.mems_allowed; 2141 oldmems = top_cpuset.mems_allowed;
2150 mutex_lock(&callback_mutex); 2142 mutex_lock(&callback_mutex);
2151 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2143 top_cpuset.mems_allowed = node_states[N_MEMORY];
2152 mutex_unlock(&callback_mutex); 2144 mutex_unlock(&callback_mutex);
2153 update_tasks_nodemask(&top_cpuset, &oldmems, NULL); 2145 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2154 break; 2146 break;
@@ -2177,7 +2169,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2177void __init cpuset_init_smp(void) 2169void __init cpuset_init_smp(void)
2178{ 2170{
2179 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2171 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2180 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2172 top_cpuset.mems_allowed = node_states[N_MEMORY];
2181 2173
2182 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2174 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2183 2175
@@ -2245,7 +2237,7 @@ void cpuset_init_current_mems_allowed(void)
2245 * 2237 *
2246 * Description: Returns the nodemask_t mems_allowed of the cpuset 2238 * Description: Returns the nodemask_t mems_allowed of the cpuset
2247 * attached to the specified @tsk. Guaranteed to return some non-empty 2239 * attached to the specified @tsk. Guaranteed to return some non-empty
2248 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the 2240 * subset of node_states[N_MEMORY], even if this means going outside the
2249 * tasks cpuset. 2241 * tasks cpuset.
2250 **/ 2242 **/
2251 2243
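
Aside: the cpuset hunk above repeatedly swaps node_states[N_HIGH_MEMORY] for node_states[N_MEMORY] while keeping the same fallback logic in guarantee_online_mems(): climb toward the root until the cpuset's mems_allowed intersects the nodes that actually have memory. A condensed user-space sketch of that pattern follows; the toy struct, the 64-node bitmask and all names are illustrative assumptions, not the kernel's nodemask API.

/* Minimal user-space sketch of the guarantee_online_mems() pattern:
 * walk up the cpuset hierarchy until mems_allowed intersects the set
 * of nodes that actually have memory, falling back to that set itself.
 * The struct and the 64-node bitmask are illustrative assumptions. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct toy_cpuset {
	uint64_t mems_allowed;          /* bitmask of allowed NUMA nodes */
	struct toy_cpuset *parent;
};

static uint64_t toy_guarantee_online_mems(const struct toy_cpuset *cs,
					  uint64_t nodes_with_memory)
{
	while (cs && !(cs->mems_allowed & nodes_with_memory))
		cs = cs->parent;                 /* climb toward the root */
	if (cs)
		return cs->mems_allowed & nodes_with_memory;
	return nodes_with_memory;                /* top-level fallback */
}

int main(void)
{
	struct toy_cpuset root = { .mems_allowed = 0xf, .parent = NULL };
	struct toy_cpuset child = { .mems_allowed = 0x8, .parent = &root };
	uint64_t online = 0x3;                   /* only nodes 0 and 1 online */

	/* child's own mask has no online node, so the parent's is used */
	uint64_t got = toy_guarantee_online_mems(&child, online);
	assert(got == 0x3);
	printf("effective mems: 0x%llx\n", (unsigned long long)got);
	return 0;
}
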
diff --git a/kernel/cred.c b/kernel/cred.c
index de728ac50d82..e0573a43c7df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -30,17 +30,6 @@
30static struct kmem_cache *cred_jar; 30static struct kmem_cache *cred_jar;
31 31
32/* 32/*
33 * The common credentials for the initial task's thread group
34 */
35#ifdef CONFIG_KEYS
36static struct thread_group_cred init_tgcred = {
37 .usage = ATOMIC_INIT(2),
38 .tgid = 0,
39 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
40};
41#endif
42
43/*
44 * The initial credentials for the initial task 33 * The initial credentials for the initial task
45 */ 34 */
46struct cred init_cred = { 35struct cred init_cred = {
@@ -65,9 +54,6 @@ struct cred init_cred = {
65 .user = INIT_USER, 54 .user = INIT_USER,
66 .user_ns = &init_user_ns, 55 .user_ns = &init_user_ns,
67 .group_info = &init_groups, 56 .group_info = &init_groups,
68#ifdef CONFIG_KEYS
69 .tgcred = &init_tgcred,
70#endif
71}; 57};
72 58
73static inline void set_cred_subscribers(struct cred *cred, int n) 59static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
96} 82}
97 83
98/* 84/*
99 * Dispose of the shared task group credentials
100 */
101#ifdef CONFIG_KEYS
102static void release_tgcred_rcu(struct rcu_head *rcu)
103{
104 struct thread_group_cred *tgcred =
105 container_of(rcu, struct thread_group_cred, rcu);
106
107 BUG_ON(atomic_read(&tgcred->usage) != 0);
108
109 key_put(tgcred->session_keyring);
110 key_put(tgcred->process_keyring);
111 kfree(tgcred);
112}
113#endif
114
115/*
116 * Release a set of thread group credentials.
117 */
118static void release_tgcred(struct cred *cred)
119{
120#ifdef CONFIG_KEYS
121 struct thread_group_cred *tgcred = cred->tgcred;
122
123 if (atomic_dec_and_test(&tgcred->usage))
124 call_rcu(&tgcred->rcu, release_tgcred_rcu);
125#endif
126}
127
128/*
129 * The RCU callback to actually dispose of a set of credentials 85 * The RCU callback to actually dispose of a set of credentials
130 */ 86 */
131static void put_cred_rcu(struct rcu_head *rcu) 87static void put_cred_rcu(struct rcu_head *rcu)
@@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
150#endif 106#endif
151 107
152 security_cred_free(cred); 108 security_cred_free(cred);
109 key_put(cred->session_keyring);
110 key_put(cred->process_keyring);
153 key_put(cred->thread_keyring); 111 key_put(cred->thread_keyring);
154 key_put(cred->request_key_auth); 112 key_put(cred->request_key_auth);
155 release_tgcred(cred);
156 if (cred->group_info) 113 if (cred->group_info)
157 put_group_info(cred->group_info); 114 put_group_info(cred->group_info);
158 free_uid(cred->user); 115 free_uid(cred->user);
@@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void)
246 if (!new) 203 if (!new)
247 return NULL; 204 return NULL;
248 205
249#ifdef CONFIG_KEYS
250 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
251 if (!new->tgcred) {
252 kmem_cache_free(cred_jar, new);
253 return NULL;
254 }
255 atomic_set(&new->tgcred->usage, 1);
256#endif
257
258 atomic_set(&new->usage, 1); 206 atomic_set(&new->usage, 1);
259#ifdef CONFIG_DEBUG_CREDENTIALS 207#ifdef CONFIG_DEBUG_CREDENTIALS
260 new->magic = CRED_MAGIC; 208 new->magic = CRED_MAGIC;
@@ -308,9 +256,10 @@ struct cred *prepare_creds(void)
308 get_user_ns(new->user_ns); 256 get_user_ns(new->user_ns);
309 257
310#ifdef CONFIG_KEYS 258#ifdef CONFIG_KEYS
259 key_get(new->session_keyring);
260 key_get(new->process_keyring);
311 key_get(new->thread_keyring); 261 key_get(new->thread_keyring);
312 key_get(new->request_key_auth); 262 key_get(new->request_key_auth);
313 atomic_inc(&new->tgcred->usage);
314#endif 263#endif
315 264
316#ifdef CONFIG_SECURITY 265#ifdef CONFIG_SECURITY
@@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds);
334 */ 283 */
335struct cred *prepare_exec_creds(void) 284struct cred *prepare_exec_creds(void)
336{ 285{
337 struct thread_group_cred *tgcred = NULL;
338 struct cred *new; 286 struct cred *new;
339 287
340#ifdef CONFIG_KEYS
341 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
342 if (!tgcred)
343 return NULL;
344#endif
345
346 new = prepare_creds(); 288 new = prepare_creds();
347 if (!new) { 289 if (!new)
348 kfree(tgcred);
349 return new; 290 return new;
350 }
351 291
352#ifdef CONFIG_KEYS 292#ifdef CONFIG_KEYS
353 /* newly exec'd tasks don't get a thread keyring */ 293 /* newly exec'd tasks don't get a thread keyring */
354 key_put(new->thread_keyring); 294 key_put(new->thread_keyring);
355 new->thread_keyring = NULL; 295 new->thread_keyring = NULL;
356 296
357 /* create a new per-thread-group creds for all this set of threads to
358 * share */
359 memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
360
361 atomic_set(&tgcred->usage, 1);
362 spin_lock_init(&tgcred->lock);
363
364 /* inherit the session keyring; new process keyring */ 297 /* inherit the session keyring; new process keyring */
365 key_get(tgcred->session_keyring); 298 key_put(new->process_keyring);
366 tgcred->process_keyring = NULL; 299 new->process_keyring = NULL;
367
368 release_tgcred(new);
369 new->tgcred = tgcred;
370#endif 300#endif
371 301
372 return new; 302 return new;
@@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void)
383 */ 313 */
384int copy_creds(struct task_struct *p, unsigned long clone_flags) 314int copy_creds(struct task_struct *p, unsigned long clone_flags)
385{ 315{
386#ifdef CONFIG_KEYS
387 struct thread_group_cred *tgcred;
388#endif
389 struct cred *new; 316 struct cred *new;
390 int ret; 317 int ret;
391 318
@@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
425 install_thread_keyring_to_cred(new); 352 install_thread_keyring_to_cred(new);
426 } 353 }
427 354
428 /* we share the process and session keyrings between all the threads in 355 /* The process keyring is only shared between the threads in a process;
429 * a process - this is slightly icky as we violate COW credentials a 356 * anything outside of those threads doesn't inherit.
430 * bit */ 357 */
431 if (!(clone_flags & CLONE_THREAD)) { 358 if (!(clone_flags & CLONE_THREAD)) {
432 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); 359 key_put(new->process_keyring);
433 if (!tgcred) { 360 new->process_keyring = NULL;
434 ret = -ENOMEM;
435 goto error_put;
436 }
437 atomic_set(&tgcred->usage, 1);
438 spin_lock_init(&tgcred->lock);
439 tgcred->process_keyring = NULL;
440 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
441
442 release_tgcred(new);
443 new->tgcred = tgcred;
444 } 361 }
445#endif 362#endif
446 363
@@ -455,6 +372,31 @@ error_put:
455 return ret; 372 return ret;
456} 373}
457 374
375static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
376{
377 const struct user_namespace *set_ns = set->user_ns;
378 const struct user_namespace *subset_ns = subset->user_ns;
379
380 /* If the two credentials are in the same user namespace see if
381 * the capabilities of subset are a subset of set.
382 */
383 if (set_ns == subset_ns)
384 return cap_issubset(subset->cap_permitted, set->cap_permitted);
385
 386 /* The credentials are in different user namespaces;
 387 * therefore one is a subset of the other only if set is an
 388 * ancestor of subset and set->euid is the owner of subset or one
 389 * of subset's ancestors.
390 */
391 for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
392 if ((set_ns == subset_ns->parent) &&
393 uid_eq(subset_ns->owner, set->euid))
394 return true;
395 }
396
397 return false;
398}
399
458/** 400/**
459 * commit_creds - Install new credentials upon the current task 401 * commit_creds - Install new credentials upon the current task
460 * @new: The credentials to be assigned 402 * @new: The credentials to be assigned
@@ -493,7 +435,7 @@ int commit_creds(struct cred *new)
493 !gid_eq(old->egid, new->egid) || 435 !gid_eq(old->egid, new->egid) ||
494 !uid_eq(old->fsuid, new->fsuid) || 436 !uid_eq(old->fsuid, new->fsuid) ||
495 !gid_eq(old->fsgid, new->fsgid) || 437 !gid_eq(old->fsgid, new->fsgid) ||
496 !cap_issubset(new->cap_permitted, old->cap_permitted)) { 438 !cred_cap_issubset(old, new)) {
497 if (task->mm) 439 if (task->mm)
498 set_dumpable(task->mm, suid_dumpable); 440 set_dumpable(task->mm, suid_dumpable);
499 task->pdeath_signal = 0; 441 task->pdeath_signal = 0;
@@ -643,9 +585,6 @@ void __init cred_init(void)
643 */ 585 */
644struct cred *prepare_kernel_cred(struct task_struct *daemon) 586struct cred *prepare_kernel_cred(struct task_struct *daemon)
645{ 587{
646#ifdef CONFIG_KEYS
647 struct thread_group_cred *tgcred;
648#endif
649 const struct cred *old; 588 const struct cred *old;
650 struct cred *new; 589 struct cred *new;
651 590
@@ -653,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
653 if (!new) 592 if (!new)
654 return NULL; 593 return NULL;
655 594
656#ifdef CONFIG_KEYS
657 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
658 if (!tgcred) {
659 kmem_cache_free(cred_jar, new);
660 return NULL;
661 }
662#endif
663
664 kdebug("prepare_kernel_cred() alloc %p", new); 595 kdebug("prepare_kernel_cred() alloc %p", new);
665 596
666 if (daemon) 597 if (daemon)
@@ -678,13 +609,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
678 get_group_info(new->group_info); 609 get_group_info(new->group_info);
679 610
680#ifdef CONFIG_KEYS 611#ifdef CONFIG_KEYS
681 atomic_set(&tgcred->usage, 1); 612 new->session_keyring = NULL;
682 spin_lock_init(&tgcred->lock); 613 new->process_keyring = NULL;
683 tgcred->process_keyring = NULL;
684 tgcred->session_keyring = NULL;
685 new->tgcred = tgcred;
686 new->request_key_auth = NULL;
687 new->thread_keyring = NULL; 614 new->thread_keyring = NULL;
615 new->request_key_auth = NULL;
688 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 616 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
689#endif 617#endif
690 618
@@ -799,9 +727,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
799 atomic_read(&cred->usage), 727 atomic_read(&cred->usage),
800 read_cred_subscribers(cred)); 728 read_cred_subscribers(cred));
801 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", 729 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
802 cred->uid, cred->euid, cred->suid, cred->fsuid); 730 from_kuid_munged(&init_user_ns, cred->uid),
731 from_kuid_munged(&init_user_ns, cred->euid),
732 from_kuid_munged(&init_user_ns, cred->suid),
733 from_kuid_munged(&init_user_ns, cred->fsuid));
803 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", 734 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
804 cred->gid, cred->egid, cred->sgid, cred->fsgid); 735 from_kgid_munged(&init_user_ns, cred->gid),
736 from_kgid_munged(&init_user_ns, cred->egid),
737 from_kgid_munged(&init_user_ns, cred->sgid),
738 from_kgid_munged(&init_user_ns, cred->fsgid));
805#ifdef CONFIG_SECURITY 739#ifdef CONFIG_SECURITY
806 printk(KERN_ERR "CRED: ->security is %p\n", cred->security); 740 printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
807 if ((unsigned long) cred->security >= PAGE_SIZE && 741 if ((unsigned long) cred->security >= PAGE_SIZE &&
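
Aside: besides folding the thread-group keyrings into struct cred, the behavioural change in the cred.c hunk is cred_cap_issubset(), which only treats capabilities as a subset across user namespaces when the "set" namespace is the parent of one of "subset"'s ancestors and set's euid owns that namespace. A user-space sketch of that check follows; the toy_cred/toy_userns types and uid fields are simplified stand-ins, not the kernel structures.

/* User-space sketch of the cred_cap_issubset() check added above: in the
 * same namespace compare capability masks; across namespaces, "subset"
 * only holds when set's namespace is an ancestor owned by set's euid. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_userns {
	struct toy_userns *parent;      /* NULL for the initial namespace */
	uint32_t owner_uid;             /* euid that created the namespace */
};

struct toy_cred {
	struct toy_userns *user_ns;
	uint64_t cap_permitted;         /* capability bitmask */
	uint32_t euid;
};

static bool toy_cred_cap_issubset(const struct toy_cred *set,
				  const struct toy_cred *subset)
{
	const struct toy_userns *sub_ns = subset->user_ns;

	if (set->user_ns == subset->user_ns)
		return (subset->cap_permitted & ~set->cap_permitted) == 0;

	for (; sub_ns->parent; sub_ns = sub_ns->parent) {
		if (set->user_ns == sub_ns->parent &&
		    sub_ns->owner_uid == set->euid)
			return true;
	}
	return false;
}

int main(void)
{
	struct toy_userns init_ns = { .parent = NULL, .owner_uid = 0 };
	struct toy_userns child_ns = { .parent = &init_ns, .owner_uid = 1000 };
	struct toy_cred outer = { &init_ns, 0x3, 1000 };
	struct toy_cred inner = { &child_ns, ~0ULL, 0 };

	/* full caps inside a namespace owned by outer's euid still count
	 * as a subset of outer's credentials */
	printf("subset: %d\n", toy_cred_cap_issubset(&outer, &inner));
	return 0;
}
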
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0557f24c6bca..9a61738cefc8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -672,6 +672,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
672{ 672{
673 struct kgdb_state kgdb_var; 673 struct kgdb_state kgdb_var;
674 struct kgdb_state *ks = &kgdb_var; 674 struct kgdb_state *ks = &kgdb_var;
675 int ret = 0;
676
677 if (arch_kgdb_ops.enable_nmi)
678 arch_kgdb_ops.enable_nmi(0);
675 679
676 ks->cpu = raw_smp_processor_id(); 680 ks->cpu = raw_smp_processor_id();
677 ks->ex_vector = evector; 681 ks->ex_vector = evector;
@@ -681,13 +685,33 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
681 ks->linux_regs = regs; 685 ks->linux_regs = regs;
682 686
683 if (kgdb_reenter_check(ks)) 687 if (kgdb_reenter_check(ks))
684 return 0; /* Ouch, double exception ! */ 688 goto out; /* Ouch, double exception ! */
685 if (kgdb_info[ks->cpu].enter_kgdb != 0) 689 if (kgdb_info[ks->cpu].enter_kgdb != 0)
686 return 0; 690 goto out;
687 691
688 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); 692 ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
693out:
694 if (arch_kgdb_ops.enable_nmi)
695 arch_kgdb_ops.enable_nmi(1);
696 return ret;
689} 697}
690 698
699/*
 700 * GDB places a breakpoint at this function to learn about dynamically
701 * loaded objects. It's not defined static so that only one instance with this
702 * name exists in the kernel.
703 */
704
705static int module_event(struct notifier_block *self, unsigned long val,
706 void *data)
707{
708 return 0;
709}
710
711static struct notifier_block dbg_module_load_nb = {
712 .notifier_call = module_event,
713};
714
691int kgdb_nmicallback(int cpu, void *regs) 715int kgdb_nmicallback(int cpu, void *regs)
692{ 716{
693#ifdef CONFIG_SMP 717#ifdef CONFIG_SMP
@@ -816,6 +840,7 @@ static void kgdb_register_callbacks(void)
816 kgdb_arch_init(); 840 kgdb_arch_init();
817 if (!dbg_is_early) 841 if (!dbg_is_early)
818 kgdb_arch_late(); 842 kgdb_arch_late();
843 register_module_notifier(&dbg_module_load_nb);
819 register_reboot_notifier(&dbg_reboot_notifier); 844 register_reboot_notifier(&dbg_reboot_notifier);
820 atomic_notifier_chain_register(&panic_notifier_list, 845 atomic_notifier_chain_register(&panic_notifier_list,
821 &kgdb_panic_event_nb); 846 &kgdb_panic_event_nb);
@@ -839,6 +864,7 @@ static void kgdb_unregister_callbacks(void)
839 if (kgdb_io_module_registered) { 864 if (kgdb_io_module_registered) {
840 kgdb_io_module_registered = 0; 865 kgdb_io_module_registered = 0;
841 unregister_reboot_notifier(&dbg_reboot_notifier); 866 unregister_reboot_notifier(&dbg_reboot_notifier);
867 unregister_module_notifier(&dbg_module_load_nb);
842 atomic_notifier_chain_unregister(&panic_notifier_list, 868 atomic_notifier_chain_unregister(&panic_notifier_list,
843 &kgdb_panic_event_nb); 869 &kgdb_panic_event_nb);
844 kgdb_arch_exit(); 870 kgdb_arch_exit();
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 07c9bbb94a0b..b03e0e814e43 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -129,6 +129,8 @@ kdb_bt(int argc, const char **argv)
129 } 129 }
130 /* Now the inactive tasks */ 130 /* Now the inactive tasks */
131 kdb_do_each_thread(g, p) { 131 kdb_do_each_thread(g, p) {
132 if (KDB_FLAG(CMD_INTERRUPT))
133 return 0;
132 if (task_curr(p)) 134 if (task_curr(p))
133 continue; 135 continue;
134 if (kdb_bt1(p, mask, argcount, btaprompt)) 136 if (kdb_bt1(p, mask, argcount, btaprompt))
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 0a69d2adc4f3..14ff4849262c 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap)
552{ 552{
553 int diag; 553 int diag;
554 int linecount; 554 int linecount;
555 int colcount;
555 int logging, saved_loglevel = 0; 556 int logging, saved_loglevel = 0;
556 int saved_trap_printk; 557 int saved_trap_printk;
557 int got_printf_lock = 0; 558 int got_printf_lock = 0;
@@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap)
584 if (diag || linecount <= 1) 585 if (diag || linecount <= 1)
585 linecount = 24; 586 linecount = 24;
586 587
588 diag = kdbgetintenv("COLUMNS", &colcount);
589 if (diag || colcount <= 1)
590 colcount = 80;
591
587 diag = kdbgetintenv("LOGGING", &logging); 592 diag = kdbgetintenv("LOGGING", &logging);
588 if (diag) 593 if (diag)
589 logging = 0; 594 logging = 0;
@@ -690,7 +695,7 @@ kdb_printit:
690 gdbstub_msg_write(kdb_buffer, retlen); 695 gdbstub_msg_write(kdb_buffer, retlen);
691 } else { 696 } else {
692 if (dbg_io_ops && !dbg_io_ops->is_console) { 697 if (dbg_io_ops && !dbg_io_ops->is_console) {
693 len = strlen(kdb_buffer); 698 len = retlen;
694 cp = kdb_buffer; 699 cp = kdb_buffer;
695 while (len--) { 700 while (len--) {
696 dbg_io_ops->write_char(*cp); 701 dbg_io_ops->write_char(*cp);
@@ -709,11 +714,29 @@ kdb_printit:
709 printk(KERN_INFO "%s", kdb_buffer); 714 printk(KERN_INFO "%s", kdb_buffer);
710 } 715 }
711 716
712 if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) 717 if (KDB_STATE(PAGER)) {
713 kdb_nextline++; 718 /*
719 * Check printed string to decide how to bump the
720 * kdb_nextline to control when the more prompt should
721 * show up.
722 */
723 int got = 0;
724 len = retlen;
725 while (len--) {
726 if (kdb_buffer[len] == '\n') {
727 kdb_nextline++;
728 got = 0;
729 } else if (kdb_buffer[len] == '\r') {
730 got = 0;
731 } else {
732 got++;
733 }
734 }
735 kdb_nextline += got / (colcount + 1);
736 }
714 737
715 /* check for having reached the LINES number of printed lines */ 738 /* check for having reached the LINES number of printed lines */
716 if (kdb_nextline == linecount) { 739 if (kdb_nextline >= linecount) {
717 char buf1[16] = ""; 740 char buf1[16] = "";
718 741
719 /* Watch out for recursion here. Any routine that calls 742 /* Watch out for recursion here. Any routine that calls
@@ -765,7 +788,7 @@ kdb_printit:
765 kdb_grepping_flag = 0; 788 kdb_grepping_flag = 0;
766 kdb_printf("\n"); 789 kdb_printf("\n");
767 } else if (buf1[0] == ' ') { 790 } else if (buf1[0] == ' ') {
768 kdb_printf("\n"); 791 kdb_printf("\r");
769 suspend_grep = 1; /* for this recursion */ 792 suspend_grep = 1; /* for this recursion */
770 } else if (buf1[0] == '\n') { 793 } else if (buf1[0] == '\n') {
771 kdb_nextline = linecount - 1; 794 kdb_nextline = linecount - 1;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 31df1706b9a9..4d5f8d5612f3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -21,6 +21,7 @@
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/utsname.h> 22#include <linux/utsname.h>
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/atomic.h>
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/mm.h> 26#include <linux/mm.h>
26#include <linux/init.h> 27#include <linux/init.h>
@@ -2100,6 +2101,8 @@ static int kdb_dmesg(int argc, const char **argv)
2100 } 2101 }
2101 if (!lines--) 2102 if (!lines--)
2102 break; 2103 break;
2104 if (KDB_FLAG(CMD_INTERRUPT))
2105 return 0;
2103 2106
2104 kdb_printf("%.*s\n", (int)len - 1, buf); 2107 kdb_printf("%.*s\n", (int)len - 1, buf);
2105 } 2108 }
@@ -2107,6 +2110,32 @@ static int kdb_dmesg(int argc, const char **argv)
2107 return 0; 2110 return 0;
2108} 2111}
2109#endif /* CONFIG_PRINTK */ 2112#endif /* CONFIG_PRINTK */
2113
2114/* Make sure we balance enable/disable calls, must disable first. */
2115static atomic_t kdb_nmi_disabled;
2116
2117static int kdb_disable_nmi(int argc, const char *argv[])
2118{
2119 if (atomic_read(&kdb_nmi_disabled))
2120 return 0;
2121 atomic_set(&kdb_nmi_disabled, 1);
2122 arch_kgdb_ops.enable_nmi(0);
2123 return 0;
2124}
2125
2126static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
2127{
2128 if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
2129 return -EINVAL;
2130 arch_kgdb_ops.enable_nmi(1);
2131 return 0;
2132}
2133
2134static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
2135 .set = kdb_param_enable_nmi,
2136};
2137module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
2138
2110/* 2139/*
2111 * kdb_cpu - This function implements the 'cpu' command. 2140 * kdb_cpu - This function implements the 'cpu' command.
2112 * cpu [<cpunum>] 2141 * cpu [<cpunum>]
@@ -2851,6 +2880,10 @@ static void __init kdb_inittab(void)
2851 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", 2880 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
2852 "Display syslog buffer", 0, KDB_REPEAT_NONE); 2881 "Display syslog buffer", 0, KDB_REPEAT_NONE);
2853#endif 2882#endif
2883 if (arch_kgdb_ops.enable_nmi) {
2884 kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
2885 "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
2886 }
2854 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", 2887 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2855 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); 2888 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
2856 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", 2889 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
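
Aside: after the kdb_main.c hunk, enable_nmi stays balanced: disabling is idempotent and enabling is refused unless a disable is outstanding. The sketch below reproduces that 0/1 discipline with C11 atomics instead of the kernel's atomic_t helpers; the kernel uses atomic_add_unless(), and the compare-and-swap here is an equivalent stand-in for the single-level case.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int nmi_disabled = 0;

static int toy_disable_nmi(void)
{
	int expected = 0;

	/* already disabled: nothing to do, stay balanced */
	if (!atomic_compare_exchange_strong(&nmi_disabled, &expected, 1))
		return 0;
	puts("arch hook: NMI entry disabled");
	return 0;
}

static int toy_enable_nmi(void)
{
	int expected = 1;

	/* refuse to enable unless a disable is actually outstanding */
	if (!atomic_compare_exchange_strong(&nmi_disabled, &expected, 0))
		return -1;                     /* -EINVAL in the kernel */
	puts("arch hook: NMI entry enabled");
	return 0;
}

int main(void)
{
	toy_disable_nmi();
	toy_disable_nmi();                     /* no-op, still balanced */
	printf("enable #1 -> %d\n", toy_enable_nmi());
	printf("enable #2 -> %d\n", toy_enable_nmi());  /* rejected */
	return 0;
}
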
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 98d4597f43d6..c77206184b8b 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -159,6 +159,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
159 int rctx; 159 int rctx;
160 struct perf_callchain_entry *entry; 160 struct perf_callchain_entry *entry;
161 161
162 int kernel = !event->attr.exclude_callchain_kernel;
163 int user = !event->attr.exclude_callchain_user;
164
165 if (!kernel && !user)
166 return NULL;
162 167
163 entry = get_callchain_entry(&rctx); 168 entry = get_callchain_entry(&rctx);
164 if (rctx == -1) 169 if (rctx == -1)
@@ -169,24 +174,29 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
169 174
170 entry->nr = 0; 175 entry->nr = 0;
171 176
172 if (!user_mode(regs)) { 177 if (kernel && !user_mode(regs)) {
173 perf_callchain_store(entry, PERF_CONTEXT_KERNEL); 178 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
174 perf_callchain_kernel(entry, regs); 179 perf_callchain_kernel(entry, regs);
175 if (current->mm)
176 regs = task_pt_regs(current);
177 else
178 regs = NULL;
179 } 180 }
180 181
181 if (regs) { 182 if (user) {
182 /* 183 if (!user_mode(regs)) {
183 * Disallow cross-task user callchains. 184 if (current->mm)
184 */ 185 regs = task_pt_regs(current);
185 if (event->ctx->task && event->ctx->task != current) 186 else
186 goto exit_put; 187 regs = NULL;
187 188 }
188 perf_callchain_store(entry, PERF_CONTEXT_USER); 189
189 perf_callchain_user(entry, regs); 190 if (regs) {
191 /*
192 * Disallow cross-task user callchains.
193 */
194 if (event->ctx->task && event->ctx->task != current)
195 goto exit_put;
196
197 perf_callchain_store(entry, PERF_CONTEXT_USER);
198 perf_callchain_user(entry, regs);
199 }
190 } 200 }
191 201
192exit_put: 202exit_put:
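
Aside: the callchain.c hunk introduces independent exclude_callchain_kernel / exclude_callchain_user controls and bails out early when both parts are excluded. The toy routine below only illustrates that gating; the real code additionally re-derives user registers via task_pt_regs() and refuses cross-task user unwinds, which this sketch leaves out.

#include <stdbool.h>
#include <stdio.h>

struct toy_attr {
	bool exclude_callchain_kernel;
	bool exclude_callchain_user;
};

static const char *toy_callchain(const struct toy_attr *attr, bool in_kernel)
{
	bool kernel = !attr->exclude_callchain_kernel;
	bool user = !attr->exclude_callchain_user;

	if (!kernel && !user)
		return "no callchain requested";      /* early bail-out */
	if (kernel && in_kernel)
		return user ? "kernel + user frames" : "kernel frames only";
	return user ? "user frames only" : "no frames for this sample";
}

int main(void)
{
	struct toy_attr user_only = { .exclude_callchain_kernel = true };

	printf("%s\n", toy_callchain(&user_only, true));   /* user frames only */
	printf("%s\n", toy_callchain(&user_only, false));  /* user frames only */
	return 0;
}
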
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7fee567153f0..301079d06f24 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,7 @@
36#include <linux/perf_event.h> 36#include <linux/perf_event.h>
37#include <linux/ftrace_event.h> 37#include <linux/ftrace_event.h>
38#include <linux/hw_breakpoint.h> 38#include <linux/hw_breakpoint.h>
39#include <linux/mm_types.h>
39 40
40#include "internal.h" 41#include "internal.h"
41 42
@@ -371,6 +372,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
371 372
372 list_for_each_entry_rcu(pmu, &pmus, entry) { 373 list_for_each_entry_rcu(pmu, &pmus, entry) {
373 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 374 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
375 if (cpuctx->unique_pmu != pmu)
376 continue; /* ensure we process each cpuctx once */
374 377
375 /* 378 /*
376 * perf_cgroup_events says at least one 379 * perf_cgroup_events says at least one
@@ -394,9 +397,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
394 397
395 if (mode & PERF_CGROUP_SWIN) { 398 if (mode & PERF_CGROUP_SWIN) {
396 WARN_ON_ONCE(cpuctx->cgrp); 399 WARN_ON_ONCE(cpuctx->cgrp);
397 /* set cgrp before ctxsw in to 400 /*
398 * allow event_filter_match() to not 401 * set cgrp before ctxsw in to allow
399 * have to pass task around 402 * event_filter_match() to not have to pass
403 * task around
400 */ 404 */
401 cpuctx->cgrp = perf_cgroup_from_task(task); 405 cpuctx->cgrp = perf_cgroup_from_task(task);
402 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 406 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
@@ -467,14 +471,13 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
467{ 471{
468 struct perf_cgroup *cgrp; 472 struct perf_cgroup *cgrp;
469 struct cgroup_subsys_state *css; 473 struct cgroup_subsys_state *css;
470 struct file *file; 474 struct fd f = fdget(fd);
471 int ret = 0, fput_needed; 475 int ret = 0;
472 476
473 file = fget_light(fd, &fput_needed); 477 if (!f.file)
474 if (!file)
475 return -EBADF; 478 return -EBADF;
476 479
477 css = cgroup_css_from_dir(file, perf_subsys_id); 480 css = cgroup_css_from_dir(f.file, perf_subsys_id);
478 if (IS_ERR(css)) { 481 if (IS_ERR(css)) {
479 ret = PTR_ERR(css); 482 ret = PTR_ERR(css);
480 goto out; 483 goto out;
@@ -500,7 +503,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
500 ret = -EINVAL; 503 ret = -EINVAL;
501 } 504 }
502out: 505out:
503 fput_light(file, fput_needed); 506 fdput(f);
504 return ret; 507 return ret;
505} 508}
506 509
@@ -3233,21 +3236,18 @@ unlock:
3233 3236
3234static const struct file_operations perf_fops; 3237static const struct file_operations perf_fops;
3235 3238
3236static struct file *perf_fget_light(int fd, int *fput_needed) 3239static inline int perf_fget_light(int fd, struct fd *p)
3237{ 3240{
3238 struct file *file; 3241 struct fd f = fdget(fd);
3239 3242 if (!f.file)
3240 file = fget_light(fd, fput_needed); 3243 return -EBADF;
3241 if (!file)
3242 return ERR_PTR(-EBADF);
3243 3244
3244 if (file->f_op != &perf_fops) { 3245 if (f.file->f_op != &perf_fops) {
3245 fput_light(file, *fput_needed); 3246 fdput(f);
3246 *fput_needed = 0; 3247 return -EBADF;
3247 return ERR_PTR(-EBADF);
3248 } 3248 }
3249 3249 *p = f;
3250 return file; 3250 return 0;
3251} 3251}
3252 3252
3253static int perf_event_set_output(struct perf_event *event, 3253static int perf_event_set_output(struct perf_event *event,
@@ -3279,22 +3279,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3279 3279
3280 case PERF_EVENT_IOC_SET_OUTPUT: 3280 case PERF_EVENT_IOC_SET_OUTPUT:
3281 { 3281 {
3282 struct file *output_file = NULL;
3283 struct perf_event *output_event = NULL;
3284 int fput_needed = 0;
3285 int ret; 3282 int ret;
3286
3287 if (arg != -1) { 3283 if (arg != -1) {
3288 output_file = perf_fget_light(arg, &fput_needed); 3284 struct perf_event *output_event;
3289 if (IS_ERR(output_file)) 3285 struct fd output;
3290 return PTR_ERR(output_file); 3286 ret = perf_fget_light(arg, &output);
3291 output_event = output_file->private_data; 3287 if (ret)
3288 return ret;
3289 output_event = output.file->private_data;
3290 ret = perf_event_set_output(event, output_event);
3291 fdput(output);
3292 } else {
3293 ret = perf_event_set_output(event, NULL);
3292 } 3294 }
3293
3294 ret = perf_event_set_output(event, output_event);
3295 if (output_event)
3296 fput_light(output_file, fput_needed);
3297
3298 return ret; 3295 return ret;
3299 } 3296 }
3300 3297
@@ -3677,7 +3674,7 @@ unlock:
3677 atomic_inc(&event->mmap_count); 3674 atomic_inc(&event->mmap_count);
3678 mutex_unlock(&event->mmap_mutex); 3675 mutex_unlock(&event->mmap_mutex);
3679 3676
3680 vma->vm_flags |= VM_RESERVED; 3677 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
3681 vma->vm_ops = &perf_mmap_vmops; 3678 vma->vm_ops = &perf_mmap_vmops;
3682 3679
3683 return ret; 3680 return ret;
@@ -3764,6 +3761,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3764} 3761}
3765EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 3762EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3766 3763
3764static void
3765perf_output_sample_regs(struct perf_output_handle *handle,
3766 struct pt_regs *regs, u64 mask)
3767{
3768 int bit;
3769
3770 for_each_set_bit(bit, (const unsigned long *) &mask,
3771 sizeof(mask) * BITS_PER_BYTE) {
3772 u64 val;
3773
3774 val = perf_reg_value(regs, bit);
3775 perf_output_put(handle, val);
3776 }
3777}
3778
3779static void perf_sample_regs_user(struct perf_regs_user *regs_user,
3780 struct pt_regs *regs)
3781{
3782 if (!user_mode(regs)) {
3783 if (current->mm)
3784 regs = task_pt_regs(current);
3785 else
3786 regs = NULL;
3787 }
3788
3789 if (regs) {
3790 regs_user->regs = regs;
3791 regs_user->abi = perf_reg_abi(current);
3792 }
3793}
3794
3795/*
3796 * Get remaining task size from user stack pointer.
3797 *
3798 * It'd be better to take stack vma map and limit this more
 3799 * precisely, but there's no way to get it safely under interrupt,
3800 * so using TASK_SIZE as limit.
3801 */
3802static u64 perf_ustack_task_size(struct pt_regs *regs)
3803{
3804 unsigned long addr = perf_user_stack_pointer(regs);
3805
3806 if (!addr || addr >= TASK_SIZE)
3807 return 0;
3808
3809 return TASK_SIZE - addr;
3810}
3811
3812static u16
3813perf_sample_ustack_size(u16 stack_size, u16 header_size,
3814 struct pt_regs *regs)
3815{
3816 u64 task_size;
3817
3818 /* No regs, no stack pointer, no dump. */
3819 if (!regs)
3820 return 0;
3821
3822 /*
 3823 * Check whether the requested stack size fits into:
 3824 * - TASK_SIZE
 3825 * If it doesn't, limit the size to TASK_SIZE.
 3826 *
 3827 * - remaining sample size
 3828 * If it doesn't, trim the stack size to fit the
 3829 * remaining sample size.
3830 */
3831
3832 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
3833 stack_size = min(stack_size, (u16) task_size);
3834
3835 /* Current header size plus static size and dynamic size. */
3836 header_size += 2 * sizeof(u64);
3837
3838 /* Do we fit in with the current stack dump size? */
3839 if ((u16) (header_size + stack_size) < header_size) {
3840 /*
3841 * If we overflow the maximum size for the sample,
3842 * we customize the stack dump size to fit in.
3843 */
3844 stack_size = USHRT_MAX - header_size - sizeof(u64);
3845 stack_size = round_up(stack_size, sizeof(u64));
3846 }
3847
3848 return stack_size;
3849}
3850
3851static void
3852perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
3853 struct pt_regs *regs)
3854{
3855 /* Case of a kernel thread, nothing to dump */
3856 if (!regs) {
3857 u64 size = 0;
3858 perf_output_put(handle, size);
3859 } else {
3860 unsigned long sp;
3861 unsigned int rem;
3862 u64 dyn_size;
3863
3864 /*
3865 * We dump:
3866 * static size
3867 * - the size requested by user or the best one we can fit
3868 * in to the sample max size
3869 * data
3870 * - user stack dump data
3871 * dynamic size
3872 * - the actual dumped size
3873 */
3874
3875 /* Static size. */
3876 perf_output_put(handle, dump_size);
3877
3878 /* Data. */
3879 sp = perf_user_stack_pointer(regs);
3880 rem = __output_copy_user(handle, (void *) sp, dump_size);
3881 dyn_size = dump_size - rem;
3882
3883 perf_output_skip(handle, rem);
3884
3885 /* Dynamic size. */
3886 perf_output_put(handle, dyn_size);
3887 }
3888}
3889
3767static void __perf_event_header__init_id(struct perf_event_header *header, 3890static void __perf_event_header__init_id(struct perf_event_header *header,
3768 struct perf_sample_data *data, 3891 struct perf_sample_data *data,
3769 struct perf_event *event) 3892 struct perf_event *event)
@@ -4024,6 +4147,28 @@ void perf_output_sample(struct perf_output_handle *handle,
4024 perf_output_put(handle, nr); 4147 perf_output_put(handle, nr);
4025 } 4148 }
4026 } 4149 }
4150
4151 if (sample_type & PERF_SAMPLE_REGS_USER) {
4152 u64 abi = data->regs_user.abi;
4153
4154 /*
4155 * If there are no regs to dump, notice it through
4156 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
4157 */
4158 perf_output_put(handle, abi);
4159
4160 if (abi) {
4161 u64 mask = event->attr.sample_regs_user;
4162 perf_output_sample_regs(handle,
4163 data->regs_user.regs,
4164 mask);
4165 }
4166 }
4167
4168 if (sample_type & PERF_SAMPLE_STACK_USER)
4169 perf_output_sample_ustack(handle,
4170 data->stack_user_size,
4171 data->regs_user.regs);
4027} 4172}
4028 4173
4029void perf_prepare_sample(struct perf_event_header *header, 4174void perf_prepare_sample(struct perf_event_header *header,
@@ -4075,6 +4220,49 @@ void perf_prepare_sample(struct perf_event_header *header,
4075 } 4220 }
4076 header->size += size; 4221 header->size += size;
4077 } 4222 }
4223
4224 if (sample_type & PERF_SAMPLE_REGS_USER) {
4225 /* regs dump ABI info */
4226 int size = sizeof(u64);
4227
4228 perf_sample_regs_user(&data->regs_user, regs);
4229
4230 if (data->regs_user.regs) {
4231 u64 mask = event->attr.sample_regs_user;
4232 size += hweight64(mask) * sizeof(u64);
4233 }
4234
4235 header->size += size;
4236 }
4237
4238 if (sample_type & PERF_SAMPLE_STACK_USER) {
4239 /*
 4240 * Either we need PERF_SAMPLE_STACK_USER bit to be always
4241 * processed as the last one or have additional check added
4242 * in case new sample type is added, because we could eat
4243 * up the rest of the sample size.
4244 */
4245 struct perf_regs_user *uregs = &data->regs_user;
4246 u16 stack_size = event->attr.sample_stack_user;
4247 u16 size = sizeof(u64);
4248
4249 if (!uregs->abi)
4250 perf_sample_regs_user(uregs, regs);
4251
4252 stack_size = perf_sample_ustack_size(stack_size, header->size,
4253 uregs->regs);
4254
4255 /*
4256 * If there is something to dump, add space for the dump
4257 * itself and for the field that tells the dynamic size,
4258 * which is how many have been actually dumped.
4259 */
4260 if (stack_size)
4261 size += sizeof(u64) + stack_size;
4262
4263 data->stack_user_size = stack_size;
4264 header->size += size;
4265 }
4078} 4266}
4079 4267
4080static void perf_event_output(struct perf_event *event, 4268static void perf_event_output(struct perf_event *event,
@@ -4227,7 +4415,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
4227 rcu_read_lock(); 4415 rcu_read_lock();
4228 list_for_each_entry_rcu(pmu, &pmus, entry) { 4416 list_for_each_entry_rcu(pmu, &pmus, entry) {
4229 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4417 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4230 if (cpuctx->active_pmu != pmu) 4418 if (cpuctx->unique_pmu != pmu)
4231 goto next; 4419 goto next;
4232 perf_event_task_ctx(&cpuctx->ctx, task_event); 4420 perf_event_task_ctx(&cpuctx->ctx, task_event);
4233 4421
@@ -4373,7 +4561,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
4373 rcu_read_lock(); 4561 rcu_read_lock();
4374 list_for_each_entry_rcu(pmu, &pmus, entry) { 4562 list_for_each_entry_rcu(pmu, &pmus, entry) {
4375 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4563 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4376 if (cpuctx->active_pmu != pmu) 4564 if (cpuctx->unique_pmu != pmu)
4377 goto next; 4565 goto next;
4378 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 4566 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
4379 4567
@@ -4569,7 +4757,7 @@ got_name:
4569 rcu_read_lock(); 4757 rcu_read_lock();
4570 list_for_each_entry_rcu(pmu, &pmus, entry) { 4758 list_for_each_entry_rcu(pmu, &pmus, entry) {
4571 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4759 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4572 if (cpuctx->active_pmu != pmu) 4760 if (cpuctx->unique_pmu != pmu)
4573 goto next; 4761 goto next;
4574 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, 4762 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4575 vma->vm_flags & VM_EXEC); 4763 vma->vm_flags & VM_EXEC);
@@ -5670,8 +5858,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5670 5858
5671 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5859 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5672 5860
5673 if (cpuctx->active_pmu == old_pmu) 5861 if (cpuctx->unique_pmu == old_pmu)
5674 cpuctx->active_pmu = pmu; 5862 cpuctx->unique_pmu = pmu;
5675 } 5863 }
5676} 5864}
5677 5865
@@ -5806,7 +5994,7 @@ skip_type:
5806 cpuctx->ctx.pmu = pmu; 5994 cpuctx->ctx.pmu = pmu;
5807 cpuctx->jiffies_interval = 1; 5995 cpuctx->jiffies_interval = 1;
5808 INIT_LIST_HEAD(&cpuctx->rotation_list); 5996 INIT_LIST_HEAD(&cpuctx->rotation_list);
5809 cpuctx->active_pmu = pmu; 5997 cpuctx->unique_pmu = pmu;
5810 } 5998 }
5811 5999
5812got_cpu_context: 6000got_cpu_context:
@@ -5967,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5967 6155
5968 event->parent = parent_event; 6156 event->parent = parent_event;
5969 6157
5970 event->ns = get_pid_ns(current->nsproxy->pid_ns); 6158 event->ns = get_pid_ns(task_active_pid_ns(current));
5971 event->id = atomic64_inc_return(&perf_event_id); 6159 event->id = atomic64_inc_return(&perf_event_id);
5972 6160
5973 event->state = PERF_EVENT_STATE_INACTIVE; 6161 event->state = PERF_EVENT_STATE_INACTIVE;
@@ -6151,6 +6339,28 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6151 attr->branch_sample_type = mask; 6339 attr->branch_sample_type = mask;
6152 } 6340 }
6153 } 6341 }
6342
6343 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
6344 ret = perf_reg_validate(attr->sample_regs_user);
6345 if (ret)
6346 return ret;
6347 }
6348
6349 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
6350 if (!arch_perf_have_user_stack_dump())
6351 return -ENOSYS;
6352
6353 /*
6354 * We have __u32 type for the size, but so far
6355 * we can only use __u16 as maximum due to the
6356 * __u16 sample size limit.
6357 */
6358 if (attr->sample_stack_user >= USHRT_MAX)
6359 ret = -EINVAL;
6360 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
6361 ret = -EINVAL;
6362 }
6363
6154out: 6364out:
6155 return ret; 6365 return ret;
6156 6366
@@ -6229,12 +6439,11 @@ SYSCALL_DEFINE5(perf_event_open,
6229 struct perf_event_attr attr; 6439 struct perf_event_attr attr;
6230 struct perf_event_context *ctx; 6440 struct perf_event_context *ctx;
6231 struct file *event_file = NULL; 6441 struct file *event_file = NULL;
6232 struct file *group_file = NULL; 6442 struct fd group = {NULL, 0};
6233 struct task_struct *task = NULL; 6443 struct task_struct *task = NULL;
6234 struct pmu *pmu; 6444 struct pmu *pmu;
6235 int event_fd; 6445 int event_fd;
6236 int move_group = 0; 6446 int move_group = 0;
6237 int fput_needed = 0;
6238 int err; 6447 int err;
6239 6448
6240 /* for future expandability... */ 6449 /* for future expandability... */
@@ -6264,17 +6473,15 @@ SYSCALL_DEFINE5(perf_event_open,
6264 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 6473 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6265 return -EINVAL; 6474 return -EINVAL;
6266 6475
6267 event_fd = get_unused_fd_flags(O_RDWR); 6476 event_fd = get_unused_fd();
6268 if (event_fd < 0) 6477 if (event_fd < 0)
6269 return event_fd; 6478 return event_fd;
6270 6479
6271 if (group_fd != -1) { 6480 if (group_fd != -1) {
6272 group_file = perf_fget_light(group_fd, &fput_needed); 6481 err = perf_fget_light(group_fd, &group);
6273 if (IS_ERR(group_file)) { 6482 if (err)
6274 err = PTR_ERR(group_file);
6275 goto err_fd; 6483 goto err_fd;
6276 } 6484 group_leader = group.file->private_data;
6277 group_leader = group_file->private_data;
6278 if (flags & PERF_FLAG_FD_OUTPUT) 6485 if (flags & PERF_FLAG_FD_OUTPUT)
6279 output_event = group_leader; 6486 output_event = group_leader;
6280 if (flags & PERF_FLAG_FD_NO_GROUP) 6487 if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6450,7 +6657,7 @@ SYSCALL_DEFINE5(perf_event_open,
6450 * of the group leader will find the pointer to itself in 6657 * of the group leader will find the pointer to itself in
6451 * perf_group_detach(). 6658 * perf_group_detach().
6452 */ 6659 */
6453 fput_light(group_file, fput_needed); 6660 fdput(group);
6454 fd_install(event_fd, event_file); 6661 fd_install(event_fd, event_file);
6455 return event_fd; 6662 return event_fd;
6456 6663
@@ -6464,7 +6671,7 @@ err_task:
6464 if (task) 6671 if (task)
6465 put_task_struct(task); 6672 put_task_struct(task);
6466err_group_fd: 6673err_group_fd:
6467 fput_light(group_file, fput_needed); 6674 fdput(group);
6468err_fd: 6675err_fd:
6469 put_unused_fd(event_fd); 6676 put_unused_fd(event_fd);
6470 return err; 6677 return err;
@@ -7227,7 +7434,7 @@ unlock:
7227device_initcall(perf_event_sysfs_init); 7434device_initcall(perf_event_sysfs_init);
7228 7435
7229#ifdef CONFIG_CGROUP_PERF 7436#ifdef CONFIG_CGROUP_PERF
7230static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) 7437static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7231{ 7438{
7232 struct perf_cgroup *jc; 7439 struct perf_cgroup *jc;
7233 7440
@@ -7244,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
7244 return &jc->css; 7451 return &jc->css;
7245} 7452}
7246 7453
7247static void perf_cgroup_destroy(struct cgroup *cont) 7454static void perf_cgroup_css_free(struct cgroup *cont)
7248{ 7455{
7249 struct perf_cgroup *jc; 7456 struct perf_cgroup *jc;
7250 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7285,9 +7492,16 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7285struct cgroup_subsys perf_subsys = { 7492struct cgroup_subsys perf_subsys = {
7286 .name = "perf_event", 7493 .name = "perf_event",
7287 .subsys_id = perf_subsys_id, 7494 .subsys_id = perf_subsys_id,
7288 .create = perf_cgroup_create, 7495 .css_alloc = perf_cgroup_css_alloc,
7289 .destroy = perf_cgroup_destroy, 7496 .css_free = perf_cgroup_css_free,
7290 .exit = perf_cgroup_exit, 7497 .exit = perf_cgroup_exit,
7291 .attach = perf_cgroup_attach, 7498 .attach = perf_cgroup_attach,
7499
7500 /*
7501 * perf_event cgroup doesn't handle nesting correctly.
7502 * ctx->nr_cgroups adjustments should be propagated through the
7503 * cgroup hierarchy. Fix it and remove the following.
7504 */
7505 .broken_hierarchy = true,
7292}; 7506};
7293#endif /* CONFIG_CGROUP_PERF */ 7507#endif /* CONFIG_CGROUP_PERF */
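
Aside: perf_sample_ustack_size() in the core.c hunk clamps the requested user-stack dump twice, first to the room left below TASK_SIZE and then to what still fits in the u16 sample-size field, keeping the result u64 aligned. The arithmetic is easy to get wrong, so a self-contained sketch follows; TOY_TASK_SIZE, the header size and the rounding macro are illustrative values, not the kernel's.

#include <stdint.h>
#include <stdio.h>

#define TOY_TASK_SIZE	0x800000000000ULL
#define ROUND_UP_8(x)	(((x) + 7u) & ~7u)

static uint16_t toy_ustack_size(uint16_t requested, uint16_t header_size,
				uint64_t user_sp)
{
	uint64_t task_room;
	uint16_t stack_size = requested;

	if (!user_sp || user_sp >= TOY_TASK_SIZE)
		return 0;			/* no dumpable user stack */

	/* first clamp: room left between the stack pointer and TASK_SIZE */
	task_room = TOY_TASK_SIZE - user_sp;
	if (task_room < stack_size)
		stack_size = (uint16_t)task_room;

	/* the static-size and dynamic-size u64 fields ride along */
	header_size += 2 * sizeof(uint64_t);

	/* second clamp: the u16 total sample size must not wrap around */
	if ((uint16_t)(header_size + stack_size) < header_size) {
		stack_size = (uint16_t)(UINT16_MAX - header_size
					- sizeof(uint64_t));
		stack_size = (uint16_t)ROUND_UP_8(stack_size);
	}
	return stack_size;
}

int main(void)
{
	/* ask for the maximum with the stack pointer 64 KiB below the limit */
	unsigned got = toy_ustack_size(UINT16_MAX, 512,
				       TOY_TASK_SIZE - 0x10000);

	printf("clamped dump size: %u bytes\n", got);
	return 0;
}
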
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9a7b487c6fe2..fe8a916507ed 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -111,14 +111,16 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
111 * Count the number of breakpoints of the same type and same task. 111 * Count the number of breakpoints of the same type and same task.
112 * The given event must be not on the list. 112 * The given event must be not on the list.
113 */ 113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) 114static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
115{ 115{
116 struct task_struct *tsk = bp->hw.bp_target; 116 struct task_struct *tsk = bp->hw.bp_target;
117 struct perf_event *iter; 117 struct perf_event *iter;
118 int count = 0; 118 int count = 0;
119 119
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) 121 if (iter->hw.bp_target == tsk &&
122 find_slot_idx(iter) == type &&
123 cpu == iter->cpu)
122 count += hw_breakpoint_weight(iter); 124 count += hw_breakpoint_weight(iter);
123 } 125 }
124 126
@@ -141,7 +143,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
141 if (!tsk) 143 if (!tsk)
142 slots->pinned += max_task_bp_pinned(cpu, type); 144 slots->pinned += max_task_bp_pinned(cpu, type);
143 else 145 else
144 slots->pinned += task_bp_pinned(bp, type); 146 slots->pinned += task_bp_pinned(cpu, bp, type);
145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu); 147 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
146 148
147 return; 149 return;
@@ -154,7 +156,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
154 if (!tsk) 156 if (!tsk)
155 nr += max_task_bp_pinned(cpu, type); 157 nr += max_task_bp_pinned(cpu, type);
156 else 158 else
157 nr += task_bp_pinned(bp, type); 159 nr += task_bp_pinned(cpu, bp, type);
158 160
159 if (nr > slots->pinned) 161 if (nr > slots->pinned)
160 slots->pinned = nr; 162 slots->pinned = nr;
@@ -188,7 +190,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
188 int old_idx = 0; 190 int old_idx = 0;
189 int idx = 0; 191 int idx = 0;
190 192
191 old_count = task_bp_pinned(bp, type); 193 old_count = task_bp_pinned(cpu, bp, type);
192 old_idx = old_count - 1; 194 old_idx = old_count - 1;
193 idx = old_idx + weight; 195 idx = old_idx + weight;
194 196
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index a096c19f2c2a..d56a64c99a8b 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -2,6 +2,7 @@
2#define _KERNEL_EVENTS_INTERNAL_H 2#define _KERNEL_EVENTS_INTERNAL_H
3 3
4#include <linux/hardirq.h> 4#include <linux/hardirq.h>
5#include <linux/uaccess.h>
5 6
6/* Buffer handling */ 7/* Buffer handling */
7 8
@@ -76,30 +77,53 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
76 return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); 77 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
77} 78}
78 79
79static inline void 80#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
80__output_copy(struct perf_output_handle *handle, 81static inline unsigned int \
81 const void *buf, unsigned int len) 82func_name(struct perf_output_handle *handle, \
83 const void *buf, unsigned int len) \
84{ \
85 unsigned long size, written; \
86 \
87 do { \
88 size = min_t(unsigned long, handle->size, len); \
89 \
90 written = memcpy_func(handle->addr, buf, size); \
91 \
92 len -= written; \
93 handle->addr += written; \
94 buf += written; \
95 handle->size -= written; \
96 if (!handle->size) { \
97 struct ring_buffer *rb = handle->rb; \
98 \
99 handle->page++; \
100 handle->page &= rb->nr_pages - 1; \
101 handle->addr = rb->data_pages[handle->page]; \
102 handle->size = PAGE_SIZE << page_order(rb); \
103 } \
104 } while (len && written == size); \
105 \
106 return len; \
107}
108
109static inline int memcpy_common(void *dst, const void *src, size_t n)
82{ 110{
83 do { 111 memcpy(dst, src, n);
84 unsigned long size = min_t(unsigned long, handle->size, len); 112 return n;
85
86 memcpy(handle->addr, buf, size);
87
88 len -= size;
89 handle->addr += size;
90 buf += size;
91 handle->size -= size;
92 if (!handle->size) {
93 struct ring_buffer *rb = handle->rb;
94
95 handle->page++;
96 handle->page &= rb->nr_pages - 1;
97 handle->addr = rb->data_pages[handle->page];
98 handle->size = PAGE_SIZE << page_order(rb);
99 }
100 } while (len);
101} 113}
102 114
115DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
116
117#define MEMCPY_SKIP(dst, src, n) (n)
118
119DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
120
121#ifndef arch_perf_out_copy_user
122#define arch_perf_out_copy_user __copy_from_user_inatomic
123#endif
124
125DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
126
103/* Callchain handling */ 127/* Callchain handling */
104extern struct perf_callchain_entry * 128extern struct perf_callchain_entry *
105perf_callchain(struct perf_event *event, struct pt_regs *regs); 129perf_callchain(struct perf_event *event, struct pt_regs *regs);
@@ -134,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx)
134 recursion[rctx]--; 158 recursion[rctx]--;
135} 159}
136 160
161#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
162static inline bool arch_perf_have_user_stack_dump(void)
163{
164 return true;
165}
166
167#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
168#else
169static inline bool arch_perf_have_user_stack_dump(void)
170{
171 return false;
172}
173
174#define perf_user_stack_pointer(regs) 0
175#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
176
137#endif /* _KERNEL_EVENTS_INTERNAL_H */ 177#endif /* _KERNEL_EVENTS_INTERNAL_H */
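
Aside: DEFINE_OUTPUT_COPY changes the copy helpers to return the number of bytes that could not be written, which is what lets perf_output_sample_ustack() pad a partially faulting user copy with perf_output_skip() and report the actually dumped (dynamic) size. The sketch below shows that contract with a flat buffer; the real handle's page chasing and the __copy_from_user_inatomic() semantics are simplified away.

#include <stdio.h>
#include <string.h>

struct toy_handle {
	unsigned char *addr;            /* current write position */
	unsigned long size;             /* room left in the buffer */
};

/* pretend the source faults after 'ok' readable bytes */
static unsigned long copy_may_fault(void *dst, const void *src,
				    unsigned long n, unsigned long ok)
{
	unsigned long written = n < ok ? n : ok;

	memcpy(dst, src, written);
	return written;
}

static unsigned long toy_output_copy(struct toy_handle *h, const void *buf,
				     unsigned long len, unsigned long ok)
{
	unsigned long size, written;

	do {
		size = h->size < len ? h->size : len;
		written = copy_may_fault(h->addr, buf, size, ok);

		len -= written;
		h->addr += written;
		buf = (const char *)buf + written;
		h->size -= written;
		ok -= written;
	} while (len && written == size);

	return len;                     /* bytes that could not be copied */
}

int main(void)
{
	unsigned char rb[64];
	struct toy_handle h = { rb, sizeof(rb) };
	const char payload[32] = "user stack bytes";

	/* only 20 of 32 bytes are readable: 12 remain and would be skipped */
	printf("not copied: %lu\n", toy_output_copy(&h, payload, 32, 20));
	return 0;
}
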
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 6ddaba43fb7a..23cb34ff3973 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -182,10 +182,16 @@ out:
182 return -ENOSPC; 182 return -ENOSPC;
183} 183}
184 184
185void perf_output_copy(struct perf_output_handle *handle, 185unsigned int perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len) 186 const void *buf, unsigned int len)
187{ 187{
188 __output_copy(handle, buf, len); 188 return __output_copy(handle, buf, len);
189}
190
191unsigned int perf_output_skip(struct perf_output_handle *handle,
192 unsigned int len)
193{
194 return __output_skip(handle, NULL, len);
189} 195}
190 196
191void perf_output_end(struct perf_output_handle *handle) 197void perf_output_end(struct perf_output_handle *handle)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c08a22d02f72..dea7acfbb071 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -33,6 +33,7 @@
33#include <linux/ptrace.h> /* user_enable_single_step */ 33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */ 34#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */ 35#include "../../mm/internal.h" /* munlock_vma_page */
36#include <linux/percpu-rwsem.h>
36 37
37#include <linux/uprobes.h> 38#include <linux/uprobes.h>
38 39
@@ -71,6 +72,8 @@ static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
71static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 72static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
72#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 73#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
73 74
75static struct percpu_rw_semaphore dup_mmap_sem;
76
74/* 77/*
75 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe 78 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
76 * events active at this time. Probably a fine grained per inode count is 79 * events active at this time. Probably a fine grained per inode count is
@@ -78,15 +81,23 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
78 */ 81 */
79static atomic_t uprobe_events = ATOMIC_INIT(0); 82static atomic_t uprobe_events = ATOMIC_INIT(0);
80 83
84/* Have a copy of original instruction */
85#define UPROBE_COPY_INSN 0
86/* Don't run handlers when first register/last unregister is in progress */
87#define UPROBE_RUN_HANDLER 1
88/* Can skip singlestep */
89#define UPROBE_SKIP_SSTEP 2
90
81struct uprobe { 91struct uprobe {
82 struct rb_node rb_node; /* node in the rb tree */ 92 struct rb_node rb_node; /* node in the rb tree */
83 atomic_t ref; 93 atomic_t ref;
84 struct rw_semaphore consumer_rwsem; 94 struct rw_semaphore consumer_rwsem;
95 struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
85 struct list_head pending_list; 96 struct list_head pending_list;
86 struct uprobe_consumer *consumers; 97 struct uprobe_consumer *consumers;
87 struct inode *inode; /* Also hold a ref to inode */ 98 struct inode *inode; /* Also hold a ref to inode */
88 loff_t offset; 99 loff_t offset;
89 int flags; 100 unsigned long flags;
90 struct arch_uprobe arch; 101 struct arch_uprobe arch;
91}; 102};
92 103
@@ -100,17 +111,12 @@ struct uprobe {
100 */ 111 */
101static bool valid_vma(struct vm_area_struct *vma, bool is_register) 112static bool valid_vma(struct vm_area_struct *vma, bool is_register)
102{ 113{
103 if (!vma->vm_file) 114 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
104 return false;
105
106 if (!is_register)
107 return true;
108 115
109 if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) 116 if (is_register)
110 == (VM_READ|VM_EXEC)) 117 flags |= VM_WRITE;
111 return true;
112 118
113 return false; 119 return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
114} 120}
115 121
116static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) 122static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
@@ -141,10 +147,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
141 spinlock_t *ptl; 147 spinlock_t *ptl;
142 pte_t *ptep; 148 pte_t *ptep;
143 int err; 149 int err;
150 /* For mmu_notifiers */
151 const unsigned long mmun_start = addr;
152 const unsigned long mmun_end = addr + PAGE_SIZE;
144 153
145 /* For try_to_free_swap() and munlock_vma_page() below */ 154 /* For try_to_free_swap() and munlock_vma_page() below */
146 lock_page(page); 155 lock_page(page);
147 156
157 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
148 err = -EAGAIN; 158 err = -EAGAIN;
149 ptep = page_check_address(page, mm, addr, &ptl, 0); 159 ptep = page_check_address(page, mm, addr, &ptl, 0);
150 if (!ptep) 160 if (!ptep)
@@ -173,6 +183,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
173 183
174 err = 0; 184 err = 0;
175 unlock: 185 unlock:
186 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
176 unlock_page(page); 187 unlock_page(page);
177 return err; 188 return err;
178} 189}
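The hunk above brackets the PTE swap in __replace_page() with mmu_notifier_invalidate_range_start()/end(), so secondary MMUs (KVM and friends) are told the old mapping is going away before the new page is wired in. Reduced to its essentials, the bracket looks like the sketch below; the function is illustrative, the page-table work in the middle is elided, and the call signatures match the ones used in the hunk.

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

/* Sketch: any path that rewrites a PTE for @addr follows this pattern. */
static int update_one_pte(struct mm_struct *mm, unsigned long addr)
{
        const unsigned long start = addr & PAGE_MASK;
        const unsigned long end   = start + PAGE_SIZE;
        int err = -EAGAIN;

        mmu_notifier_invalidate_range_start(mm, start, end);

        /*
         * ... look up the PTE under its lock, replace it, and set
         * err = 0 on success ...
         */

        mmu_notifier_invalidate_range_end(mm, start, end);
        return err;
}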
@@ -188,19 +199,44 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
188 return *insn == UPROBE_SWBP_INSN; 199 return *insn == UPROBE_SWBP_INSN;
189} 200}
190 201
202static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
203{
204 void *kaddr = kmap_atomic(page);
205 memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
206 kunmap_atomic(kaddr);
207}
208
209static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
210{
211 uprobe_opcode_t old_opcode;
212 bool is_swbp;
213
214 copy_opcode(page, vaddr, &old_opcode);
215 is_swbp = is_swbp_insn(&old_opcode);
216
217 if (is_swbp_insn(new_opcode)) {
218 if (is_swbp) /* register: already installed? */
219 return 0;
220 } else {
221 if (!is_swbp) /* unregister: was it changed by us? */
222 return 0;
223 }
224
225 return 1;
226}
227
191/* 228/*
192 * NOTE: 229 * NOTE:
193 * Expect the breakpoint instruction to be the smallest size instruction for 230 * Expect the breakpoint instruction to be the smallest size instruction for
194 * the architecture. If an arch has variable length instruction and the 231 * the architecture. If an arch has variable length instruction and the
195 * breakpoint instruction is not of the smallest length instruction 232 * breakpoint instruction is not of the smallest length instruction
196 * supported by that architecture then we need to modify read_opcode / 233 * supported by that architecture then we need to modify is_swbp_at_addr and
197 * write_opcode accordingly. This would never be a problem for archs that 234 * write_opcode accordingly. This would never be a problem for archs that
198 * have fixed length instructions. 235 * have fixed length instructions.
199 */ 236 */
200 237
201/* 238/*
202 * write_opcode - write the opcode at a given virtual address. 239 * write_opcode - write the opcode at a given virtual address.
203 * @auprobe: arch breakpointing information.
204 * @mm: the probed process address space. 240 * @mm: the probed process address space.
205 * @vaddr: the virtual address to store the opcode. 241 * @vaddr: the virtual address to store the opcode.
206 * @opcode: opcode to be written at @vaddr. 242 * @opcode: opcode to be written at @vaddr.
@@ -211,8 +247,8 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
211 * For mm @mm, write the opcode at @vaddr. 247 * For mm @mm, write the opcode at @vaddr.
212 * Return 0 (success) or a negative errno. 248 * Return 0 (success) or a negative errno.
213 */ 249 */
214static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, 250static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
215 unsigned long vaddr, uprobe_opcode_t opcode) 251 uprobe_opcode_t opcode)
216{ 252{
217 struct page *old_page, *new_page; 253 struct page *old_page, *new_page;
218 void *vaddr_old, *vaddr_new; 254 void *vaddr_old, *vaddr_new;
@@ -221,10 +257,14 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
221 257
222retry: 258retry:
223 /* Read the page with vaddr into memory */ 259 /* Read the page with vaddr into memory */
224 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); 260 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
225 if (ret <= 0) 261 if (ret <= 0)
226 return ret; 262 return ret;
227 263
264 ret = verify_opcode(old_page, vaddr, &opcode);
265 if (ret <= 0)
266 goto put_old;
267
228 ret = -ENOMEM; 268 ret = -ENOMEM;
229 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 269 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
230 if (!new_page) 270 if (!new_page)
@@ -259,65 +299,6 @@ put_old:
259} 299}
260 300
261/** 301/**
262 * read_opcode - read the opcode at a given virtual address.
263 * @mm: the probed process address space.
264 * @vaddr: the virtual address to read the opcode.
265 * @opcode: location to store the read opcode.
266 *
267 * Called with mm->mmap_sem held (for read and with a reference to
268 * mm.
269 *
270 * For mm @mm, read the opcode at @vaddr and store it in @opcode.
271 * Return 0 (success) or a negative errno.
272 */
273static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode)
274{
275 struct page *page;
276 void *vaddr_new;
277 int ret;
278
279 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
280 if (ret <= 0)
281 return ret;
282
283 lock_page(page);
284 vaddr_new = kmap_atomic(page);
285 vaddr &= ~PAGE_MASK;
286 memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
287 kunmap_atomic(vaddr_new);
288 unlock_page(page);
289
290 put_page(page);
291
292 return 0;
293}
294
295static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
296{
297 uprobe_opcode_t opcode;
298 int result;
299
300 if (current->mm == mm) {
301 pagefault_disable();
302 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
303 sizeof(opcode));
304 pagefault_enable();
305
306 if (likely(result == 0))
307 goto out;
308 }
309
310 result = read_opcode(mm, vaddr, &opcode);
311 if (result)
312 return result;
313out:
314 if (is_swbp_insn(&opcode))
315 return 1;
316
317 return 0;
318}
319
320/**
321 * set_swbp - store breakpoint at a given address. 302 * set_swbp - store breakpoint at a given address.
322 * @auprobe: arch specific probepoint information. 303 * @auprobe: arch specific probepoint information.
323 * @mm: the probed process address space. 304 * @mm: the probed process address space.
@@ -328,18 +309,7 @@ out:
328 */ 309 */
329int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 310int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
330{ 311{
331 int result; 312 return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
332 /*
333 * See the comment near uprobes_hash().
334 */
335 result = is_swbp_at_addr(mm, vaddr);
336 if (result == 1)
337 return -EEXIST;
338
339 if (result)
340 return result;
341
342 return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
343} 313}
344 314
345/** 315/**
@@ -347,25 +317,14 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
347 * @mm: the probed process address space. 317 * @mm: the probed process address space.
348 * @auprobe: arch specific probepoint information. 318 * @auprobe: arch specific probepoint information.
349 * @vaddr: the virtual address to insert the opcode. 319 * @vaddr: the virtual address to insert the opcode.
350 * @verify: if true, verify existance of breakpoint instruction.
351 * 320 *
352 * For mm @mm, restore the original opcode (opcode) at @vaddr. 321 * For mm @mm, restore the original opcode (opcode) at @vaddr.
353 * Return 0 (success) or a negative errno. 322 * Return 0 (success) or a negative errno.
354 */ 323 */
355int __weak 324int __weak
356set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) 325set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
357{ 326{
358 if (verify) { 327 return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
359 int result;
360
361 result = is_swbp_at_addr(mm, vaddr);
362 if (!result)
363 return -EINVAL;
364
365 if (result != 1)
366 return result;
367 }
368 return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
369} 328}
370 329
371static int match_uprobe(struct uprobe *l, struct uprobe *r) 330static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -415,11 +374,10 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
415static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) 374static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
416{ 375{
417 struct uprobe *uprobe; 376 struct uprobe *uprobe;
418 unsigned long flags;
419 377
420 spin_lock_irqsave(&uprobes_treelock, flags); 378 spin_lock(&uprobes_treelock);
421 uprobe = __find_uprobe(inode, offset); 379 uprobe = __find_uprobe(inode, offset);
422 spin_unlock_irqrestore(&uprobes_treelock, flags); 380 spin_unlock(&uprobes_treelock);
423 381
424 return uprobe; 382 return uprobe;
425} 383}
@@ -466,15 +424,14 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
466 */ 424 */
467static struct uprobe *insert_uprobe(struct uprobe *uprobe) 425static struct uprobe *insert_uprobe(struct uprobe *uprobe)
468{ 426{
469 unsigned long flags;
470 struct uprobe *u; 427 struct uprobe *u;
471 428
472 spin_lock_irqsave(&uprobes_treelock, flags); 429 spin_lock(&uprobes_treelock);
473 u = __insert_uprobe(uprobe); 430 u = __insert_uprobe(uprobe);
474 spin_unlock_irqrestore(&uprobes_treelock, flags); 431 spin_unlock(&uprobes_treelock);
475 432
476 /* For now assume that the instruction need not be single-stepped */ 433 /* For now assume that the instruction need not be single-stepped */
477 uprobe->flags |= UPROBE_SKIP_SSTEP; 434 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
478 435
479 return u; 436 return u;
480} 437}
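The flag handling above (and the UPROBE_* definitions earlier in the file) switch uprobe->flags from open-coded |=/&= on an int to bit numbers on an unsigned long manipulated through the bitops API: set_bit()/clear_bit() are atomic for flags touched after the object is shared, while __set_bit() is the cheaper non-atomic form used while the uprobe is not yet visible to others. A small sketch of the idiom, with made-up flag and structure names:

#include <linux/bitops.h>
#include <linux/types.h>

/* Illustrative flag bits: these are bit numbers, not masks. */
#define OBJ_READY       0
#define OBJ_ACTIVE      1

struct obj {
        unsigned long flags;
};

static void obj_init(struct obj *o)
{
        /* Object not yet visible to others: non-atomic helper is enough. */
        __set_bit(OBJ_READY, &o->flags);
}

static void obj_enable(struct obj *o)
{
        set_bit(OBJ_ACTIVE, &o->flags);         /* atomic once shared */
}

static bool obj_can_run(struct obj *o)
{
        return test_bit(OBJ_READY, &o->flags) &&
               test_bit(OBJ_ACTIVE, &o->flags);
}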
@@ -496,6 +453,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
496 uprobe->inode = igrab(inode); 453 uprobe->inode = igrab(inode);
497 uprobe->offset = offset; 454 uprobe->offset = offset;
498 init_rwsem(&uprobe->consumer_rwsem); 455 init_rwsem(&uprobe->consumer_rwsem);
456 mutex_init(&uprobe->copy_mutex);
499 457
500 /* add to uprobes_tree, sorted on inode:offset */ 458 /* add to uprobes_tree, sorted on inode:offset */
501 cur_uprobe = insert_uprobe(uprobe); 459 cur_uprobe = insert_uprobe(uprobe);
@@ -516,7 +474,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
516{ 474{
517 struct uprobe_consumer *uc; 475 struct uprobe_consumer *uc;
518 476
519 if (!(uprobe->flags & UPROBE_RUN_HANDLER)) 477 if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
520 return; 478 return;
521 479
522 down_read(&uprobe->consumer_rwsem); 480 down_read(&uprobe->consumer_rwsem);
@@ -622,33 +580,48 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp)
622 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); 580 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
623} 581}
624 582
625/* 583static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
626 * How mm->uprobes_state.count gets updated 584 struct mm_struct *mm, unsigned long vaddr)
627 * uprobe_mmap() increments the count if 585{
628 * - it successfully adds a breakpoint. 586 int ret = 0;
629 * - it cannot add a breakpoint, but sees that there is a underlying 587
630 * breakpoint (via a is_swbp_at_addr()). 588 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
631 * 589 return ret;
632 * uprobe_munmap() decrements the count if 590
633 * - it sees a underlying breakpoint, (via is_swbp_at_addr) 591 mutex_lock(&uprobe->copy_mutex);
634 * (Subsequent uprobe_unregister wouldnt find the breakpoint 592 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
635 * unless a uprobe_mmap kicks in, since the old vma would be 593 goto out;
636 * dropped just after uprobe_munmap.) 594
637 * 595 ret = copy_insn(uprobe, file);
638 * uprobe_register increments the count if: 596 if (ret)
639 * - it successfully adds a breakpoint. 597 goto out;
640 * 598
641 * uprobe_unregister decrements the count if: 599 ret = -ENOTSUPP;
642 * - it sees a underlying breakpoint and removes successfully. 600 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
643 * (via is_swbp_at_addr) 601 goto out;
644 * (Subsequent uprobe_munmap wouldnt find the breakpoint 602
645 * since there is no underlying breakpoint after the 603 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
646 * breakpoint removal.) 604 if (ret)
647 */ 605 goto out;
606
607 /* write_opcode() assumes we don't cross page boundary */
608 BUG_ON((uprobe->offset & ~PAGE_MASK) +
609 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
610
611 smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
612 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
613
614 out:
615 mutex_unlock(&uprobe->copy_mutex);
616
617 return ret;
618}
619
648static int 620static int
649install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, 621install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
650 struct vm_area_struct *vma, unsigned long vaddr) 622 struct vm_area_struct *vma, unsigned long vaddr)
651{ 623{
624 bool first_uprobe;
652 int ret; 625 int ret;
653 626
654 /* 627 /*
@@ -659,48 +632,38 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
659 * Hence behave as if probe already existed. 632 * Hence behave as if probe already existed.
660 */ 633 */
661 if (!uprobe->consumers) 634 if (!uprobe->consumers)
662 return -EEXIST; 635 return 0;
663
664 if (!(uprobe->flags & UPROBE_COPY_INSN)) {
665 ret = copy_insn(uprobe, vma->vm_file);
666 if (ret)
667 return ret;
668
669 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
670 return -ENOTSUPP;
671
672 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
673 if (ret)
674 return ret;
675
676 /* write_opcode() assumes we don't cross page boundary */
677 BUG_ON((uprobe->offset & ~PAGE_MASK) +
678 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
679 636
680 uprobe->flags |= UPROBE_COPY_INSN; 637 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
681 } 638 if (ret)
639 return ret;
682 640
683 /* 641 /*
684 * Ideally, should be updating the probe count after the breakpoint 642 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
685 * has been successfully inserted. However a thread could hit the 643 * the task can hit this breakpoint right after __replace_page().
686 * breakpoint we just inserted even before the probe count is
687 * incremented. If this is the first breakpoint placed, breakpoint
688 * notifier might ignore uprobes and pass the trap to the thread.
689 * Hence increment before and decrement on failure.
690 */ 644 */
691 atomic_inc(&mm->uprobes_state.count); 645 first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
646 if (first_uprobe)
647 set_bit(MMF_HAS_UPROBES, &mm->flags);
648
692 ret = set_swbp(&uprobe->arch, mm, vaddr); 649 ret = set_swbp(&uprobe->arch, mm, vaddr);
693 if (ret) 650 if (!ret)
694 atomic_dec(&mm->uprobes_state.count); 651 clear_bit(MMF_RECALC_UPROBES, &mm->flags);
652 else if (first_uprobe)
653 clear_bit(MMF_HAS_UPROBES, &mm->flags);
695 654
696 return ret; 655 return ret;
697} 656}
698 657
699static void 658static int
700remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) 659remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
701{ 660{
702 if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) 661 /* can happen if uprobe_register() fails */
703 atomic_dec(&mm->uprobes_state.count); 662 if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
663 return 0;
664
665 set_bit(MMF_RECALC_UPROBES, &mm->flags);
666 return set_orig_insn(&uprobe->arch, mm, vaddr);
704} 667}
705 668
706/* 669/*
@@ -710,11 +673,9 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
710 */ 673 */
711static void delete_uprobe(struct uprobe *uprobe) 674static void delete_uprobe(struct uprobe *uprobe)
712{ 675{
713 unsigned long flags; 676 spin_lock(&uprobes_treelock);
714
715 spin_lock_irqsave(&uprobes_treelock, flags);
716 rb_erase(&uprobe->rb_node, &uprobes_tree); 677 rb_erase(&uprobe->rb_node, &uprobes_tree);
717 spin_unlock_irqrestore(&uprobes_treelock, flags); 678 spin_unlock(&uprobes_treelock);
718 iput(uprobe->inode); 679 iput(uprobe->inode);
719 put_uprobe(uprobe); 680 put_uprobe(uprobe);
720 atomic_dec(&uprobe_events); 681 atomic_dec(&uprobe_events);
@@ -737,7 +698,6 @@ static struct map_info *
737build_map_info(struct address_space *mapping, loff_t offset, bool is_register) 698build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
738{ 699{
739 unsigned long pgoff = offset >> PAGE_SHIFT; 700 unsigned long pgoff = offset >> PAGE_SHIFT;
740 struct prio_tree_iter iter;
741 struct vm_area_struct *vma; 701 struct vm_area_struct *vma;
742 struct map_info *curr = NULL; 702 struct map_info *curr = NULL;
743 struct map_info *prev = NULL; 703 struct map_info *prev = NULL;
@@ -746,7 +706,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
746 706
747 again: 707 again:
748 mutex_lock(&mapping->i_mmap_mutex); 708 mutex_lock(&mapping->i_mmap_mutex);
749 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 709 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
750 if (!valid_vma(vma, is_register)) 710 if (!valid_vma(vma, is_register))
751 continue; 711 continue;
752 712
@@ -809,16 +769,19 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
809 struct map_info *info; 769 struct map_info *info;
810 int err = 0; 770 int err = 0;
811 771
772 percpu_down_write(&dup_mmap_sem);
812 info = build_map_info(uprobe->inode->i_mapping, 773 info = build_map_info(uprobe->inode->i_mapping,
813 uprobe->offset, is_register); 774 uprobe->offset, is_register);
814 if (IS_ERR(info)) 775 if (IS_ERR(info)) {
815 return PTR_ERR(info); 776 err = PTR_ERR(info);
777 goto out;
778 }
816 779
817 while (info) { 780 while (info) {
818 struct mm_struct *mm = info->mm; 781 struct mm_struct *mm = info->mm;
819 struct vm_area_struct *vma; 782 struct vm_area_struct *vma;
820 783
821 if (err) 784 if (err && is_register)
822 goto free; 785 goto free;
823 786
824 down_write(&mm->mmap_sem); 787 down_write(&mm->mmap_sem);
@@ -831,24 +794,19 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
831 vaddr_to_offset(vma, info->vaddr) != uprobe->offset) 794 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
832 goto unlock; 795 goto unlock;
833 796
834 if (is_register) { 797 if (is_register)
835 err = install_breakpoint(uprobe, mm, vma, info->vaddr); 798 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
836 /* 799 else
837 * We can race against uprobe_mmap(), see the 800 err |= remove_breakpoint(uprobe, mm, info->vaddr);
838 * comment near uprobe_hash(). 801
839 */
840 if (err == -EEXIST)
841 err = 0;
842 } else {
843 remove_breakpoint(uprobe, mm, info->vaddr);
844 }
845 unlock: 802 unlock:
846 up_write(&mm->mmap_sem); 803 up_write(&mm->mmap_sem);
847 free: 804 free:
848 mmput(mm); 805 mmput(mm);
849 info = free_map_info(info); 806 info = free_map_info(info);
850 } 807 }
851 808 out:
809 percpu_up_write(&dup_mmap_sem);
852 return err; 810 return err;
853} 811}
854 812
@@ -897,18 +855,21 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
897 mutex_lock(uprobes_hash(inode)); 855 mutex_lock(uprobes_hash(inode));
898 uprobe = alloc_uprobe(inode, offset); 856 uprobe = alloc_uprobe(inode, offset);
899 857
900 if (uprobe && !consumer_add(uprobe, uc)) { 858 if (!uprobe) {
859 ret = -ENOMEM;
860 } else if (!consumer_add(uprobe, uc)) {
901 ret = __uprobe_register(uprobe); 861 ret = __uprobe_register(uprobe);
902 if (ret) { 862 if (ret) {
903 uprobe->consumers = NULL; 863 uprobe->consumers = NULL;
904 __uprobe_unregister(uprobe); 864 __uprobe_unregister(uprobe);
905 } else { 865 } else {
906 uprobe->flags |= UPROBE_RUN_HANDLER; 866 set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
907 } 867 }
908 } 868 }
909 869
910 mutex_unlock(uprobes_hash(inode)); 870 mutex_unlock(uprobes_hash(inode));
911 put_uprobe(uprobe); 871 if (uprobe)
872 put_uprobe(uprobe);
912 873
913 return ret; 874 return ret;
914} 875}
@@ -935,7 +896,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
935 if (consumer_del(uprobe, uc)) { 896 if (consumer_del(uprobe, uc)) {
936 if (!uprobe->consumers) { 897 if (!uprobe->consumers) {
937 __uprobe_unregister(uprobe); 898 __uprobe_unregister(uprobe);
938 uprobe->flags &= ~UPROBE_RUN_HANDLER; 899 clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
939 } 900 }
940 } 901 }
941 902
@@ -978,7 +939,6 @@ static void build_probe_list(struct inode *inode,
978 struct list_head *head) 939 struct list_head *head)
979{ 940{
980 loff_t min, max; 941 loff_t min, max;
981 unsigned long flags;
982 struct rb_node *n, *t; 942 struct rb_node *n, *t;
983 struct uprobe *u; 943 struct uprobe *u;
984 944
@@ -986,7 +946,7 @@ static void build_probe_list(struct inode *inode,
986 min = vaddr_to_offset(vma, start); 946 min = vaddr_to_offset(vma, start);
987 max = min + (end - start) - 1; 947 max = min + (end - start) - 1;
988 948
989 spin_lock_irqsave(&uprobes_treelock, flags); 949 spin_lock(&uprobes_treelock);
990 n = find_node_in_range(inode, min, max); 950 n = find_node_in_range(inode, min, max);
991 if (n) { 951 if (n) {
992 for (t = n; t; t = rb_prev(t)) { 952 for (t = n; t; t = rb_prev(t)) {
@@ -1004,27 +964,20 @@ static void build_probe_list(struct inode *inode,
1004 atomic_inc(&u->ref); 964 atomic_inc(&u->ref);
1005 } 965 }
1006 } 966 }
1007 spin_unlock_irqrestore(&uprobes_treelock, flags); 967 spin_unlock(&uprobes_treelock);
1008} 968}
1009 969
1010/* 970/*
1011 * Called from mmap_region. 971 * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
1012 * called with mm->mmap_sem acquired.
1013 * 972 *
1014 * Return -ve no if we fail to insert probes and we cannot 973 * Currently we ignore all errors and always return 0, the callers
1015 * bail-out. 974 * can't handle the failure anyway.
1016 * Return 0 otherwise. i.e:
1017 *
1018 * - successful insertion of probes
1019 * - (or) no possible probes to be inserted.
1020 * - (or) insertion of probes failed but we can bail-out.
1021 */ 975 */
1022int uprobe_mmap(struct vm_area_struct *vma) 976int uprobe_mmap(struct vm_area_struct *vma)
1023{ 977{
1024 struct list_head tmp_list; 978 struct list_head tmp_list;
1025 struct uprobe *uprobe, *u; 979 struct uprobe *uprobe, *u;
1026 struct inode *inode; 980 struct inode *inode;
1027 int ret, count;
1028 981
1029 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) 982 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
1030 return 0; 983 return 0;
@@ -1036,44 +989,35 @@ int uprobe_mmap(struct vm_area_struct *vma)
1036 mutex_lock(uprobes_mmap_hash(inode)); 989 mutex_lock(uprobes_mmap_hash(inode));
1037 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); 990 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1038 991
1039 ret = 0;
1040 count = 0;
1041
1042 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 992 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1043 if (!ret) { 993 if (!fatal_signal_pending(current)) {
1044 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); 994 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1045 995 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1046 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1047 /*
1048 * We can race against uprobe_register(), see the
1049 * comment near uprobe_hash().
1050 */
1051 if (ret == -EEXIST) {
1052 ret = 0;
1053
1054 if (!is_swbp_at_addr(vma->vm_mm, vaddr))
1055 continue;
1056
1057 /*
1058 * Unable to insert a breakpoint, but
1059 * breakpoint lies underneath. Increment the
1060 * probe count.
1061 */
1062 atomic_inc(&vma->vm_mm->uprobes_state.count);
1063 }
1064
1065 if (!ret)
1066 count++;
1067 } 996 }
1068 put_uprobe(uprobe); 997 put_uprobe(uprobe);
1069 } 998 }
1070
1071 mutex_unlock(uprobes_mmap_hash(inode)); 999 mutex_unlock(uprobes_mmap_hash(inode));
1072 1000
1073 if (ret) 1001 return 0;
1074 atomic_sub(count, &vma->vm_mm->uprobes_state.count); 1002}
1075 1003
1076 return ret; 1004static bool
1005vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1006{
1007 loff_t min, max;
1008 struct inode *inode;
1009 struct rb_node *n;
1010
1011 inode = vma->vm_file->f_mapping->host;
1012
1013 min = vaddr_to_offset(vma, start);
1014 max = min + (end - start) - 1;
1015
1016 spin_lock(&uprobes_treelock);
1017 n = find_node_in_range(inode, min, max);
1018 spin_unlock(&uprobes_treelock);
1019
1020 return !!n;
1077} 1021}
1078 1022
1079/* 1023/*
@@ -1081,37 +1025,18 @@ int uprobe_mmap(struct vm_area_struct *vma)
1081 */ 1025 */
1082void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1026void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1083{ 1027{
1084 struct list_head tmp_list;
1085 struct uprobe *uprobe, *u;
1086 struct inode *inode;
1087
1088 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1028 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
1089 return; 1029 return;
1090 1030
1091 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ 1031 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1092 return; 1032 return;
1093 1033
1094 if (!atomic_read(&vma->vm_mm->uprobes_state.count)) 1034 if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
1035 test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
1095 return; 1036 return;
1096 1037
1097 inode = vma->vm_file->f_mapping->host; 1038 if (vma_has_uprobes(vma, start, end))
1098 if (!inode) 1039 set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
1099 return;
1100
1101 mutex_lock(uprobes_mmap_hash(inode));
1102 build_probe_list(inode, vma, start, end, &tmp_list);
1103
1104 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1105 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1106 /*
1107 * An unregister could have removed the probe before
1108 * unmap. So check before we decrement the count.
1109 */
1110 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1111 atomic_dec(&vma->vm_mm->uprobes_state.count);
1112 put_uprobe(uprobe);
1113 }
1114 mutex_unlock(uprobes_mmap_hash(inode));
1115} 1040}
1116 1041
1117/* Slot allocation for XOL */ 1042/* Slot allocation for XOL */
@@ -1213,13 +1138,25 @@ void uprobe_clear_state(struct mm_struct *mm)
1213 kfree(area); 1138 kfree(area);
1214} 1139}
1215 1140
1216/* 1141void uprobe_start_dup_mmap(void)
1217 * uprobe_reset_state - Free the area allocated for slots. 1142{
1218 */ 1143 percpu_down_read(&dup_mmap_sem);
1219void uprobe_reset_state(struct mm_struct *mm) 1144}
1145
1146void uprobe_end_dup_mmap(void)
1220{ 1147{
1221 mm->uprobes_state.xol_area = NULL; 1148 percpu_up_read(&dup_mmap_sem);
1222 atomic_set(&mm->uprobes_state.count, 0); 1149}
1150
1151void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1152{
1153 newmm->uprobes_state.xol_area = NULL;
1154
1155 if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
1156 set_bit(MMF_HAS_UPROBES, &newmm->flags);
1157 /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
1158 set_bit(MMF_RECALC_UPROBES, &newmm->flags);
1159 }
1223} 1160}
1224 1161
1225/* 1162/*
@@ -1279,6 +1216,11 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot
1279 vaddr = kmap_atomic(area->page); 1216 vaddr = kmap_atomic(area->page);
1280 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); 1217 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1281 kunmap_atomic(vaddr); 1218 kunmap_atomic(vaddr);
1219 /*
1220 * We probably need flush_icache_user_range() but it needs vma.
1221 * This should work on supported architectures too.
1222 */
1223 flush_dcache_page(area->page);
1282 1224
1283 return current->utask->xol_vaddr; 1225 return current->utask->xol_vaddr;
1284} 1226}
@@ -1430,13 +1372,57 @@ bool uprobe_deny_signal(void)
1430 */ 1372 */
1431static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) 1373static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1432{ 1374{
1433 if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) 1375 if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
1434 return true; 1376 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1435 1377 return true;
1436 uprobe->flags &= ~UPROBE_SKIP_SSTEP; 1378 clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
1379 }
1437 return false; 1380 return false;
1438} 1381}
1439 1382
1383static void mmf_recalc_uprobes(struct mm_struct *mm)
1384{
1385 struct vm_area_struct *vma;
1386
1387 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1388 if (!valid_vma(vma, false))
1389 continue;
1390 /*
1391 * This is not strictly accurate, we can race with
1392 * uprobe_unregister() and see the already removed
1393 * uprobe if delete_uprobe() was not yet called.
1394 */
1395 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1396 return;
1397 }
1398
1399 clear_bit(MMF_HAS_UPROBES, &mm->flags);
1400}
1401
1402static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
1403{
1404 struct page *page;
1405 uprobe_opcode_t opcode;
1406 int result;
1407
1408 pagefault_disable();
1409 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
1410 sizeof(opcode));
1411 pagefault_enable();
1412
1413 if (likely(result == 0))
1414 goto out;
1415
1416 result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
1417 if (result < 0)
1418 return result;
1419
1420 copy_opcode(page, vaddr, &opcode);
1421 put_page(page);
1422 out:
1423 return is_swbp_insn(&opcode);
1424}
1425
1440static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) 1426static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1441{ 1427{
1442 struct mm_struct *mm = current->mm; 1428 struct mm_struct *mm = current->mm;
@@ -1458,6 +1444,9 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1458 } else { 1444 } else {
1459 *is_swbp = -EFAULT; 1445 *is_swbp = -EFAULT;
1460 } 1446 }
1447
1448 if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
1449 mmf_recalc_uprobes(mm);
1461 up_read(&mm->mmap_sem); 1450 up_read(&mm->mmap_sem);
1462 1451
1463 return uprobe; 1452 return uprobe;
@@ -1494,41 +1483,41 @@ static void handle_swbp(struct pt_regs *regs)
1494 } 1483 }
1495 return; 1484 return;
1496 } 1485 }
1486 /*
1487 * TODO: move copy_insn/etc into _register and remove this hack.
1488 * After we hit the bp, _unregister + _register can install the
1489 * new and not-yet-analyzed uprobe at the same address, restart.
1490 */
1491 smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1492 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1493 goto restart;
1497 1494
1498 utask = current->utask; 1495 utask = current->utask;
1499 if (!utask) { 1496 if (!utask) {
1500 utask = add_utask(); 1497 utask = add_utask();
1501 /* Cannot allocate; re-execute the instruction. */ 1498 /* Cannot allocate; re-execute the instruction. */
1502 if (!utask) 1499 if (!utask)
1503 goto cleanup_ret; 1500 goto restart;
1504 } 1501 }
1505 utask->active_uprobe = uprobe; 1502
1506 handler_chain(uprobe, regs); 1503 handler_chain(uprobe, regs);
1507 if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) 1504 if (can_skip_sstep(uprobe, regs))
1508 goto cleanup_ret; 1505 goto out;
1509 1506
1510 utask->state = UTASK_SSTEP;
1511 if (!pre_ssout(uprobe, regs, bp_vaddr)) { 1507 if (!pre_ssout(uprobe, regs, bp_vaddr)) {
1512 user_enable_single_step(current); 1508 utask->active_uprobe = uprobe;
1509 utask->state = UTASK_SSTEP;
1513 return; 1510 return;
1514 } 1511 }
1515 1512
1516cleanup_ret: 1513restart:
1517 if (utask) { 1514 /*
1518 utask->active_uprobe = NULL; 1515 * cannot singlestep; cannot skip instruction;
1519 utask->state = UTASK_RUNNING; 1516 * re-execute the instruction.
1520 } 1517 */
1521 if (uprobe) { 1518 instruction_pointer_set(regs, bp_vaddr);
1522 if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) 1519out:
1523 1520 put_uprobe(uprobe);
1524 /*
1525 * cannot singlestep; cannot skip instruction;
1526 * re-execute the instruction.
1527 */
1528 instruction_pointer_set(regs, bp_vaddr);
1529
1530 put_uprobe(uprobe);
1531 }
1532} 1521}
1533 1522
1534/* 1523/*
@@ -1550,7 +1539,6 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1550 put_uprobe(uprobe); 1539 put_uprobe(uprobe);
1551 utask->active_uprobe = NULL; 1540 utask->active_uprobe = NULL;
1552 utask->state = UTASK_RUNNING; 1541 utask->state = UTASK_RUNNING;
1553 user_disable_single_step(current);
1554 xol_free_insn_slot(current); 1542 xol_free_insn_slot(current);
1555 1543
1556 spin_lock_irq(&current->sighand->siglock); 1544 spin_lock_irq(&current->sighand->siglock);
@@ -1559,13 +1547,12 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1559} 1547}
1560 1548
1561/* 1549/*
1562 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on 1550 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
1563 * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and 1551 * allows the thread to return from interrupt. After that handle_swbp()
1564 * allows the thread to return from interrupt. 1552 * sets utask->active_uprobe.
1565 * 1553 *
1566 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and 1554 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
1567 * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from 1555 * and allows the thread to return from interrupt.
1568 * interrupt.
1569 * 1556 *
1570 * While returning to userspace, thread notices the TIF_UPROBE flag and calls 1557 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
1571 * uprobe_notify_resume(). 1558 * uprobe_notify_resume().
@@ -1574,11 +1561,13 @@ void uprobe_notify_resume(struct pt_regs *regs)
1574{ 1561{
1575 struct uprobe_task *utask; 1562 struct uprobe_task *utask;
1576 1563
1564 clear_thread_flag(TIF_UPROBE);
1565
1577 utask = current->utask; 1566 utask = current->utask;
1578 if (!utask || utask->state == UTASK_BP_HIT) 1567 if (utask && utask->active_uprobe)
1579 handle_swbp(regs);
1580 else
1581 handle_singlestep(utask, regs); 1568 handle_singlestep(utask, regs);
1569 else
1570 handle_swbp(regs);
1582} 1571}
1583 1572
1584/* 1573/*
@@ -1587,18 +1576,10 @@ void uprobe_notify_resume(struct pt_regs *regs)
1587 */ 1576 */
1588int uprobe_pre_sstep_notifier(struct pt_regs *regs) 1577int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1589{ 1578{
1590 struct uprobe_task *utask; 1579 if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
1591
1592 if (!current->mm || !atomic_read(&current->mm->uprobes_state.count))
1593 /* task is currently not uprobed */
1594 return 0; 1580 return 0;
1595 1581
1596 utask = current->utask;
1597 if (utask)
1598 utask->state = UTASK_BP_HIT;
1599
1600 set_thread_flag(TIF_UPROBE); 1582 set_thread_flag(TIF_UPROBE);
1601
1602 return 1; 1583 return 1;
1603} 1584}
1604 1585
@@ -1633,6 +1614,9 @@ static int __init init_uprobes(void)
1633 mutex_init(&uprobes_mmap_mutex[i]); 1614 mutex_init(&uprobes_mmap_mutex[i]);
1634 } 1615 }
1635 1616
1617 if (percpu_init_rwsem(&dup_mmap_sem))
1618 return -ENOMEM;
1619
1636 return register_die_notifier(&uprobe_exception_nb); 1620 return register_die_notifier(&uprobe_exception_nb);
1637} 1621}
1638module_init(init_uprobes); 1622module_init(init_uprobes);
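Several of the uprobes hunks above rely on a "prepare once, publish with a flag" idiom: prepare_uprobe() copies and analyzes the instruction under copy_mutex, issues smp_wmb(), then sets UPROBE_COPY_INSN, and the lockless reader in handle_swbp() pairs that with smp_rmb() before trusting the copied data. The sketch below shows that shape with generic names; it is not the uprobes code itself, and it assumes prep_mutex is mutex_init()'ed when the item is created.

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/types.h>

#define ITEM_PREPARED   0

struct item {
        unsigned long   flags;
        struct mutex    prep_mutex;     /* assumed mutex_init()'ed at creation */
        char            payload[64];
};

/* Writer side: fill the payload once, then publish the PREPARED bit. */
static int item_prepare(struct item *it, const char *src, size_t len)
{
        int ret = 0;

        if (test_bit(ITEM_PREPARED, &it->flags))
                return 0;

        mutex_lock(&it->prep_mutex);
        if (test_bit(ITEM_PREPARED, &it->flags))        /* lost the race: done */
                goto out;

        if (len > sizeof(it->payload)) {
                ret = -EINVAL;
                goto out;
        }
        memcpy(it->payload, src, len);

        smp_wmb();      /* payload stores must be visible before the flag */
        set_bit(ITEM_PREPARED, &it->flags);
out:
        mutex_unlock(&it->prep_mutex);
        return ret;
}

/* Lockless reader fast path; pairs with the smp_wmb() above. */
static bool item_ready(struct item *it)
{
        if (!test_bit(ITEM_PREPARED, &it->flags))
                return false;
        smp_rmb();      /* flag seen set => subsequent payload reads are safe */
        return true;
}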
diff --git a/kernel/exit.c b/kernel/exit.c
index f65345f9e5bb..b4df21937216 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /*
76 * If we are the last child process in a pid namespace to be
77 * reaped, notify the reaper sleeping zap_pid_ns_processes().
78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent;
81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING))
85 wake_up_process(parent);
86 }
87 } 75 }
88 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
89} 77}
@@ -322,43 +310,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
322 } 310 }
323} 311}
324 312
325/**
326 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
327 *
328 * If a kernel thread is launched as a result of a system call, or if
329 * it ever exits, it should generally reparent itself to kthreadd so it
330 * isn't in the way of other processes and is correctly cleaned up on exit.
331 *
332 * The various task state such as scheduling policy and priority may have
333 * been inherited from a user process, so we reset them to sane values here.
334 *
335 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
336 */
337static void reparent_to_kthreadd(void)
338{
339 write_lock_irq(&tasklist_lock);
340
341 ptrace_unlink(current);
342 /* Reparent to init */
343 current->real_parent = current->parent = kthreadd_task;
344 list_move_tail(&current->sibling, &current->real_parent->children);
345
346 /* Set the exit signal to SIGCHLD so we signal init on exit */
347 current->exit_signal = SIGCHLD;
348
349 if (task_nice(current) < 0)
350 set_user_nice(current, 0);
351 /* cpus_allowed? */
352 /* rt_priority? */
353 /* signals? */
354 memcpy(current->signal->rlim, init_task.signal->rlim,
355 sizeof(current->signal->rlim));
356
357 atomic_inc(&init_cred.usage);
358 commit_creds(&init_cred);
359 write_unlock_irq(&tasklist_lock);
360}
361
362void __set_special_pids(struct pid *pid) 313void __set_special_pids(struct pid *pid)
363{ 314{
364 struct task_struct *curr = current->group_leader; 315 struct task_struct *curr = current->group_leader;
@@ -370,13 +321,6 @@ void __set_special_pids(struct pid *pid)
370 change_pid(curr, PIDTYPE_PGID, pid); 321 change_pid(curr, PIDTYPE_PGID, pid);
371} 322}
372 323
373static void set_special_pids(struct pid *pid)
374{
375 write_lock_irq(&tasklist_lock);
376 __set_special_pids(pid);
377 write_unlock_irq(&tasklist_lock);
378}
379
380/* 324/*
381 * Let kernel threads use this to say that they allow a certain signal. 325 * Let kernel threads use this to say that they allow a certain signal.
382 * Must not be used if kthread was cloned with CLONE_SIGHAND. 326 * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -416,149 +360,6 @@ int disallow_signal(int sig)
416 360
417EXPORT_SYMBOL(disallow_signal); 361EXPORT_SYMBOL(disallow_signal);
418 362
419/*
420 * Put all the gunge required to become a kernel thread without
421 * attached user resources in one place where it belongs.
422 */
423
424void daemonize(const char *name, ...)
425{
426 va_list args;
427 sigset_t blocked;
428
429 va_start(args, name);
430 vsnprintf(current->comm, sizeof(current->comm), name, args);
431 va_end(args);
432
433 /*
434 * If we were started as result of loading a module, close all of the
435 * user space pages. We don't need them, and if we didn't close them
436 * they would be locked into memory.
437 */
438 exit_mm(current);
439 /*
440 * We don't want to get frozen, in case system-wide hibernation
441 * or suspend transition begins right now.
442 */
443 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
444
445 if (current->nsproxy != &init_nsproxy) {
446 get_nsproxy(&init_nsproxy);
447 switch_task_namespaces(current, &init_nsproxy);
448 }
449 set_special_pids(&init_struct_pid);
450 proc_clear_tty(current);
451
452 /* Block and flush all signals */
453 sigfillset(&blocked);
454 sigprocmask(SIG_BLOCK, &blocked, NULL);
455 flush_signals(current);
456
457 /* Become as one with the init task */
458
459 daemonize_fs_struct();
460 exit_files(current);
461 current->files = init_task.files;
462 atomic_inc(&current->files->count);
463
464 reparent_to_kthreadd();
465}
466
467EXPORT_SYMBOL(daemonize);
468
469static void close_files(struct files_struct * files)
470{
471 int i, j;
472 struct fdtable *fdt;
473
474 j = 0;
475
476 /*
477 * It is safe to dereference the fd table without RCU or
478 * ->file_lock because this is the last reference to the
479 * files structure. But use RCU to shut RCU-lockdep up.
480 */
481 rcu_read_lock();
482 fdt = files_fdtable(files);
483 rcu_read_unlock();
484 for (;;) {
485 unsigned long set;
486 i = j * BITS_PER_LONG;
487 if (i >= fdt->max_fds)
488 break;
489 set = fdt->open_fds[j++];
490 while (set) {
491 if (set & 1) {
492 struct file * file = xchg(&fdt->fd[i], NULL);
493 if (file) {
494 filp_close(file, files);
495 cond_resched();
496 }
497 }
498 i++;
499 set >>= 1;
500 }
501 }
502}
503
504struct files_struct *get_files_struct(struct task_struct *task)
505{
506 struct files_struct *files;
507
508 task_lock(task);
509 files = task->files;
510 if (files)
511 atomic_inc(&files->count);
512 task_unlock(task);
513
514 return files;
515}
516
517void put_files_struct(struct files_struct *files)
518{
519 struct fdtable *fdt;
520
521 if (atomic_dec_and_test(&files->count)) {
522 close_files(files);
523 /*
524 * Free the fd and fdset arrays if we expanded them.
525 * If the fdtable was embedded, pass files for freeing
526 * at the end of the RCU grace period. Otherwise,
527 * you can free files immediately.
528 */
529 rcu_read_lock();
530 fdt = files_fdtable(files);
531 if (fdt != &files->fdtab)
532 kmem_cache_free(files_cachep, files);
533 free_fdtable(fdt);
534 rcu_read_unlock();
535 }
536}
537
538void reset_files_struct(struct files_struct *files)
539{
540 struct task_struct *tsk = current;
541 struct files_struct *old;
542
543 old = tsk->files;
544 task_lock(tsk);
545 tsk->files = files;
546 task_unlock(tsk);
547 put_files_struct(old);
548}
549
550void exit_files(struct task_struct *tsk)
551{
552 struct files_struct * files = tsk->files;
553
554 if (files) {
555 task_lock(tsk);
556 tsk->files = NULL;
557 task_unlock(tsk);
558 put_files_struct(files);
559 }
560}
561
562#ifdef CONFIG_MM_OWNER 363#ifdef CONFIG_MM_OWNER
563/* 364/*
564 * A task is exiting. If it owned this mm, find a new owner for the mm. 365 * A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -1046,6 +847,9 @@ void do_exit(long code)
1046 if (tsk->splice_pipe) 847 if (tsk->splice_pipe)
1047 __free_pipe_info(tsk->splice_pipe); 848 __free_pipe_info(tsk->splice_pipe);
1048 849
850 if (tsk->task_frag.page)
851 put_page(tsk->task_frag.page);
852
1049 validate_creds_for_do_exit(tsk); 853 validate_creds_for_do_exit(tsk);
1050 854
1051 preempt_disable(); 855 preempt_disable();
@@ -1278,11 +1082,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1278 * as other threads in the parent group can be right 1082 * as other threads in the parent group can be right
1279 * here reaping other children at the same time. 1083 * here reaping other children at the same time.
1280 * 1084 *
1281 * We use thread_group_times() to get times for the thread 1085 * We use thread_group_cputime_adjusted() to get times for the thread
1282 * group, which consolidates times for all threads in the 1086 * group, which consolidates times for all threads in the
1283 * group including the group leader. 1087 * group including the group leader.
1284 */ 1088 */
1285 thread_group_times(p, &tgutime, &tgstime); 1089 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1286 spin_lock_irq(&p->real_parent->sighand->siglock); 1090 spin_lock_irq(&p->real_parent->sighand->siglock);
1287 psig = p->real_parent->signal; 1091 psig = p->real_parent->signal;
1288 sig = p->signal; 1092 sig = p->signal;
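The exit.c hunk above swaps thread_group_times() for thread_group_cputime_adjusted(), which returns the scaling-adjusted user/system time aggregated over the whole thread group, including already-dead threads folded into the signal struct. A hedged sketch of how such an aggregate is typically consumed; the helper name and the printout are invented for illustration, only thread_group_cputime_adjusted() itself is taken from the patch.

#include <linux/printk.h>
#include <linux/sched.h>

/* Sketch: report a child's whole-group CPU time while it is being reaped. */
static void report_child_times(struct task_struct *p)
{
        cputime_t ut, st;

        /* Aggregates all threads of p's group, with scaling adjustment applied. */
        thread_group_cputime_adjusted(p, &ut, &st);

        pr_info("pid %d: utime %lu stime %lu (cputime_t units)\n",
                task_pid_nr(p), (unsigned long)ut, (unsigned long)st);
}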
diff --git a/kernel/fork.c b/kernel/fork.c
index 2c8857e12855..a31b823b3c2d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti)
146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
147 int node) 147 int node)
148{ 148{
149 struct page *page = alloc_pages_node(node, THREADINFO_GFP, 149 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
150 THREAD_SIZE_ORDER); 150 THREAD_SIZE_ORDER);
151 151
152 return page ? page_address(page) : NULL; 152 return page ? page_address(page) : NULL;
@@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
154 154
155static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
156{ 156{
157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
158} 158}
159# else 159# else
160static struct kmem_cache *thread_info_cache; 160static struct kmem_cache *thread_info_cache;
@@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
330 tsk->btrace_seq = 0; 330 tsk->btrace_seq = 0;
331#endif 331#endif
332 tsk->splice_pipe = NULL; 332 tsk->splice_pipe = NULL;
333 tsk->task_frag.page = NULL;
333 334
334 account_kernel_stack(ti, 1); 335 account_kernel_stack(ti, 1);
335 336
@@ -351,8 +352,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
351 unsigned long charge; 352 unsigned long charge;
352 struct mempolicy *pol; 353 struct mempolicy *pol;
353 354
355 uprobe_start_dup_mmap();
354 down_write(&oldmm->mmap_sem); 356 down_write(&oldmm->mmap_sem);
355 flush_cache_dup_mm(oldmm); 357 flush_cache_dup_mm(oldmm);
358 uprobe_dup_mmap(oldmm, mm);
356 /* 359 /*
357 * Not linked in yet - no deadlock potential: 360 * Not linked in yet - no deadlock potential:
358 */ 361 */
@@ -421,7 +424,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
421 mapping->i_mmap_writable++; 424 mapping->i_mmap_writable++;
422 flush_dcache_mmap_lock(mapping); 425 flush_dcache_mmap_lock(mapping);
423 /* insert tmp into the share list, just after mpnt */ 426 /* insert tmp into the share list, just after mpnt */
424 vma_prio_tree_add(tmp, mpnt); 427 if (unlikely(tmp->vm_flags & VM_NONLINEAR))
428 vma_nonlinear_insert(tmp,
429 &mapping->i_mmap_nonlinear);
430 else
431 vma_interval_tree_insert_after(tmp, mpnt,
432 &mapping->i_mmap);
425 flush_dcache_mmap_unlock(mapping); 433 flush_dcache_mmap_unlock(mapping);
426 mutex_unlock(&mapping->i_mmap_mutex); 434 mutex_unlock(&mapping->i_mmap_mutex);
427 } 435 }
@@ -454,9 +462,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
454 462
455 if (retval) 463 if (retval)
456 goto out; 464 goto out;
457
458 if (file)
459 uprobe_mmap(tmp);
460 } 465 }
461 /* a new mm has just been created */ 466 /* a new mm has just been created */
462 arch_dup_mmap(oldmm, mm); 467 arch_dup_mmap(oldmm, mm);
@@ -465,6 +470,7 @@ out:
465 up_write(&mm->mmap_sem); 470 up_write(&mm->mmap_sem);
466 flush_tlb_mm(oldmm); 471 flush_tlb_mm(oldmm);
467 up_write(&oldmm->mmap_sem); 472 up_write(&oldmm->mmap_sem);
473 uprobe_end_dup_mmap();
468 return retval; 474 return retval;
469fail_nomem_anon_vma_fork: 475fail_nomem_anon_vma_fork:
470 mpol_put(pol); 476 mpol_put(pol);
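The dup_mmap() changes above pair with the uprobes side earlier in this patch: every fork takes the cheap per-cpu reader side of dup_mmap_sem via uprobe_start_dup_mmap()/uprobe_end_dup_mmap(), while the rare register_for_each_vma() path takes the writer side and excludes all concurrent forks at once. A generic sketch of that percpu_rw_semaphore pattern, with placeholder names; only the percpu_*rwsem calls themselves are the APIs used in the patch.

#include <linux/init.h>
#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore state_sem;

/* Hot path (e.g. every fork): cheap per-cpu reader. */
static void frequent_path(void)
{
        percpu_down_read(&state_sem);
        /* ... duplicate state that must not change mid-copy ... */
        percpu_up_read(&state_sem);
}

/* Rare path (e.g. probe (un)registration): excludes all readers at once. */
static void rare_path(void)
{
        percpu_down_write(&state_sem);
        /* ... walk every mm and install/remove breakpoints ... */
        percpu_up_write(&state_sem);
}

static int __init state_sem_init(void)
{
        /* percpu_init_rwsem() can fail (per-cpu allocation), as in init_uprobes(). */
        return percpu_init_rwsem(&state_sem) ? -ENOMEM : 0;
}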
@@ -623,26 +629,6 @@ void mmput(struct mm_struct *mm)
623} 629}
624EXPORT_SYMBOL_GPL(mmput); 630EXPORT_SYMBOL_GPL(mmput);
625 631
626/*
627 * We added or removed a vma mapping the executable. The vmas are only mapped
628 * during exec and are not mapped with the mmap system call.
629 * Callers must hold down_write() on the mm's mmap_sem for these
630 */
631void added_exe_file_vma(struct mm_struct *mm)
632{
633 mm->num_exe_file_vmas++;
634}
635
636void removed_exe_file_vma(struct mm_struct *mm)
637{
638 mm->num_exe_file_vmas--;
639 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
640 fput(mm->exe_file);
641 mm->exe_file = NULL;
642 }
643
644}
645
646void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) 632void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
647{ 633{
648 if (new_exe_file) 634 if (new_exe_file)
@@ -650,15 +636,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
650 if (mm->exe_file) 636 if (mm->exe_file)
651 fput(mm->exe_file); 637 fput(mm->exe_file);
652 mm->exe_file = new_exe_file; 638 mm->exe_file = new_exe_file;
653 mm->num_exe_file_vmas = 0;
654} 639}
655 640
656struct file *get_mm_exe_file(struct mm_struct *mm) 641struct file *get_mm_exe_file(struct mm_struct *mm)
657{ 642{
658 struct file *exe_file; 643 struct file *exe_file;
659 644
660 /* We need mmap_sem to protect against races with removal of 645 /* We need mmap_sem to protect against races with removal of exe_file */
661 * VM_EXECUTABLE vmas */
662 down_read(&mm->mmap_sem); 646 down_read(&mm->mmap_sem);
663 exe_file = mm->exe_file; 647 exe_file = mm->exe_file;
664 if (exe_file) 648 if (exe_file)
@@ -839,8 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
839#ifdef CONFIG_TRANSPARENT_HUGEPAGE 823#ifdef CONFIG_TRANSPARENT_HUGEPAGE
840 mm->pmd_huge_pte = NULL; 824 mm->pmd_huge_pte = NULL;
841#endif 825#endif
842 uprobe_reset_state(mm); 826#ifdef CONFIG_NUMA_BALANCING
843 827 mm->first_nid = NUMA_PTE_SCAN_INIT;
828#endif
844 if (!mm_init(mm, tsk)) 829 if (!mm_init(mm, tsk))
845 goto fail_nomem; 830 goto fail_nomem;
846 831
@@ -1059,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1059 atomic_set(&sig->live, 1); 1044 atomic_set(&sig->live, 1);
1060 atomic_set(&sig->sigcnt, 1); 1045 atomic_set(&sig->sigcnt, 1);
1061 init_waitqueue_head(&sig->wait_chldexit); 1046 init_waitqueue_head(&sig->wait_chldexit);
1062 if (clone_flags & CLONE_NEWPID)
1063 sig->flags |= SIGNAL_UNKILLABLE;
1064 sig->curr_target = tsk; 1047 sig->curr_target = tsk;
1065 init_sigpending(&sig->shared_pending); 1048 init_sigpending(&sig->shared_pending);
1066 INIT_LIST_HEAD(&sig->posix_timers); 1049 INIT_LIST_HEAD(&sig->posix_timers);
@@ -1081,7 +1064,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1081 init_rwsem(&sig->group_rwsem); 1064 init_rwsem(&sig->group_rwsem);
1082#endif 1065#endif
1083 1066
1084 sig->oom_adj = current->signal->oom_adj;
1085 sig->oom_score_adj = current->signal->oom_score_adj; 1067 sig->oom_score_adj = current->signal->oom_score_adj;
1086 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1068 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1087 1069
@@ -1148,7 +1130,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
1148 */ 1130 */
1149static struct task_struct *copy_process(unsigned long clone_flags, 1131static struct task_struct *copy_process(unsigned long clone_flags,
1150 unsigned long stack_start, 1132 unsigned long stack_start,
1151 struct pt_regs *regs,
1152 unsigned long stack_size, 1133 unsigned long stack_size,
1153 int __user *child_tidptr, 1134 int __user *child_tidptr,
1154 struct pid *pid, 1135 struct pid *pid,
@@ -1156,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1156{ 1137{
1157 int retval; 1138 int retval;
1158 struct task_struct *p; 1139 struct task_struct *p;
1159 int cgroup_callbacks_done = 0;
1160 1140
1161 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1162 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
@@ -1243,7 +1223,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1243 p->utime = p->stime = p->gtime = 0; 1223 p->utime = p->stime = p->gtime = 0;
1244 p->utimescaled = p->stimescaled = 0; 1224 p->utimescaled = p->stimescaled = 0;
1245#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1225#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1246 p->prev_utime = p->prev_stime = 0; 1226 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1247#endif 1227#endif
1248#if defined(SPLIT_RSS_COUNTING) 1228#if defined(SPLIT_RSS_COUNTING)
1249 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1229 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1280,11 +1260,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1280#endif 1260#endif
1281#ifdef CONFIG_TRACE_IRQFLAGS 1261#ifdef CONFIG_TRACE_IRQFLAGS
1282 p->irq_events = 0; 1262 p->irq_events = 0;
1283#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1284 p->hardirqs_enabled = 1;
1285#else
1286 p->hardirqs_enabled = 0; 1263 p->hardirqs_enabled = 0;
1287#endif
1288 p->hardirq_enable_ip = 0; 1264 p->hardirq_enable_ip = 0;
1289 p->hardirq_enable_event = 0; 1265 p->hardirq_enable_event = 0;
1290 p->hardirq_disable_ip = _THIS_IP_; 1266 p->hardirq_disable_ip = _THIS_IP_;
@@ -1345,7 +1321,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1345 retval = copy_io(clone_flags, p); 1321 retval = copy_io(clone_flags, p);
1346 if (retval) 1322 if (retval)
1347 goto bad_fork_cleanup_namespaces; 1323 goto bad_fork_cleanup_namespaces;
1348 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); 1324 retval = copy_thread(clone_flags, stack_start, stack_size, p);
1349 if (retval) 1325 if (retval)
1350 goto bad_fork_cleanup_io; 1326 goto bad_fork_cleanup_io;
1351 1327
@@ -1418,12 +1394,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1418 INIT_LIST_HEAD(&p->thread_group); 1394 INIT_LIST_HEAD(&p->thread_group);
1419 p->task_works = NULL; 1395 p->task_works = NULL;
1420 1396
1421 /* Now that the task is set up, run cgroup callbacks if
1422 * necessary. We need to run them before the task is visible
1423 * on the tasklist. */
1424 cgroup_fork_callbacks(p);
1425 cgroup_callbacks_done = 1;
1426
1427 /* Need tasklist lock for parent etc handling! */ 1397 /* Need tasklist lock for parent etc handling! */
1428 write_lock_irq(&tasklist_lock); 1398 write_lock_irq(&tasklist_lock);
1429 1399
@@ -1466,8 +1436,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1466 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1436 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1467 1437
1468 if (thread_group_leader(p)) { 1438 if (thread_group_leader(p)) {
1469 if (is_child_reaper(pid)) 1439 if (is_child_reaper(pid)) {
1470 p->nsproxy->pid_ns->child_reaper = p; 1440 ns_of_pid(pid)->child_reaper = p;
1441 p->signal->flags |= SIGNAL_UNKILLABLE;
1442 }
1471 1443
1472 p->signal->leader_pid = pid; 1444 p->signal->leader_pid = pid;
1473 p->signal->tty = tty_kref_get(current->signal->tty); 1445 p->signal->tty = tty_kref_get(current->signal->tty);
@@ -1501,8 +1473,6 @@ bad_fork_cleanup_io:
1501 if (p->io_context) 1473 if (p->io_context)
1502 exit_io_context(p); 1474 exit_io_context(p);
1503bad_fork_cleanup_namespaces: 1475bad_fork_cleanup_namespaces:
1504 if (unlikely(clone_flags & CLONE_NEWPID))
1505 pid_ns_release_proc(p->nsproxy->pid_ns);
1506 exit_task_namespaces(p); 1476 exit_task_namespaces(p);
1507bad_fork_cleanup_mm: 1477bad_fork_cleanup_mm:
1508 if (p->mm) 1478 if (p->mm)
@@ -1528,7 +1498,7 @@ bad_fork_cleanup_cgroup:
1528#endif 1498#endif
1529 if (clone_flags & CLONE_THREAD) 1499 if (clone_flags & CLONE_THREAD)
1530 threadgroup_change_end(current); 1500 threadgroup_change_end(current);
1531 cgroup_exit(p, cgroup_callbacks_done);
1501 cgroup_exit(p, 0);
1532 delayacct_tsk_free(p); 1502 delayacct_tsk_free(p);
1533 module_put(task_thread_info(p)->exec_domain->module); 1503 module_put(task_thread_info(p)->exec_domain->module);
1534bad_fork_cleanup_count: 1504bad_fork_cleanup_count:
@@ -1540,12 +1510,6 @@ fork_out:
1540 return ERR_PTR(retval); 1510 return ERR_PTR(retval);
1541} 1511}
1542 1512
1543noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1544{
1545 memset(regs, 0, sizeof(struct pt_regs));
1546 return regs;
1547}
1548
1549static inline void init_idle_pids(struct pid_link *links) 1513static inline void init_idle_pids(struct pid_link *links)
1550{ 1514{
1551 enum pid_type type; 1515 enum pid_type type;
@@ -1559,10 +1523,7 @@ static inline void init_idle_pids(struct pid_link *links)
1559struct task_struct * __cpuinit fork_idle(int cpu) 1523struct task_struct * __cpuinit fork_idle(int cpu)
1560{ 1524{
1561 struct task_struct *task; 1525 struct task_struct *task;
1562 struct pt_regs regs;
1563
1564 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1565 &init_struct_pid, 0);
1526 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
1566 if (!IS_ERR(task)) { 1527 if (!IS_ERR(task)) {
1567 init_idle_pids(task->pids); 1528 init_idle_pids(task->pids);
1568 init_idle(task, cpu); 1529 init_idle(task, cpu);
@@ -1579,7 +1540,6 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1579 */ 1540 */
1580long do_fork(unsigned long clone_flags, 1541long do_fork(unsigned long clone_flags,
1581 unsigned long stack_start, 1542 unsigned long stack_start,
1582 struct pt_regs *regs,
1583 unsigned long stack_size, 1543 unsigned long stack_size,
1584 int __user *parent_tidptr, 1544 int __user *parent_tidptr,
1585 int __user *child_tidptr) 1545 int __user *child_tidptr)
@@ -1592,15 +1552,9 @@ long do_fork(unsigned long clone_flags,
1592 * Do some preliminary argument and permissions checking before we 1552 * Do some preliminary argument and permissions checking before we
1593 * actually start allocating stuff 1553 * actually start allocating stuff
1594 */ 1554 */
1595 if (clone_flags & CLONE_NEWUSER) {
1596 if (clone_flags & CLONE_THREAD)
1555 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1556 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1597 return -EINVAL; 1557 return -EINVAL;
1598 /* hopefully this check will go away when userns support is
1599 * complete
1600 */
1601 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1602 !capable(CAP_SETGID))
1603 return -EPERM;
1604 } 1558 }
1605 1559
1606 /* 1560 /*
@@ -1609,7 +1563,7 @@ long do_fork(unsigned long clone_flags,
1609 * requested, no event is reported; otherwise, report if the event 1563 * requested, no event is reported; otherwise, report if the event
1610 * for the type of forking is enabled. 1564 * for the type of forking is enabled.
1611 */ 1565 */
1612 if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
1566 if (!(clone_flags & CLONE_UNTRACED)) {
1613 if (clone_flags & CLONE_VFORK) 1567 if (clone_flags & CLONE_VFORK)
1614 trace = PTRACE_EVENT_VFORK; 1568 trace = PTRACE_EVENT_VFORK;
1615 else if ((clone_flags & CSIGNAL) != SIGCHLD) 1569 else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1621,7 +1575,7 @@ long do_fork(unsigned long clone_flags,
1621 trace = 0; 1575 trace = 0;
1622 } 1576 }
1623 1577
1624 p = copy_process(clone_flags, stack_start, regs, stack_size,
1578 p = copy_process(clone_flags, stack_start, stack_size,
1625 child_tidptr, NULL, trace); 1579 child_tidptr, NULL, trace);
1626 /* 1580 /*
1627 * Do this prior waking up the new thread - the thread pointer 1581 * Do this prior waking up the new thread - the thread pointer
@@ -1659,6 +1613,58 @@ long do_fork(unsigned long clone_flags,
1659 return nr; 1613 return nr;
1660} 1614}
1661 1615
1616/*
1617 * Create a kernel thread.
1618 */
1619pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
1620{
1621 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
1622 (unsigned long)arg, NULL, NULL);
1623}
1624
1625#ifdef __ARCH_WANT_SYS_FORK
1626SYSCALL_DEFINE0(fork)
1627{
1628#ifdef CONFIG_MMU
1629 return do_fork(SIGCHLD, 0, 0, NULL, NULL);
1630#else
1631 /* can not support in nommu mode */
1632 return(-EINVAL);
1633#endif
1634}
1635#endif
1636
1637#ifdef __ARCH_WANT_SYS_VFORK
1638SYSCALL_DEFINE0(vfork)
1639{
1640 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
1641 0, NULL, NULL);
1642}
1643#endif
1644
1645#ifdef __ARCH_WANT_SYS_CLONE
1646#ifdef CONFIG_CLONE_BACKWARDS
1647SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1648 int __user *, parent_tidptr,
1649 int, tls_val,
1650 int __user *, child_tidptr)
1651#elif defined(CONFIG_CLONE_BACKWARDS2)
1652SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1653 int __user *, parent_tidptr,
1654 int __user *, child_tidptr,
1655 int, tls_val)
1656#else
1657SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1658 int __user *, parent_tidptr,
1659 int __user *, child_tidptr,
1660 int, tls_val)
1661#endif
1662{
1663 return do_fork(clone_flags, newsp, 0,
1664 parent_tidptr, child_tidptr);
1665}
1666#endif
1667
1662#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1668#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1663#define ARCH_MIN_MMSTRUCT_ALIGN 0 1669#define ARCH_MIN_MMSTRUCT_ALIGN 0
1664#endif 1670#endif
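With the pt_regs argument gone, kernel_thread() above is plain C and arch-independent. A minimal sketch of module code that ends up on this path through the kthread API; the demo_* names are illustrative, not part of this patch:

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>

static int demo_thread_fn(void *data)
{
	/* The thread itself was created via kernel_thread() -> do_fork(). */
	while (!kthread_should_stop())
		msleep(100);
	return 0;
}

static struct task_struct *demo_start_thread(void)
{
	/* kthreadd spawns this thread using the regs-less do_fork() above. */
	return kthread_run(demo_thread_fn, NULL, "demo");
}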
@@ -1708,7 +1714,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
1708{ 1714{
1709 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1715 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1710 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1716 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1711 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1717 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
1718 CLONE_NEWUSER|CLONE_NEWPID))
1712 return -EINVAL; 1719 return -EINVAL;
1713 /* 1720 /*
1714 * Not implemented, but pretend it works if there is nothing to 1721 * Not implemented, but pretend it works if there is nothing to
@@ -1775,19 +1782,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1775{ 1782{
1776 struct fs_struct *fs, *new_fs = NULL; 1783 struct fs_struct *fs, *new_fs = NULL;
1777 struct files_struct *fd, *new_fd = NULL; 1784 struct files_struct *fd, *new_fd = NULL;
1785 struct cred *new_cred = NULL;
1778 struct nsproxy *new_nsproxy = NULL; 1786 struct nsproxy *new_nsproxy = NULL;
1779 int do_sysvsem = 0; 1787 int do_sysvsem = 0;
1780 int err; 1788 int err;
1781 1789
1782 err = check_unshare_flags(unshare_flags);
1783 if (err)
1784 goto bad_unshare_out;
1785
1790 /*
1791 * If unsharing a user namespace must also unshare the thread.
1792 */
1793 if (unshare_flags & CLONE_NEWUSER)
1794 unshare_flags |= CLONE_THREAD;
1795 /*
1796 * If unsharing a pid namespace must also unshare the thread.
1797 */
1798 if (unshare_flags & CLONE_NEWPID)
1799 unshare_flags |= CLONE_THREAD;
1800 /*
1801 * If unsharing a thread from a thread group, must also unshare vm.
1802 */
1803 if (unshare_flags & CLONE_THREAD)
1804 unshare_flags |= CLONE_VM;
1805 /*
1806 * If unsharing vm, must also unshare signal handlers.
1807 */
1808 if (unshare_flags & CLONE_VM)
1809 unshare_flags |= CLONE_SIGHAND;
1786 /* 1810 /*
1787 * If unsharing namespace, must also unshare filesystem information. 1811 * If unsharing namespace, must also unshare filesystem information.
1788 */ 1812 */
1789 if (unshare_flags & CLONE_NEWNS) 1813 if (unshare_flags & CLONE_NEWNS)
1790 unshare_flags |= CLONE_FS; 1814 unshare_flags |= CLONE_FS;
1815
1816 err = check_unshare_flags(unshare_flags);
1817 if (err)
1818 goto bad_unshare_out;
1791 /* 1819 /*
1792 * CLONE_NEWIPC must also detach from the undolist: after switching 1820 * CLONE_NEWIPC must also detach from the undolist: after switching
1793 * to a new ipc namespace, the semaphore arrays from the old 1821 * to a new ipc namespace, the semaphore arrays from the old
@@ -1801,11 +1829,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1801 err = unshare_fd(unshare_flags, &new_fd); 1829 err = unshare_fd(unshare_flags, &new_fd);
1802 if (err) 1830 if (err)
1803 goto bad_unshare_cleanup_fs; 1831 goto bad_unshare_cleanup_fs;
1804 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
1832 err = unshare_userns(unshare_flags, &new_cred);
1805 if (err) 1833 if (err)
1806 goto bad_unshare_cleanup_fd; 1834 goto bad_unshare_cleanup_fd;
1835 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1836 new_cred, new_fs);
1837 if (err)
1838 goto bad_unshare_cleanup_cred;
1807 1839
1808 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
1840 if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
1809 if (do_sysvsem) { 1841 if (do_sysvsem) {
1810 /* 1842 /*
1811 * CLONE_SYSVSEM is equivalent to sys_exit(). 1843 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1838,11 +1870,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1838 } 1870 }
1839 1871
1840 task_unlock(current); 1872 task_unlock(current);
1873
1874 if (new_cred) {
1875 /* Install the new user namespace */
1876 commit_creds(new_cred);
1877 new_cred = NULL;
1878 }
1841 } 1879 }
1842 1880
1843 if (new_nsproxy) 1881 if (new_nsproxy)
1844 put_nsproxy(new_nsproxy); 1882 put_nsproxy(new_nsproxy);
1845 1883
1884bad_unshare_cleanup_cred:
1885 if (new_cred)
1886 put_cred(new_cred);
1846bad_unshare_cleanup_fd: 1887bad_unshare_cleanup_fd:
1847 if (new_fd) 1888 if (new_fd)
1848 put_files_struct(new_fd); 1889 put_files_struct(new_fd);
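A userspace sketch of what the reordered flag implications above mean for sys_unshare(); it assumes a single-threaded caller on a kernel built with CONFIG_USER_NS and CONFIG_PID_NS:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/*
	 * CLONE_NEWUSER and CLONE_NEWPID now imply CLONE_THREAD, which in
	 * turn implies CLONE_VM and CLONE_SIGHAND, so a multi-threaded
	 * process gets -EINVAL from check_unshare_flags(); a
	 * single-threaded one succeeds.
	 */
	if (unshare(CLONE_NEWUSER | CLONE_NEWPID) != 0) {
		perror("unshare");
		return 1;
	}
	/* The new pid namespace applies to children forked after this. */
	printf("uid inside new user namespace: %u\n", getuid());
	return 0;
}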
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 11f82a4d4eae..c38893b0efba 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p)
116 return false; 116 return false;
117 } 117 }
118 118
119 if (!(p->flags & PF_KTHREAD)) {
119 if (!(p->flags & PF_KTHREAD))
120 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
121 /*
121 else
122 * fake_signal_wake_up() goes through p's scheduler
123 * lock and guarantees that TASK_STOPPED/TRACED ->
124 * TASK_RUNNING transition can't race with task state
125 * testing in try_to_freeze_tasks().
126 */
127 } else {
128 wake_up_state(p, TASK_INTERRUPTIBLE); 122 wake_up_state(p, TASK_INTERRUPTIBLE);
129 }
130 123
131 spin_unlock_irqrestore(&freezer_lock, flags); 124 spin_unlock_irqrestore(&freezer_lock, flags);
132 return true; 125 return true;
diff --git a/kernel/futex.c b/kernel/futex.c
index 3717e7b306e0..19eb089ca003 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -716,7 +716,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
716 struct futex_pi_state **ps, 716 struct futex_pi_state **ps,
717 struct task_struct *task, int set_waiters) 717 struct task_struct *task, int set_waiters)
718{ 718{
719 int lock_taken, ret, ownerdied = 0;
719 int lock_taken, ret, force_take = 0;
720 u32 uval, newval, curval, vpid = task_pid_vnr(task); 720 u32 uval, newval, curval, vpid = task_pid_vnr(task);
721 721
722retry: 722retry:
@@ -755,17 +755,15 @@ retry:
755 newval = curval | FUTEX_WAITERS; 755 newval = curval | FUTEX_WAITERS;
756 756
757 /* 757 /*
758 * There are two cases, where a futex might have no owner (the
758 * Should we force take the futex? See below.
759 * owner TID is 0): OWNER_DIED. We take over the futex in this
760 * case. We also do an unconditional take over, when the owner
761 * of the futex died.
762 *
763 * This is safe as we are protected by the hash bucket lock !
764 */ 759 */
765 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
760 if (unlikely(force_take)) {
766 /* Keep the OWNER_DIED bit */
761 /*
762 * Keep the OWNER_DIED and the WAITERS bit and set the
763 * new TID value.
764 */
767 newval = (curval & ~FUTEX_TID_MASK) | vpid; 765 newval = (curval & ~FUTEX_TID_MASK) | vpid;
768 ownerdied = 0;
766 force_take = 0;
769 lock_taken = 1; 767 lock_taken = 1;
770 } 768 }
771 769
@@ -775,7 +773,7 @@ retry:
775 goto retry; 773 goto retry;
776 774
777 /* 775 /*
778 * We took the lock due to owner died take over.
776 * We took the lock due to forced take over.
779 */ 777 */
780 if (unlikely(lock_taken)) 778 if (unlikely(lock_taken))
781 return 1; 779 return 1;
@@ -790,20 +788,25 @@ retry:
790 switch (ret) { 788 switch (ret) {
791 case -ESRCH: 789 case -ESRCH:
792 /* 790 /*
793 * No owner found for this futex. Check if the
794 * OWNER_DIED bit is set to figure out whether
795 * this is a robust futex or not.
791 * We failed to find an owner for this
792 * futex. So we have no pi_state to block
793 * on. This can happen in two cases:
794 *
795 * 1) The owner died
796 * 2) A stale FUTEX_WAITERS bit
797 *
798 * Re-read the futex value.
796 */ 799 */
797 if (get_futex_value_locked(&curval, uaddr)) 800 if (get_futex_value_locked(&curval, uaddr))
798 return -EFAULT; 801 return -EFAULT;
799 802
800 /* 803 /*
801 * We simply start over in case of a robust
802 * futex. The code above will take the futex
803 * and return happy.
804 * If the owner died or we have a stale
805 * WAITERS bit the owner TID in the user space
806 * futex is 0.
804 */ 807 */
805 if (curval & FUTEX_OWNER_DIED) {
806 ownerdied = 1;
808 if (!(curval & FUTEX_TID_MASK)) {
809 force_take = 1;
807 goto retry; 810 goto retry;
808 } 811 }
809 default: 812 default:
@@ -840,6 +843,9 @@ static void wake_futex(struct futex_q *q)
840{ 843{
841 struct task_struct *p = q->task; 844 struct task_struct *p = q->task;
842 845
846 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
847 return;
848
843 /* 849 /*
844 * We set q->lock_ptr = NULL _before_ we wake up the task. If 850 * We set q->lock_ptr = NULL _before_ we wake up the task. If
845 * a non-futex wake up happens on another CPU then the task 851 * a non-futex wake up happens on another CPU then the task
@@ -1075,6 +1081,10 @@ retry_private:
1075 1081
1076 plist_for_each_entry_safe(this, next, head, list) { 1082 plist_for_each_entry_safe(this, next, head, list) {
1077 if (match_futex (&this->key, &key1)) { 1083 if (match_futex (&this->key, &key1)) {
1084 if (this->pi_state || this->rt_waiter) {
1085 ret = -EINVAL;
1086 goto out_unlock;
1087 }
1078 wake_futex(this); 1088 wake_futex(this);
1079 if (++ret >= nr_wake) 1089 if (++ret >= nr_wake)
1080 break; 1090 break;
@@ -1087,6 +1097,10 @@ retry_private:
1087 op_ret = 0; 1097 op_ret = 0;
1088 plist_for_each_entry_safe(this, next, head, list) { 1098 plist_for_each_entry_safe(this, next, head, list) {
1089 if (match_futex (&this->key, &key2)) { 1099 if (match_futex (&this->key, &key2)) {
1100 if (this->pi_state || this->rt_waiter) {
1101 ret = -EINVAL;
1102 goto out_unlock;
1103 }
1090 wake_futex(this); 1104 wake_futex(this);
1091 if (++op_ret >= nr_wake2) 1105 if (++op_ret >= nr_wake2)
1092 break; 1106 break;
@@ -1095,6 +1109,7 @@ retry_private:
1095 ret += op_ret; 1109 ret += op_ret;
1096 } 1110 }
1097 1111
1112out_unlock:
1098 double_unlock_hb(hb1, hb2); 1113 double_unlock_hb(hb1, hb2);
1099out_put_keys: 1114out_put_keys:
1100 put_futex_key(&key2); 1115 put_futex_key(&key2);
@@ -1384,9 +1399,13 @@ retry_private:
1384 /* 1399 /*
1385 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always 1400 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1386 * be paired with each other and no other futex ops. 1401 * be paired with each other and no other futex ops.
1402 *
1403 * We should never be requeueing a futex_q with a pi_state,
1404 * which is awaiting a futex_unlock_pi().
1387 */ 1405 */
1388 if ((requeue_pi && !this->rt_waiter) || 1406 if ((requeue_pi && !this->rt_waiter) ||
1389 (!requeue_pi && this->rt_waiter)) {
1407 (!requeue_pi && this->rt_waiter) ||
1408 this->pi_state) {
1390 ret = -EINVAL; 1409 ret = -EINVAL;
1391 break; 1410 break;
1392 } 1411 }
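For reference, a small userspace sketch of the futex word layout that the force_take logic above inspects; the bit macros come from <linux/futex.h>:

#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>

static void decode_futex_word(uint32_t uval)
{
	printf("owner tid  : %u\n", uval & FUTEX_TID_MASK);
	printf("waiters bit: %s\n", (uval & FUTEX_WAITERS) ? "set" : "clear");
	printf("owner died : %s\n", (uval & FUTEX_OWNER_DIED) ? "set" : "clear");
	/*
	 * A zero TID is what futex_lock_pi_atomic() sees both when the
	 * owner died (robust cleanup) and when a stale FUTEX_WAITERS bit
	 * was left behind; both cases now go through the force_take path.
	 */
}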
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index eebd6d5cfb44..3aca9f29d30e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -272,6 +272,7 @@ void handle_nested_irq(unsigned int irq)
272 272
273 raw_spin_lock_irq(&desc->lock); 273 raw_spin_lock_irq(&desc->lock);
274 274
275 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
275 kstat_incr_irqs_this_cpu(irq, desc); 276 kstat_incr_irqs_this_cpu(irq, desc);
276 277
277 action = desc->action; 278 action = desc->action;
@@ -671,6 +672,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
671 irq_set_chip(irq, chip); 672 irq_set_chip(irq, chip);
672 __irq_set_handler(irq, handle, 0, name); 673 __irq_set_handler(irq, handle, 0, name);
673} 674}
675EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
674 676
675void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 677void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
676{ 678{
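A sketch of modular irqchip code that the new export enables; dummy_irq_chip (exported by the dummychip.c change below) stands in for a real chip, and the demo_* names are illustrative:

#include <linux/irq.h>
#include <linux/irqdomain.h>

static int demo_irq_map(struct irq_domain *d, unsigned int virq,
			irq_hw_number_t hw)
{
	/* Both symbols used here are now available to modules. */
	irq_set_chip_and_handler_name(virq, &dummy_irq_chip,
				      handle_simple_irq, "demo");
	irq_set_chip_data(virq, d->host_data);
	return 0;
}

static const struct irq_domain_ops demo_irq_ops = {
	.map	= demo_irq_map,
	.xlate	= irq_domain_xlate_onecell,
};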
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index b5fcd96c7102..988dc58e8847 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -6,6 +6,7 @@
6 */ 6 */
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/irq.h> 8#include <linux/irq.h>
9#include <linux/export.h>
9 10
10#include "internals.h" 11#include "internals.h"
11 12
@@ -57,3 +58,4 @@ struct irq_chip dummy_irq_chip = {
57 .irq_mask = noop, 58 .irq_mask = noop,
58 .irq_unmask = noop, 59 .irq_unmask = noop,
59}; 60};
61EXPORT_SYMBOL_GPL(dummy_irq_chip);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 49a77727db42..96f3a1d9c379 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
148 * @host_data: Controller private data pointer 148 * @host_data: Controller private data pointer
149 * 149 *
150 * Allocates a legacy irq_domain if irq_base is positive or a linear 150 * Allocates a legacy irq_domain if irq_base is positive or a linear
151 * domain otherwise. 151 * domain otherwise. For the legacy domain, IRQ descriptors will also
152 * be allocated.
152 * 153 *
153 * This is intended to implement the expected behaviour for most 154 * This is intended to implement the expected behaviour for most
154 * interrupt controllers which is that a linear mapping should 155 * interrupt controllers which is that a linear mapping should
@@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
162 const struct irq_domain_ops *ops, 163 const struct irq_domain_ops *ops,
163 void *host_data) 164 void *host_data)
164{ 165{
165 if (first_irq > 0)
166 return irq_domain_add_legacy(of_node, size, first_irq, 0,
166 if (first_irq > 0) {
167 int irq_base;
168
169 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
170 /*
171 * Set the descriptor allocator to search for a
172 * 1-to-1 mapping, such as irq_alloc_desc_at().
173 * Use of_node_to_nid() which is defined to
174 * numa_node_id() on platforms that have no custom
175 * implementation.
176 */
177 irq_base = irq_alloc_descs(first_irq, first_irq, size,
178 of_node_to_nid(of_node));
179 if (irq_base < 0) {
180 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
181 first_irq);
182 irq_base = first_irq;
183 }
184 } else
185 irq_base = first_irq;
186
187 return irq_domain_add_legacy(of_node, size, irq_base, 0,
167 ops, host_data); 188 ops, host_data);
168 else
169 return irq_domain_add_linear(of_node, size, ops, host_data);
189 }
190
191 /* A linear domain is the default */
192 return irq_domain_add_linear(of_node, size, ops, host_data);
170} 193}
171 194
172/** 195/**
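A hedged sketch of a driver probe using irq_domain_add_simple() as documented above; with first_irq > 0 and SPARSE_IRQ, descriptors are now allocated for the legacy range. Names are illustrative:

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int demo_map(struct irq_domain *d, unsigned int virq,
		    irq_hw_number_t hw)
{
	irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
	return 0;
}

static const struct irq_domain_ops demo_ops = {
	.map	= demo_map,
	.xlate	= irq_domain_xlate_onecell,
};

static struct irq_domain *demo_probe_domain(struct device_node *np,
					    int first_irq)
{
	/* first_irq > 0: legacy domain (+ descs); first_irq == 0: linear. */
	return irq_domain_add_simple(np, 32, first_irq, &demo_ops, NULL);
}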
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4c69326aa773..e49a288fa479 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -616,6 +616,22 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
616 return ret; 616 return ret;
617} 617}
618 618
619#ifdef CONFIG_HARDIRQS_SW_RESEND
620int irq_set_parent(int irq, int parent_irq)
621{
622 unsigned long flags;
623 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
624
625 if (!desc)
626 return -EINVAL;
627
628 desc->parent_irq = parent_irq;
629
630 irq_put_desc_unlock(desc, flags);
631 return 0;
632}
633#endif
634
619/* 635/*
620 * Default primary interrupt handler for threaded interrupts. Is 636 * Default primary interrupt handler for threaded interrupts. Is
621 * assigned as primary handler when request_threaded_irq is called 637 * assigned as primary handler when request_threaded_irq is called
@@ -716,6 +732,7 @@ static void
716irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 732irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
717{ 733{
718 cpumask_var_t mask; 734 cpumask_var_t mask;
735 bool valid = true;
719 736
720 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) 737 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
721 return; 738 return;
@@ -730,10 +747,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
730 } 747 }
731 748
732 raw_spin_lock_irq(&desc->lock); 749 raw_spin_lock_irq(&desc->lock);
733 cpumask_copy(mask, desc->irq_data.affinity);
750 /*
751 * This code is triggered unconditionally. Check the affinity
752 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
753 */
754 if (desc->irq_data.affinity)
755 cpumask_copy(mask, desc->irq_data.affinity);
756 else
757 valid = false;
734 raw_spin_unlock_irq(&desc->lock); 758 raw_spin_unlock_irq(&desc->lock);
735 759
736 set_cpus_allowed_ptr(current, mask);
760 if (valid)
761 set_cpus_allowed_ptr(current, mask);
737 free_cpumask_var(mask); 762 free_cpumask_var(mask);
738} 763}
739#else 764#else
@@ -793,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused)
793 action = kthread_data(tsk); 818 action = kthread_data(tsk);
794 819
795 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 820 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
796 tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
821 tsk->comm, tsk->pid, action->irq);
797 822
798 823
799 desc = irq_to_desc(action->irq); 824 desc = irq_to_desc(action->irq);
@@ -833,6 +858,8 @@ static int irq_thread(void *data)
833 init_task_work(&on_exit_work, irq_thread_dtor); 858 init_task_work(&on_exit_work, irq_thread_dtor);
834 task_work_add(current, &on_exit_work, false); 859 task_work_add(current, &on_exit_work, false);
835 860
861 irq_thread_check_affinity(desc, action);
862
836 while (!irq_wait_for_interrupt(action)) { 863 while (!irq_wait_for_interrupt(action)) {
837 irqreturn_t action_ret; 864 irqreturn_t action_ret;
838 865
@@ -936,6 +963,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
936 */ 963 */
937 get_task_struct(t); 964 get_task_struct(t);
938 new->thread = t; 965 new->thread = t;
966 /*
967 * Tell the thread to set its affinity. This is
968 * important for shared interrupt handlers as we do
969 * not invoke setup_affinity() for the secondary
970 * handlers as everything is already set up. Even for
971 * interrupts marked with IRQF_NO_BALANCE this is
972 * correct as we want the thread to move to the cpu(s)
973 * on which the requesting code placed the interrupt.
974 */
975 set_bit(IRQTF_AFFINITY, &new->thread_flags);
939 } 976 }
940 977
941 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { 978 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 6454db7b6a4d..9065107f083e 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -74,6 +74,14 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
74 if (!desc->irq_data.chip->irq_retrigger || 74 if (!desc->irq_data.chip->irq_retrigger ||
75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
76#ifdef CONFIG_HARDIRQS_SW_RESEND 76#ifdef CONFIG_HARDIRQS_SW_RESEND
77 /*
78 * If the interrupt has a parent irq and runs
79 * in the thread context of the parent irq,
80 * retrigger the parent.
81 */
82 if (desc->parent_irq &&
83 irq_settings_is_nested_thread(desc))
84 irq = desc->parent_irq;
77 /* Set it pending and activate the softirq: */ 85 /* Set it pending and activate the softirq: */
78 set_bit(irq, irqs_resend); 86 set_bit(irq, irqs_resend);
79 tasklet_schedule(&resend_tasklet); 87 tasklet_schedule(&resend_tasklet);
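How a driver would wire up the parent_irq consulted above, sketched for a hypothetical MFD child interrupt that runs nested in its parent's thread (demo_* name is illustrative):

#include <linux/interrupt.h>
#include <linux/irq.h>

static void demo_init_child_irq(unsigned int child_irq,
				unsigned int parent_irq)
{
	irq_set_chip_and_handler(child_irq, &dummy_irq_chip,
				 handle_simple_irq);
	irq_set_nested_thread(child_irq, true);
	/* New helper: lets check_irq_resend() retrigger the parent instead. */
	irq_set_parent(child_irq, parent_irq);
	irq_set_noprobe(child_irq);
}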
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 43049192b5ec..60f48fa0fd0d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key,
118 key->timeout = rl; 118 key->timeout = rl;
119 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); 119 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
120} 120}
121EXPORT_SYMBOL_GPL(jump_label_rate_limit);
121 122
122static int addr_conflict(struct jump_entry *entry, void *start, void *end) 123static int addr_conflict(struct jump_entry *entry, void *start, void *end)
123{ 124{
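A sketch of the kind of modular user the export above targets (KVM-style deferred static keys); demo_key and the HZ/10 timeout are illustrative assumptions:

#include <linux/jiffies.h>
#include <linux/jump_label.h>

static struct static_key_deferred demo_key;

static int __init demo_init(void)
{
	/* Exported above: batch unpatching to at most once per 100ms. */
	jump_label_rate_limit(&demo_key, HZ / 10);
	return 0;
}

static void demo_feature_on(void)
{
	static_key_slow_inc(&demo_key.key);
}

static void demo_feature_off(void)
{
	/* Deferred decrement; pairs with the rate limit set in demo_init(). */
	static_key_slow_dec_deferred(&demo_key);
}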
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 30b7b225306c..e30ac0fe61c3 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -4,6 +4,7 @@
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/random.h> 5#include <linux/random.h>
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/ptrace.h>
7#include <linux/init.h> 8#include <linux/init.h>
8#include <linux/errno.h> 9#include <linux/errno.h>
9#include <linux/cache.h> 10#include <linux/cache.h>
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 0668d58d6413..5e4bd7864c5d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,6 @@
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/elf.h> 22#include <linux/elf.h>
23#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <generated/utsrelease.h>
25#include <linux/utsname.h> 24#include <linux/utsname.h>
26#include <linux/numa.h> 25#include <linux/numa.h>
27#include <linux/suspend.h> 26#include <linux/suspend.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6f99aead66c6..0023a87e8de6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,7 @@
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h> 39#include <linux/rwsem.h>
40#include <linux/ptrace.h>
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41 42
42#include <trace/events/module.h> 43#include <trace/events/module.h>
@@ -218,14 +219,16 @@ static int ____call_usermodehelper(void *data)
218 219
219 commit_creds(new); 220 commit_creds(new);
220 221
221 retval = kernel_execve(sub_info->path,
222 (const char *const *)sub_info->argv,
223 (const char *const *)sub_info->envp);
222 retval = do_execve(sub_info->path,
223 (const char __user *const __user *)sub_info->argv,
224 (const char __user *const __user *)sub_info->envp);
225 if (!retval)
226 return 0;
224 227
225 /* Exec failed? */ 228 /* Exec failed? */
226fail: 229fail:
227 sub_info->retval = retval; 230 sub_info->retval = retval;
228 return 0;
231 do_exit(0);
229} 232}
230 233
231static int call_helper(void *data) 234static int call_helper(void *data)
@@ -292,7 +295,7 @@ static int wait_for_helper(void *data)
292 } 295 }
293 296
294 umh_complete(sub_info); 297 umh_complete(sub_info);
295 return 0;
298 do_exit(0);
296} 299}
297 300
298/* This is run by khelper thread */ 301/* This is run by khelper thread */
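A sketch of the call_usermodehelper() entry point whose worker threads are changed above; the helper path and arguments are made up for illustration:

#include <linux/kmod.h>

static int demo_run_helper(void)
{
	char *argv[] = { "/sbin/demo-helper", "--oneshot", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	/*
	 * ____call_usermodehelper() now execs via do_execve() and, on
	 * failure, leaves through do_exit() instead of returning from the
	 * kernel thread.
	 */
	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
}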
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c62b8546cc90..098f396aa409 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -561,9 +561,9 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
561{ 561{
562 LIST_HEAD(free_list); 562 LIST_HEAD(free_list);
563 563
564 mutex_lock(&kprobe_mutex);
564 /* Lock modules while optimizing kprobes */ 565 /* Lock modules while optimizing kprobes */
565 mutex_lock(&module_mutex); 566 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567 567
568 /* 568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) 569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
@@ -586,8 +586,8 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
586 /* Step 4: Free cleaned kprobes after quiesence period */ 586 /* Step 4: Free cleaned kprobes after quiesence period */
587 do_free_cleaned_kprobes(&free_list); 587 do_free_cleaned_kprobes(&free_list);
588 588
589 mutex_unlock(&kprobe_mutex);
590 mutex_unlock(&module_mutex); 589 mutex_unlock(&module_mutex);
590 mutex_unlock(&kprobe_mutex);
591 591
592 /* Step 5: Kick optimizer again if needed */ 592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) 593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
@@ -759,20 +759,32 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
759 struct kprobe *ap; 759 struct kprobe *ap;
760 struct optimized_kprobe *op; 760 struct optimized_kprobe *op;
761 761
762 /* Impossible to optimize ftrace-based kprobe */
763 if (kprobe_ftrace(p))
764 return;
765
766 /* For preparing optimization, jump_label_text_reserved() is called */
767 jump_label_lock();
768 mutex_lock(&text_mutex);
769
762 ap = alloc_aggr_kprobe(p); 770 ap = alloc_aggr_kprobe(p);
763 if (!ap) 771 if (!ap)
764 return;
772 goto out;
765 773
766 op = container_of(ap, struct optimized_kprobe, kp); 774 op = container_of(ap, struct optimized_kprobe, kp);
767 if (!arch_prepared_optinsn(&op->optinsn)) { 775 if (!arch_prepared_optinsn(&op->optinsn)) {
768 /* If failed to setup optimizing, fallback to kprobe */ 776 /* If failed to setup optimizing, fallback to kprobe */
769 arch_remove_optimized_kprobe(op); 777 arch_remove_optimized_kprobe(op);
770 kfree(op); 778 kfree(op);
771 return;
779 goto out;
772 } 780 }
773 781
774 init_aggr_kprobe(ap, p); 782 init_aggr_kprobe(ap, p);
775 optimize_kprobe(ap);
783 optimize_kprobe(ap); /* This just kicks optimizer thread */
784
785out:
786 mutex_unlock(&text_mutex);
787 jump_label_unlock();
776} 788}
777 789
778#ifdef CONFIG_SYSCTL 790#ifdef CONFIG_SYSCTL
@@ -907,9 +919,64 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
907} 919}
908#endif /* CONFIG_OPTPROBES */ 920#endif /* CONFIG_OPTPROBES */
909 921
922#ifdef KPROBES_CAN_USE_FTRACE
923static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
924 .func = kprobe_ftrace_handler,
925 .flags = FTRACE_OPS_FL_SAVE_REGS,
926};
927static int kprobe_ftrace_enabled;
928
929/* Must ensure p->addr is really on ftrace */
930static int __kprobes prepare_kprobe(struct kprobe *p)
931{
932 if (!kprobe_ftrace(p))
933 return arch_prepare_kprobe(p);
934
935 return arch_prepare_kprobe_ftrace(p);
936}
937
938/* Caller must lock kprobe_mutex */
939static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
940{
941 int ret;
942
943 ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
944 (unsigned long)p->addr, 0, 0);
945 WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
946 kprobe_ftrace_enabled++;
947 if (kprobe_ftrace_enabled == 1) {
948 ret = register_ftrace_function(&kprobe_ftrace_ops);
949 WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
950 }
951}
952
953/* Caller must lock kprobe_mutex */
954static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
955{
956 int ret;
957
958 kprobe_ftrace_enabled--;
959 if (kprobe_ftrace_enabled == 0) {
960 ret = unregister_ftrace_function(&kprobe_ftrace_ops);
961 WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
962 }
963 ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
964 (unsigned long)p->addr, 1, 0);
965 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
966}
967#else /* !KPROBES_CAN_USE_FTRACE */
968#define prepare_kprobe(p) arch_prepare_kprobe(p)
969#define arm_kprobe_ftrace(p) do {} while (0)
970#define disarm_kprobe_ftrace(p) do {} while (0)
971#endif
972
910/* Arm a kprobe with text_mutex */ 973/* Arm a kprobe with text_mutex */
911static void __kprobes arm_kprobe(struct kprobe *kp) 974static void __kprobes arm_kprobe(struct kprobe *kp)
912{ 975{
976 if (unlikely(kprobe_ftrace(kp))) {
977 arm_kprobe_ftrace(kp);
978 return;
979 }
913 /* 980 /*
914 * Here, since __arm_kprobe() doesn't use stop_machine(), 981 * Here, since __arm_kprobe() doesn't use stop_machine(),
915 * this doesn't cause deadlock on text_mutex. So, we don't 982 * this doesn't cause deadlock on text_mutex. So, we don't
@@ -921,11 +988,15 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
921} 988}
922 989
923/* Disarm a kprobe with text_mutex */ 990/* Disarm a kprobe with text_mutex */
924static void __kprobes disarm_kprobe(struct kprobe *kp)
991static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
925{ 992{
993 if (unlikely(kprobe_ftrace(kp))) {
994 disarm_kprobe_ftrace(kp);
995 return;
996 }
926 /* Ditto */ 997 /* Ditto */
927 mutex_lock(&text_mutex); 998 mutex_lock(&text_mutex);
928 __disarm_kprobe(kp, true);
999 __disarm_kprobe(kp, reopt);
929 mutex_unlock(&text_mutex); 1000 mutex_unlock(&text_mutex);
930} 1001}
931 1002
@@ -1144,12 +1215,6 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
1144 if (p->post_handler && !ap->post_handler) 1215 if (p->post_handler && !ap->post_handler)
1145 ap->post_handler = aggr_post_handler; 1216 ap->post_handler = aggr_post_handler;
1146 1217
1147 if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
1148 ap->flags &= ~KPROBE_FLAG_DISABLED;
1149 if (!kprobes_all_disarmed)
1150 /* Arm the breakpoint again. */
1151 __arm_kprobe(ap);
1152 }
1153 return 0; 1218 return 0;
1154} 1219}
1155 1220
@@ -1189,11 +1254,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
1189 int ret = 0; 1254 int ret = 0;
1190 struct kprobe *ap = orig_p; 1255 struct kprobe *ap = orig_p;
1191 1256
1257 /* For preparing optimization, jump_label_text_reserved() is called */
1258 jump_label_lock();
1259 /*
1260 * Get online CPUs to avoid text_mutex deadlock.with stop machine,
1261 * which is invoked by unoptimize_kprobe() in add_new_kprobe()
1262 */
1263 get_online_cpus();
1264 mutex_lock(&text_mutex);
1265
1192 if (!kprobe_aggrprobe(orig_p)) { 1266 if (!kprobe_aggrprobe(orig_p)) {
1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ 1267 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
1194 ap = alloc_aggr_kprobe(orig_p); 1268 ap = alloc_aggr_kprobe(orig_p);
1195 if (!ap)
1196 return -ENOMEM;
1269 if (!ap) {
1270 ret = -ENOMEM;
1271 goto out;
1272 }
1197 init_aggr_kprobe(ap, orig_p); 1273 init_aggr_kprobe(ap, orig_p);
1198 } else if (kprobe_unused(ap)) 1274 } else if (kprobe_unused(ap))
1199 /* This probe is going to die. Rescue it */ 1275 /* This probe is going to die. Rescue it */
@@ -1213,7 +1289,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
1213 * free aggr_probe. It will be used next time, or 1289 * free aggr_probe. It will be used next time, or
1214 * freed by unregister_kprobe. 1290 * freed by unregister_kprobe.
1215 */ 1291 */
1216 return ret;
1292 goto out;
1217 1293
1218 /* Prepare optimized instructions if possible. */ 1294 /* Prepare optimized instructions if possible. */
1219 prepare_optimized_kprobe(ap); 1295 prepare_optimized_kprobe(ap);
@@ -1228,7 +1304,20 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
1228 1304
1229 /* Copy ap's insn slot to p */ 1305 /* Copy ap's insn slot to p */
1230 copy_kprobe(ap, p); 1306 copy_kprobe(ap, p);
1231 return add_new_kprobe(ap, p);
1307 ret = add_new_kprobe(ap, p);
1308
1309out:
1310 mutex_unlock(&text_mutex);
1311 put_online_cpus();
1312 jump_label_unlock();
1313
1314 if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
1315 ap->flags &= ~KPROBE_FLAG_DISABLED;
1316 if (!kprobes_all_disarmed)
1317 /* Arm the breakpoint again. */
1318 arm_kprobe(ap);
1319 }
1320 return ret;
1232} 1321}
1233 1322
1234static int __kprobes in_kprobes_functions(unsigned long addr) 1323static int __kprobes in_kprobes_functions(unsigned long addr)
@@ -1313,71 +1402,96 @@ static inline int check_kprobe_rereg(struct kprobe *p)
1313 return ret; 1402 return ret;
1314} 1403}
1315 1404
1316int __kprobes register_kprobe(struct kprobe *p)
1405static __kprobes int check_kprobe_address_safe(struct kprobe *p,
1406 struct module **probed_mod)
1317{ 1407{
1318 int ret = 0; 1408 int ret = 0;
1319 struct kprobe *old_p;
1409 unsigned long ftrace_addr;
1320 struct module *probed_mod;
1321 kprobe_opcode_t *addr;
1322
1323 addr = kprobe_addr(p);
1324 if (IS_ERR(addr))
1325 return PTR_ERR(addr);
1326 p->addr = addr;
1327 1410
1328 ret = check_kprobe_rereg(p);
1329 if (ret)
1330 return ret;
1411 /*
1412 * If the address is located on a ftrace nop, set the
1413 * breakpoint to the following instruction.
1414 */
1415 ftrace_addr = ftrace_location((unsigned long)p->addr);
1416 if (ftrace_addr) {
1417#ifdef KPROBES_CAN_USE_FTRACE
1418 /* Given address is not on the instruction boundary */
1419 if ((unsigned long)p->addr != ftrace_addr)
1420 return -EILSEQ;
1421 p->flags |= KPROBE_FLAG_FTRACE;
1422#else /* !KPROBES_CAN_USE_FTRACE */
1423 return -EINVAL;
1424#endif
1425 }
1331 1426
1332 jump_label_lock(); 1427 jump_label_lock();
1333 preempt_disable(); 1428 preempt_disable();
1429
1430 /* Ensure it is not in reserved area nor out of text */
1334 if (!kernel_text_address((unsigned long) p->addr) || 1431 if (!kernel_text_address((unsigned long) p->addr) ||
1335 in_kprobes_functions((unsigned long) p->addr) || 1432 in_kprobes_functions((unsigned long) p->addr) ||
1336 ftrace_text_reserved(p->addr, p->addr) ||
1337 jump_label_text_reserved(p->addr, p->addr)) { 1433 jump_label_text_reserved(p->addr, p->addr)) {
1338 ret = -EINVAL; 1434 ret = -EINVAL;
1339 goto cannot_probe;
1435 goto out;
1340 } 1436 }
1341 1437
1342 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1343 p->flags &= KPROBE_FLAG_DISABLED;
1344
1438 /* Check if are we probing a module */
1439 *probed_mod = __module_text_address((unsigned long) p->addr);
1440 if (*probed_mod) {
1345 /*
1346 * Check if are we probing a module.
1347 */
1348 probed_mod = __module_text_address((unsigned long) p->addr);
1349 if (probed_mod) {
1350 /* Return -ENOENT if fail. */
1351 ret = -ENOENT;
1352 /* 1441 /*
1353 * We must hold a refcount of the probed module while updating 1442 * We must hold a refcount of the probed module while updating
1354 * its code to prohibit unexpected unloading. 1443 * its code to prohibit unexpected unloading.
1355 */ 1444 */
1356 if (unlikely(!try_module_get(probed_mod)))
1357 goto cannot_probe;
1445 if (unlikely(!try_module_get(*probed_mod))) {
1446 ret = -ENOENT;
1447 goto out;
1448 }
1358 1449
1359 /* 1450 /*
1360 * If the module freed .init.text, we couldn't insert 1451 * If the module freed .init.text, we couldn't insert
1361 * kprobes in there. 1452 * kprobes in there.
1362 */ 1453 */
1363 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1364 probed_mod->state != MODULE_STATE_COMING) {
1365 module_put(probed_mod);
1366 goto cannot_probe;
1454 if (within_module_init((unsigned long)p->addr, *probed_mod) &&
1455 (*probed_mod)->state != MODULE_STATE_COMING) {
1456 module_put(*probed_mod);
1457 *probed_mod = NULL;
1458 ret = -ENOENT;
1367 } 1459 }
1368 /* ret will be updated by following code */
1369 } 1460 }
1461out:
1370 preempt_enable(); 1462 preempt_enable();
1371 jump_label_unlock(); 1463 jump_label_unlock();
1372 1464
1465 return ret;
1466}
1467
1468int __kprobes register_kprobe(struct kprobe *p)
1469{
1470 int ret;
1471 struct kprobe *old_p;
1472 struct module *probed_mod;
1473 kprobe_opcode_t *addr;
1474
1475 /* Adjust probe address from symbol */
1476 addr = kprobe_addr(p);
1477 if (IS_ERR(addr))
1478 return PTR_ERR(addr);
1479 p->addr = addr;
1480
1481 ret = check_kprobe_rereg(p);
1482 if (ret)
1483 return ret;
1484
1485 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1486 p->flags &= KPROBE_FLAG_DISABLED;
1373 p->nmissed = 0; 1487 p->nmissed = 0;
1374 INIT_LIST_HEAD(&p->list); 1488 INIT_LIST_HEAD(&p->list);
1375 mutex_lock(&kprobe_mutex);
1376 1489
1377 jump_label_lock(); /* needed to call jump_label_text_reserved() */
1490 ret = check_kprobe_address_safe(p, &probed_mod);
1491 if (ret)
1492 return ret;
1378 1493
1379 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1494 mutex_lock(&kprobe_mutex);
1380 mutex_lock(&text_mutex);
1381 1495
1382 old_p = get_kprobe(p->addr); 1496 old_p = get_kprobe(p->addr);
1383 if (old_p) { 1497 if (old_p) {
@@ -1386,7 +1500,9 @@ int __kprobes register_kprobe(struct kprobe *p)
1386 goto out; 1500 goto out;
1387 } 1501 }
1388 1502
1389 ret = arch_prepare_kprobe(p);
1503 mutex_lock(&text_mutex); /* Avoiding text modification */
1504 ret = prepare_kprobe(p);
1505 mutex_unlock(&text_mutex);
1390 if (ret) 1506 if (ret)
1391 goto out; 1507 goto out;
1392 1508
@@ -1395,26 +1511,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1395 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1511 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
1396 1512
1397 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1513 if (!kprobes_all_disarmed && !kprobe_disabled(p))
1398 __arm_kprobe(p);
1514 arm_kprobe(p);
1399 1515
1400 /* Try to optimize kprobe */ 1516 /* Try to optimize kprobe */
1401 try_to_optimize_kprobe(p); 1517 try_to_optimize_kprobe(p);
1402 1518
1403out: 1519out:
1404 mutex_unlock(&text_mutex);
1405 put_online_cpus();
1406 jump_label_unlock();
1407 mutex_unlock(&kprobe_mutex); 1520 mutex_unlock(&kprobe_mutex);
1408 1521
1409 if (probed_mod) 1522 if (probed_mod)
1410 module_put(probed_mod); 1523 module_put(probed_mod);
1411 1524
1412 return ret; 1525 return ret;
1413
1414cannot_probe:
1415 preempt_enable();
1416 jump_label_unlock();
1417 return ret;
1418} 1526}
1419EXPORT_SYMBOL_GPL(register_kprobe); 1527EXPORT_SYMBOL_GPL(register_kprobe);
1420 1528
@@ -1451,7 +1559,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1451 1559
1452 /* Try to disarm and disable this/parent probe */ 1560 /* Try to disarm and disable this/parent probe */
1453 if (p == orig_p || aggr_kprobe_disabled(orig_p)) { 1561 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1454 disarm_kprobe(orig_p);
1562 disarm_kprobe(orig_p, true);
1455 orig_p->flags |= KPROBE_FLAG_DISABLED; 1563 orig_p->flags |= KPROBE_FLAG_DISABLED;
1456 } 1564 }
1457 } 1565 }
@@ -2049,10 +2157,11 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
2049 2157
2050 if (!pp) 2158 if (!pp)
2051 pp = p; 2159 pp = p;
2052 seq_printf(pi, "%s%s%s\n",
2160 seq_printf(pi, "%s%s%s%s\n",
2053 (kprobe_gone(p) ? "[GONE]" : ""), 2161 (kprobe_gone(p) ? "[GONE]" : ""),
2054 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), 2162 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
2055 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
2163 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""),
2164 (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
2056} 2165}
2057 2166
2058static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 2167static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -2131,14 +2240,12 @@ static void __kprobes arm_all_kprobes(void)
2131 goto already_enabled; 2240 goto already_enabled;
2132 2241
2133 /* Arming kprobes doesn't optimize kprobe itself */ 2242 /* Arming kprobes doesn't optimize kprobe itself */
2134 mutex_lock(&text_mutex);
2135 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2243 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2136 head = &kprobe_table[i]; 2244 head = &kprobe_table[i];
2137 hlist_for_each_entry_rcu(p, node, head, hlist) 2245 hlist_for_each_entry_rcu(p, node, head, hlist)
2138 if (!kprobe_disabled(p)) 2246 if (!kprobe_disabled(p))
2139 __arm_kprobe(p);
2247 arm_kprobe(p);
2140 } 2248 }
2141 mutex_unlock(&text_mutex);
2142 2249
2143 kprobes_all_disarmed = false; 2250 kprobes_all_disarmed = false;
2144 printk(KERN_INFO "Kprobes globally enabled\n"); 2251 printk(KERN_INFO "Kprobes globally enabled\n");
@@ -2166,15 +2273,13 @@ static void __kprobes disarm_all_kprobes(void)
2166 kprobes_all_disarmed = true; 2273 kprobes_all_disarmed = true;
2167 printk(KERN_INFO "Kprobes globally disabled\n"); 2274 printk(KERN_INFO "Kprobes globally disabled\n");
2168 2275
2169 mutex_lock(&text_mutex);
2170 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2171 head = &kprobe_table[i]; 2277 head = &kprobe_table[i];
2172 hlist_for_each_entry_rcu(p, node, head, hlist) { 2278 hlist_for_each_entry_rcu(p, node, head, hlist) {
2173 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
2174 __disarm_kprobe(p, false);
2280 disarm_kprobe(p, false);
2175 } 2281 }
2176 } 2282 }
2177 mutex_unlock(&text_mutex);
2178 mutex_unlock(&kprobe_mutex); 2283 mutex_unlock(&kprobe_mutex);
2179 2284
2180 /* Wait for disarming all kprobes by optimizer */ 2285 /* Wait for disarming all kprobes by optimizer */
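A minimal sketch of a probe that exercises the new ftrace-based path: with KPROBES_CAN_USE_FTRACE the probe is armed through ftrace_set_filter_ip() and shows up as [FTRACE] in the kprobes list. The demo_* names are illustrative:

#include <linux/kprobes.h>
#include <linux/module.h>

static int demo_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("hit %s\n", p->symbol_name);
	return 0;
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_fork",
	.pre_handler	= demo_pre_handler,
};

static int __init demo_init(void)
{
	/* If do_fork()'s entry is an ftrace site, KPROBE_FLAG_FTRACE is set. */
	return register_kprobe(&demo_kp);
}

static void __exit demo_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");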
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 4e316e1acf58..6ada93c23a9a 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -26,7 +26,6 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
26static struct kobj_attribute _name##_attr = \ 26static struct kobj_attribute _name##_attr = \
27 __ATTR(_name, 0644, _name##_show, _name##_store) 27 __ATTR(_name, 0644, _name##_show, _name##_store)
28 28
29#if defined(CONFIG_HOTPLUG)
30/* current uevent sequence number */ 29/* current uevent sequence number */
31static ssize_t uevent_seqnum_show(struct kobject *kobj, 30static ssize_t uevent_seqnum_show(struct kobject *kobj,
32 struct kobj_attribute *attr, char *buf) 31 struct kobj_attribute *attr, char *buf)
@@ -54,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
54 return count; 53 return count;
55} 54}
56KERNEL_ATTR_RW(uevent_helper); 55KERNEL_ATTR_RW(uevent_helper);
57#endif 56
58 57
59#ifdef CONFIG_PROFILING 58#ifdef CONFIG_PROFILING
60static ssize_t profiling_show(struct kobject *kobj, 59static ssize_t profiling_show(struct kobject *kobj,
@@ -141,6 +140,23 @@ static ssize_t fscaps_show(struct kobject *kobj,
141} 140}
142KERNEL_ATTR_RO(fscaps); 141KERNEL_ATTR_RO(fscaps);
143 142
143int rcu_expedited;
144static ssize_t rcu_expedited_show(struct kobject *kobj,
145 struct kobj_attribute *attr, char *buf)
146{
147 return sprintf(buf, "%d\n", rcu_expedited);
148}
149static ssize_t rcu_expedited_store(struct kobject *kobj,
150 struct kobj_attribute *attr,
151 const char *buf, size_t count)
152{
153 if (kstrtoint(buf, 0, &rcu_expedited))
154 return -EINVAL;
155
156 return count;
157}
158KERNEL_ATTR_RW(rcu_expedited);
159
144/* 160/*
145 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 161 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
146 */ 162 */
@@ -169,10 +185,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
169 185
170static struct attribute * kernel_attrs[] = { 186static struct attribute * kernel_attrs[] = {
171 &fscaps_attr.attr, 187 &fscaps_attr.attr,
172#if defined(CONFIG_HOTPLUG)
173 &uevent_seqnum_attr.attr, 188 &uevent_seqnum_attr.attr,
174 &uevent_helper_attr.attr, 189 &uevent_helper_attr.attr,
175#endif
176#ifdef CONFIG_PROFILING 190#ifdef CONFIG_PROFILING
177 &profiling_attr.attr, 191 &profiling_attr.attr,
178#endif 192#endif
@@ -182,6 +196,7 @@ static struct attribute * kernel_attrs[] = {
182 &kexec_crash_size_attr.attr, 196 &kexec_crash_size_attr.attr,
183 &vmcoreinfo_attr.attr, 197 &vmcoreinfo_attr.attr,
184#endif 198#endif
199 &rcu_expedited_attr.attr,
185 NULL 200 NULL
186}; 201};
187 202
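The new knob is a plain sysfs attribute backed by the rcu_expedited integer added above; a userspace sketch that flips it:

#include <stdio.h>

static int set_rcu_expedited(int val)
{
	FILE *f = fopen("/sys/kernel/rcu_expedited", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}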
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b579af57ea10..691dc2ef9baf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,6 +16,7 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
19#include <linux/ptrace.h>
19#include <trace/events/sched.h> 20#include <trace/events/sched.h>
20 21
21static DEFINE_SPINLOCK(kthread_create_lock); 22static DEFINE_SPINLOCK(kthread_create_lock);
@@ -37,11 +38,20 @@ struct kthread_create_info
37}; 38};
38 39
39struct kthread { 40struct kthread {
40 int should_stop;
41 unsigned long flags;
42 unsigned int cpu;
41 void *data; 43 void *data;
44 struct completion parked;
42 struct completion exited; 45 struct completion exited;
43}; 46};
44 47
48enum KTHREAD_BITS {
49 KTHREAD_IS_PER_CPU = 0,
50 KTHREAD_SHOULD_STOP,
51 KTHREAD_SHOULD_PARK,
52 KTHREAD_IS_PARKED,
53};
54
45#define to_kthread(tsk) \ 55#define to_kthread(tsk) \
46 container_of((tsk)->vfork_done, struct kthread, exited) 56 container_of((tsk)->vfork_done, struct kthread, exited)
47 57
@@ -52,13 +62,29 @@ struct kthread {
52 * and this will return true. You should then return, and your return 62 * and this will return true. You should then return, and your return
53 * value will be passed through to kthread_stop(). 63 * value will be passed through to kthread_stop().
54 */ 64 */
55int kthread_should_stop(void)
65bool kthread_should_stop(void)
56{ 66{
57 return to_kthread(current)->should_stop;
67 return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
58} 68}
59EXPORT_SYMBOL(kthread_should_stop); 69EXPORT_SYMBOL(kthread_should_stop);
60 70
61/** 71/**
72 * kthread_should_park - should this kthread park now?
73 *
74 * When someone calls kthread_park() on your kthread, it will be woken
75 * and this will return true. You should then do the necessary
76 * cleanup and call kthread_parkme()
77 *
78 * Similar to kthread_should_stop(), but this keeps the thread alive
79 * and in a park position. kthread_unpark() "restarts" the thread and
80 * calls the thread function again.
81 */
82bool kthread_should_park(void)
83{
84 return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
85}
86
87/**
62 * kthread_freezable_should_stop - should this freezable kthread return now? 88 * kthread_freezable_should_stop - should this freezable kthread return now?
63 * @was_frozen: optional out parameter, indicates whether %current was frozen 89 * @was_frozen: optional out parameter, indicates whether %current was frozen
64 * 90 *
@@ -96,6 +122,24 @@ void *kthread_data(struct task_struct *task)
96 return to_kthread(task)->data; 122 return to_kthread(task)->data;
97} 123}
98 124
125static void __kthread_parkme(struct kthread *self)
126{
127 __set_current_state(TASK_INTERRUPTIBLE);
128 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
129 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
130 complete(&self->parked);
131 schedule();
132 __set_current_state(TASK_INTERRUPTIBLE);
133 }
134 clear_bit(KTHREAD_IS_PARKED, &self->flags);
135 __set_current_state(TASK_RUNNING);
136}
137
138void kthread_parkme(void)
139{
140 __kthread_parkme(to_kthread(current));
141}
142
99static int kthread(void *_create) 143static int kthread(void *_create)
100{ 144{
101 /* Copy data: it's on kthread's stack */ 145 /* Copy data: it's on kthread's stack */
@@ -105,9 +149,10 @@ static int kthread(void *_create)
105 struct kthread self; 149 struct kthread self;
106 int ret; 150 int ret;
107 151
108 self.should_stop = 0;
152 self.flags = 0;
109 self.data = data; 153 self.data = data;
110 init_completion(&self.exited); 154 init_completion(&self.exited);
155 init_completion(&self.parked);
111 current->vfork_done = &self.exited; 156 current->vfork_done = &self.exited;
112 157
113 /* OK, tell user we're spawned, wait for stop or wakeup */ 158 /* OK, tell user we're spawned, wait for stop or wakeup */
@@ -117,9 +162,11 @@ static int kthread(void *_create)
117 schedule(); 162 schedule();
118 163
119 ret = -EINTR; 164 ret = -EINTR;
120 if (!self.should_stop)
121 ret = threadfn(data);
122 165
166 if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
167 __kthread_parkme(&self);
168 ret = threadfn(data);
169 }
123 /* we can't just return, we must preserve "self" on stack */ 170 /* we can't just return, we must preserve "self" on stack */
124 do_exit(ret); 171 do_exit(ret);
125} 172}
@@ -172,8 +219,7 @@ static void create_kthread(struct kthread_create_info *create)
172 * Returns a task_struct or ERR_PTR(-ENOMEM). 219 * Returns a task_struct or ERR_PTR(-ENOMEM).
173 */ 220 */
174struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), 221struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
175 void *data,
176 int node,
222 void *data, int node,
176 int node,
177 const char namefmt[], 223 const char namefmt[],
178 ...) 224 ...)
179{ 225{
@@ -210,6 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
210} 256}
211EXPORT_SYMBOL(kthread_create_on_node); 257EXPORT_SYMBOL(kthread_create_on_node);
212 258
259static void __kthread_bind(struct task_struct *p, unsigned int cpu)
260{
261 /* It's safe because the task is inactive. */
262 do_set_cpus_allowed(p, cpumask_of(cpu));
263 p->flags |= PF_THREAD_BOUND;
264}
265
213/** 266/**
214 * kthread_bind - bind a just-created kthread to a cpu. 267 * kthread_bind - bind a just-created kthread to a cpu.
215 * @p: thread created by kthread_create(). 268 * @p: thread created by kthread_create().
@@ -226,14 +279,112 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
226 WARN_ON(1); 279 WARN_ON(1);
227 return; 280 return;
228 } 281 }
229 282 __kthread_bind(p, cpu);
230 /* It's safe because the task is inactive. */
231 do_set_cpus_allowed(p, cpumask_of(cpu));
232 p->flags |= PF_THREAD_BOUND;
233} 283}
234EXPORT_SYMBOL(kthread_bind); 284EXPORT_SYMBOL(kthread_bind);
235 285
236/** 286/**
287 * kthread_create_on_cpu - Create a cpu bound kthread
288 * @threadfn: the function to run until signal_pending(current).
289 * @data: data ptr for @threadfn.
290 * @cpu: The cpu on which the thread should be bound,
291 * @namefmt: printf-style name for the thread. Format is restricted
292 * to "name.*%u". Code fills in cpu number.
293 *
294 * Description: This helper function creates and names a kernel thread
295 * The thread will be woken and put into park mode.
296 */
297struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
298 void *data, unsigned int cpu,
299 const char *namefmt)
300{
301 struct task_struct *p;
302
303 p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
304 cpu);
305 if (IS_ERR(p))
306 return p;
307 set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
308 to_kthread(p)->cpu = cpu;
309 /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
310 kthread_park(p);
311 return p;
312}
313
314static struct kthread *task_get_live_kthread(struct task_struct *k)
315{
316 struct kthread *kthread;
317
318 get_task_struct(k);
319 kthread = to_kthread(k);
320 /* It might have exited */
321 barrier();
322 if (k->vfork_done != NULL)
323 return kthread;
324 return NULL;
325}
326
327/**
328 * kthread_unpark - unpark a thread created by kthread_create().
329 * @k: thread created by kthread_create().
330 *
331 * Sets kthread_should_park() for @k to return false, wakes it, and
332 * waits for it to return. If the thread is marked percpu then its
333 * bound to the cpu again.
334 */
335void kthread_unpark(struct task_struct *k)
336{
337 struct kthread *kthread = task_get_live_kthread(k);
338
339 if (kthread) {
340 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
341 /*
342 * We clear the IS_PARKED bit here as we don't wait
343 * until the task has left the park code. So if we'd
344 * park before that happens we'd see the IS_PARKED bit
345 * which might be about to be cleared.
346 */
347 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
348 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
349 __kthread_bind(k, kthread->cpu);
350 wake_up_process(k);
351 }
352 }
353 put_task_struct(k);
354}
355
356/**
357 * kthread_park - park a thread created by kthread_create().
358 * @k: thread created by kthread_create().
359 *
360 * Sets kthread_should_park() for @k to return true, wakes it, and
361 * waits for it to return. This can also be called after kthread_create()
362 * instead of calling wake_up_process(): the thread will park without
363 * calling threadfn().
364 *
365 * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
366 * If called by the kthread itself just the park bit is set.
367 */
368int kthread_park(struct task_struct *k)
369{
370 struct kthread *kthread = task_get_live_kthread(k);
371 int ret = -ENOSYS;
372
373 if (kthread) {
374 if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
375 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
376 if (k != current) {
377 wake_up_process(k);
378 wait_for_completion(&kthread->parked);
379 }
380 }
381 ret = 0;
382 }
383 put_task_struct(k);
384 return ret;
385}
386
387/**
237 * kthread_stop - stop a thread created by kthread_create(). 388 * kthread_stop - stop a thread created by kthread_create().
238 * @k: thread created by kthread_create(). 389 * @k: thread created by kthread_create().
239 * 390 *
@@ -250,16 +401,13 @@ EXPORT_SYMBOL(kthread_bind);
250 */ 401 */
251int kthread_stop(struct task_struct *k) 402int kthread_stop(struct task_struct *k)
252{ 403{
253 struct kthread *kthread; 404 struct kthread *kthread = task_get_live_kthread(k);
254 int ret; 405 int ret;
255 406
256 trace_sched_kthread_stop(k); 407 trace_sched_kthread_stop(k);
257 get_task_struct(k); 408 if (kthread) {
258 409 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
259 kthread = to_kthread(k); 410 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
260 barrier(); /* it might have exited */
261 if (k->vfork_done != NULL) {
262 kthread->should_stop = 1;
263 wake_up_process(k); 411 wake_up_process(k);
264 wait_for_completion(&kthread->exited); 412 wait_for_completion(&kthread->exited);
265 } 413 }
@@ -280,7 +428,7 @@ int kthreadd(void *unused)
280 set_task_comm(tsk, "kthreadd"); 428 set_task_comm(tsk, "kthreadd");
281 ignore_signals(tsk); 429 ignore_signals(tsk);
282 set_cpus_allowed_ptr(tsk, cpu_all_mask); 430 set_cpus_allowed_ptr(tsk, cpu_all_mask);
283 set_mems_allowed(node_states[N_HIGH_MEMORY]); 431 set_mems_allowed(node_states[N_MEMORY]);
284 432
285 current->flags |= PF_NOFREEZE; 433 current->flags |= PF_NOFREEZE;
286 434
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index ea9ee4518c35..7981e5b2350d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2998,6 +2998,42 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2998 2998
2999struct lock_class_key __lockdep_no_validate__; 2999struct lock_class_key __lockdep_no_validate__;
3000 3000
3001static int
3002print_lock_nested_lock_not_held(struct task_struct *curr,
3003 struct held_lock *hlock,
3004 unsigned long ip)
3005{
3006 if (!debug_locks_off())
3007 return 0;
3008 if (debug_locks_silent)
3009 return 0;
3010
3011 printk("\n");
3012 printk("==================================\n");
3013 printk("[ BUG: Nested lock was not taken ]\n");
3014 print_kernel_ident();
3015 printk("----------------------------------\n");
3016
3017 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
3018 print_lock(hlock);
3019
3020 printk("\nbut this task is not holding:\n");
3021 printk("%s\n", hlock->nest_lock->name);
3022
3023 printk("\nstack backtrace:\n");
3024 dump_stack();
3025
3026 printk("\nother info that might help us debug this:\n");
3027 lockdep_print_held_locks(curr);
3028
3029 printk("\nstack backtrace:\n");
3030 dump_stack();
3031
3032 return 0;
3033}
3034
3035static int __lock_is_held(struct lockdep_map *lock);
3036
3001/* 3037/*
3002 * This gets called for every mutex_lock*()/spin_lock*() operation. 3038 * This gets called for every mutex_lock*()/spin_lock*() operation.
3003 * We maintain the dependency maps and validate the locking attempt: 3039 * We maintain the dependency maps and validate the locking attempt:
@@ -3139,6 +3175,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3139 } 3175 }
3140 chain_key = iterate_chain_key(chain_key, id); 3176 chain_key = iterate_chain_key(chain_key, id);
3141 3177
3178 if (nest_lock && !__lock_is_held(nest_lock))
3179 return print_lock_nested_lock_not_held(curr, hlock, ip);
3180
3142 if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) 3181 if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
3143 return 0; 3182 return 0;
3144 3183
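
A sketch of the annotation this new check polices, using the existing
mutex_lock_nest_lock() helper (types and names here are illustrative):
several same-class child locks may be taken while the named nest lock is
held, and the hunk above now complains if that nest lock is not actually
held at acquire time.

#include <linux/mutex.h>

struct demo_parent {
        struct mutex parent_lock;
};

struct demo_child {
        struct mutex child_lock;
        struct demo_parent *parent;
};

static void demo_lock_two_children(struct demo_child *a, struct demo_child *b)
{
        struct demo_parent *p = a->parent;

        mutex_lock(&p->parent_lock);
        /* nest_lock annotation: lockdep also verifies parent_lock is held */
        mutex_lock_nest_lock(&a->child_lock, &p->parent_lock);
        mutex_lock_nest_lock(&b->child_lock, &p->parent_lock);
        /* ... */
        mutex_unlock(&b->child_lock);
        mutex_unlock(&a->child_lock);
        mutex_unlock(&p->parent_lock);
}
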
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 91c32a0b612c..b2c71c5873e4 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v)
39 39
40static void print_name(struct seq_file *m, struct lock_class *class) 40static void print_name(struct seq_file *m, struct lock_class *class)
41{ 41{
42 char str[128]; 42 char str[KSYM_NAME_LEN];
43 const char *name = class->name; 43 const char *name = class->name;
44 44
45 if (!name) { 45 if (!name) {
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
new file mode 100644
index 000000000000..246b4c6e6135
--- /dev/null
+++ b/kernel/modsign_certificate.S
@@ -0,0 +1,19 @@
1/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
2#ifndef SYMBOL_PREFIX
3#define ASM_SYMBOL(sym) sym
4#else
5#define PASTE2(x,y) x##y
6#define PASTE(x,y) PASTE2(x,y)
7#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
8#endif
9
10#define GLOBAL(name) \
11 .globl ASM_SYMBOL(name); \
12 ASM_SYMBOL(name):
13
14 .section ".init.data","aw"
15
16GLOBAL(modsign_certificate_list)
17 .incbin "signing_key.x509"
18 .incbin "extra_certificates"
19GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
new file mode 100644
index 000000000000..2b6e69909c39
--- /dev/null
+++ b/kernel/modsign_pubkey.c
@@ -0,0 +1,104 @@
1/* Public keys for module signature verification
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/cred.h>
15#include <linux/err.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19struct key *modsign_keyring;
20
21extern __initdata const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[];
23
24/*
25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
26 * if modsign.pub changes.
27 */
28static __initdata const char annoy_ccache[] = __TIME__ "foo";
29
30/*
 31 * Create the keyring that will hold the compiled-in keys
32 */
33static __init int module_verify_init(void)
34{
35 pr_notice("Initialise module verification\n");
36
37 modsign_keyring = keyring_alloc(".module_sign",
38 KUIDT_INIT(0), KGIDT_INIT(0),
39 current_cred(),
40 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
41 KEY_USR_VIEW | KEY_USR_READ),
42 KEY_ALLOC_NOT_IN_QUOTA, NULL);
43 if (IS_ERR(modsign_keyring))
44 panic("Can't allocate module signing keyring\n");
45
46 return 0;
47}
48
49/*
 50 * Must be initialised before we try to load the keys into the keyring.
51 */
52device_initcall(module_verify_init);
53
54/*
55 * Load the compiled-in keys
56 */
57static __init int load_module_signing_keys(void)
58{
59 key_ref_t key;
60 const u8 *p, *end;
61 size_t plen;
62
63 pr_notice("Loading module verification certificates\n");
64
65 end = modsign_certificate_list_end;
66 p = modsign_certificate_list;
67 while (p < end) {
68 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
69 * than 256 bytes in size.
70 */
71 if (end - p < 4)
72 goto dodgy_cert;
73 if (p[0] != 0x30 &&
74 p[1] != 0x82)
75 goto dodgy_cert;
76 plen = (p[2] << 8) | p[3];
77 plen += 4;
78 if (plen > end - p)
79 goto dodgy_cert;
80
81 key = key_create_or_update(make_key_ref(modsign_keyring, 1),
82 "asymmetric",
83 NULL,
84 p,
85 plen,
86 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
87 KEY_USR_VIEW,
88 KEY_ALLOC_NOT_IN_QUOTA);
89 if (IS_ERR(key))
90 pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
91 PTR_ERR(key));
92 else
93 pr_notice("MODSIGN: Loaded cert '%s'\n",
94 key_ref_to_ptr(key)->description);
95 p += plen;
96 }
97
98 return 0;
99
100dodgy_cert:
101 pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
102 return 0;
103}
104late_initcall(load_module_signing_keys);
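
Worked example of the length parsing in load_module_signing_keys(), for
illustration only: a DER certificate beginning 30 82 03 1e declares a
0x031e = 798 byte body, so the loop consumes 798 + 4 header bytes = 802
bytes before moving to the next concatenated certificate. The same
computation as a standalone helper:

#include <stddef.h>

static size_t der_cert_len(const unsigned char *p)
{
        /* 2-byte big-endian content length following the 30 82 header */
        return (((size_t)p[2] << 8) | p[3]) + 4;
}
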
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
new file mode 100644
index 000000000000..24f9247b7d02
--- /dev/null
+++ b/kernel/module-internal.h
@@ -0,0 +1,14 @@
1/* Module internals
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12extern struct key *modsign_keyring;
13
14extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index 4edbd9c11aca..250092c1d57d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -21,6 +21,7 @@
21#include <linux/ftrace_event.h> 21#include <linux/ftrace_event.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/file.h>
24#include <linux/fs.h> 25#include <linux/fs.h>
25#include <linux/sysfs.h> 26#include <linux/sysfs.h>
26#include <linux/kernel.h> 27#include <linux/kernel.h>
@@ -28,6 +29,7 @@
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
29#include <linux/elf.h> 30#include <linux/elf.h>
30#include <linux/proc_fs.h> 31#include <linux/proc_fs.h>
32#include <linux/security.h>
31#include <linux/seq_file.h> 33#include <linux/seq_file.h>
32#include <linux/syscalls.h> 34#include <linux/syscalls.h>
33#include <linux/fcntl.h> 35#include <linux/fcntl.h>
@@ -58,6 +60,9 @@
58#include <linux/jump_label.h> 60#include <linux/jump_label.h>
59#include <linux/pfn.h> 61#include <linux/pfn.h>
60#include <linux/bsearch.h> 62#include <linux/bsearch.h>
63#include <linux/fips.h>
64#include <uapi/linux/module.h>
65#include "module-internal.h"
61 66
62#define CREATE_TRACE_POINTS 67#define CREATE_TRACE_POINTS
63#include <trace/events/module.h> 68#include <trace/events/module.h>
@@ -102,6 +107,43 @@ static LIST_HEAD(modules);
102struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ 107struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
103#endif /* CONFIG_KGDB_KDB */ 108#endif /* CONFIG_KGDB_KDB */
104 109
110#ifdef CONFIG_MODULE_SIG
111#ifdef CONFIG_MODULE_SIG_FORCE
112static bool sig_enforce = true;
113#else
114static bool sig_enforce = false;
115
116static int param_set_bool_enable_only(const char *val,
117 const struct kernel_param *kp)
118{
119 int err;
120 bool test;
121 struct kernel_param dummy_kp = *kp;
122
123 dummy_kp.arg = &test;
124
125 err = param_set_bool(val, &dummy_kp);
126 if (err)
127 return err;
128
129 /* Don't let them unset it once it's set! */
130 if (!test && sig_enforce)
131 return -EROFS;
132
133 if (test)
134 sig_enforce = true;
135 return 0;
136}
137
138static const struct kernel_param_ops param_ops_bool_enable_only = {
139 .set = param_set_bool_enable_only,
140 .get = param_get_bool,
141};
142#define param_check_bool_enable_only param_check_bool
143
144module_param(sig_enforce, bool_enable_only, 0644);
145#endif /* !CONFIG_MODULE_SIG_FORCE */
146#endif /* CONFIG_MODULE_SIG */
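
Since sig_enforce is declared with module_param() inside module.c, the knob
should surface as module.sig_enforce on the kernel command line and under
/sys/module/module/parameters/, and the enable-only ops make it a one-way
switch. A hypothetical sketch of the same pattern applied to a driver
parameter (names are illustrative, not from this patch):

#include <linux/errno.h>
#include <linux/moduleparam.h>
#include <linux/string.h>

static bool demo_lockdown;

static int demo_set_enable_only(const char *val, const struct kernel_param *kp)
{
        bool new;
        int err = strtobool(val, &new);

        if (err)
                return err;
        if (!new && demo_lockdown)
                return -EROFS;          /* never allow clearing once set */
        if (new)
                demo_lockdown = true;
        return 0;
}

static const struct kernel_param_ops demo_lockdown_ops = {
        .set = demo_set_enable_only,
        .get = param_get_bool,
};
module_param_cb(demo_lockdown, &demo_lockdown_ops, &demo_lockdown, 0644);
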
105 147
106/* Block module loading/unloading? */ 148/* Block module loading/unloading? */
107int modules_disabled = 0; 149int modules_disabled = 0;
@@ -136,6 +178,7 @@ struct load_info {
136 unsigned long symoffs, stroffs; 178 unsigned long symoffs, stroffs;
137 struct _ddebug *debug; 179 struct _ddebug *debug;
138 unsigned int num_debug; 180 unsigned int num_debug;
181 bool sig_ok;
139 struct { 182 struct {
140 unsigned int sym, str, mod, vers, info, pcpu; 183 unsigned int sym, str, mod, vers, info, pcpu;
141 } index; 184 } index;
@@ -332,9 +375,6 @@ static bool check_symbol(const struct symsearch *syms,
332 printk(KERN_WARNING "Symbol %s is being used " 375 printk(KERN_WARNING "Symbol %s is being used "
333 "by a non-GPL module, which will not " 376 "by a non-GPL module, which will not "
334 "be allowed in the future\n", fsa->name); 377 "be allowed in the future\n", fsa->name);
335 printk(KERN_WARNING "Please see the file "
336 "Documentation/feature-removal-schedule.txt "
337 "in the kernel source tree for more details.\n");
338 } 378 }
339 } 379 }
340 380
@@ -1949,26 +1989,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1949 return ret; 1989 return ret;
1950} 1990}
1951 1991
1952int __weak apply_relocate(Elf_Shdr *sechdrs,
1953 const char *strtab,
1954 unsigned int symindex,
1955 unsigned int relsec,
1956 struct module *me)
1957{
1958 pr_err("module %s: REL relocation unsupported\n", me->name);
1959 return -ENOEXEC;
1960}
1961
1962int __weak apply_relocate_add(Elf_Shdr *sechdrs,
1963 const char *strtab,
1964 unsigned int symindex,
1965 unsigned int relsec,
1966 struct module *me)
1967{
1968 pr_err("module %s: RELA relocation unsupported\n", me->name);
1969 return -ENOEXEC;
1970}
1971
1972static int apply_relocations(struct module *mod, const struct load_info *info) 1992static int apply_relocations(struct module *mod, const struct load_info *info)
1973{ 1993{
1974 unsigned int i; 1994 unsigned int i;
@@ -2262,7 +2282,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2262 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2282 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2263 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2283 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2264 const Elf_Sym *src; 2284 const Elf_Sym *src;
2265 unsigned int i, nsrc, ndst, strtab_size; 2285 unsigned int i, nsrc, ndst, strtab_size = 0;
2266 2286
2267 /* Put symbol section at end of init part of module. */ 2287 /* Put symbol section at end of init part of module. */
2268 symsect->sh_flags |= SHF_ALLOC; 2288 symsect->sh_flags |= SHF_ALLOC;
@@ -2274,11 +2294,13 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2274 nsrc = symsect->sh_size / sizeof(*src); 2294 nsrc = symsect->sh_size / sizeof(*src);
2275 2295
2276 /* Compute total space required for the core symbols' strtab. */ 2296 /* Compute total space required for the core symbols' strtab. */
2277 for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) 2297 for (ndst = i = 0; i < nsrc; i++) {
2278 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { 2298 if (i == 0 ||
2279 strtab_size += strlen(&info->strtab[src->st_name]) + 1; 2299 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
2300 strtab_size += strlen(&info->strtab[src[i].st_name])+1;
2280 ndst++; 2301 ndst++;
2281 } 2302 }
2303 }
2282 2304
2283 /* Append room for core symbols at end of core part. */ 2305 /* Append room for core symbols at end of core part. */
2284 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2306 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
@@ -2312,15 +2334,14 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2312 mod->core_symtab = dst = mod->module_core + info->symoffs; 2334 mod->core_symtab = dst = mod->module_core + info->symoffs;
2313 mod->core_strtab = s = mod->module_core + info->stroffs; 2335 mod->core_strtab = s = mod->module_core + info->stroffs;
2314 src = mod->symtab; 2336 src = mod->symtab;
2315 *dst = *src; 2337 for (ndst = i = 0; i < mod->num_symtab; i++) {
2316 *s++ = 0; 2338 if (i == 0 ||
2317 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2339 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
2318 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) 2340 dst[ndst] = src[i];
2319 continue; 2341 dst[ndst++].st_name = s - mod->core_strtab;
2320 2342 s += strlcpy(s, &mod->strtab[src[i].st_name],
2321 dst[ndst] = *src; 2343 KSYM_NAME_LEN) + 1;
2322 dst[ndst++].st_name = s - mod->core_strtab; 2344 }
2323 s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
2324 } 2345 }
2325 mod->core_num_syms = ndst; 2346 mod->core_num_syms = ndst;
2326} 2347}
@@ -2353,7 +2374,7 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2353 2374
2354void * __weak module_alloc(unsigned long size) 2375void * __weak module_alloc(unsigned long size)
2355{ 2376{
2356 return size == 0 ? NULL : vmalloc_exec(size); 2377 return vmalloc_exec(size);
2357} 2378}
2358 2379
2359static void *module_alloc_update_bounds(unsigned long size) 2380static void *module_alloc_update_bounds(unsigned long size)
@@ -2399,48 +2420,136 @@ static inline void kmemleak_load_module(const struct module *mod,
2399} 2420}
2400#endif 2421#endif
2401 2422
2423#ifdef CONFIG_MODULE_SIG
2424static int module_sig_check(struct load_info *info)
2425{
2426 int err = -ENOKEY;
2427 const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
2428 const void *mod = info->hdr;
2429
2430 if (info->len > markerlen &&
2431 memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
2432 /* We truncate the module to discard the signature */
2433 info->len -= markerlen;
2434 err = mod_verify_sig(mod, &info->len);
2435 }
2436
2437 if (!err) {
2438 info->sig_ok = true;
2439 return 0;
2440 }
2441
2442 /* Not having a signature is only an error if we're strict. */
2443 if (err < 0 && fips_enabled)
2444 panic("Module verification failed with error %d in FIPS mode\n",
2445 err);
2446 if (err == -ENOKEY && !sig_enforce)
2447 err = 0;
2448
2449 return err;
2450}
2451#else /* !CONFIG_MODULE_SIG */
2452static int module_sig_check(struct load_info *info)
2453{
2454 return 0;
2455}
2456#endif /* !CONFIG_MODULE_SIG */
2457
2458/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
2459static int elf_header_check(struct load_info *info)
2460{
2461 if (info->len < sizeof(*(info->hdr)))
2462 return -ENOEXEC;
2463
2464 if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
2465 || info->hdr->e_type != ET_REL
2466 || !elf_check_arch(info->hdr)
2467 || info->hdr->e_shentsize != sizeof(Elf_Shdr))
2468 return -ENOEXEC;
2469
2470 if (info->hdr->e_shoff >= info->len
2471 || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
2472 info->len - info->hdr->e_shoff))
2473 return -ENOEXEC;
2474
2475 return 0;
2476}
2477
2402/* Sets info->hdr and info->len. */ 2478/* Sets info->hdr and info->len. */
2403static int copy_and_check(struct load_info *info, 2479static int copy_module_from_user(const void __user *umod, unsigned long len,
2404 const void __user *umod, unsigned long len, 2480 struct load_info *info)
2405 const char __user *uargs)
2406{ 2481{
2407 int err; 2482 int err;
2408 Elf_Ehdr *hdr;
2409 2483
2410 if (len < sizeof(*hdr)) 2484 info->len = len;
2485 if (info->len < sizeof(*(info->hdr)))
2411 return -ENOEXEC; 2486 return -ENOEXEC;
2412 2487
2488 err = security_kernel_module_from_file(NULL);
2489 if (err)
2490 return err;
2491
2413 /* Suck in entire file: we'll want most of it. */ 2492 /* Suck in entire file: we'll want most of it. */
2414 if ((hdr = vmalloc(len)) == NULL) 2493 info->hdr = vmalloc(info->len);
2494 if (!info->hdr)
2415 return -ENOMEM; 2495 return -ENOMEM;
2416 2496
2417 if (copy_from_user(hdr, umod, len) != 0) { 2497 if (copy_from_user(info->hdr, umod, info->len) != 0) {
2418 err = -EFAULT; 2498 vfree(info->hdr);
2419 goto free_hdr; 2499 return -EFAULT;
2420 } 2500 }
2421 2501
2422 /* Sanity checks against insmoding binaries or wrong arch, 2502 return 0;
2423 weird elf version */ 2503}
2424 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2504
2425 || hdr->e_type != ET_REL 2505/* Sets info->hdr and info->len. */
2426 || !elf_check_arch(hdr) 2506static int copy_module_from_fd(int fd, struct load_info *info)
2427 || hdr->e_shentsize != sizeof(Elf_Shdr)) { 2507{
2428 err = -ENOEXEC; 2508 struct file *file;
2429 goto free_hdr; 2509 int err;
2430 } 2510 struct kstat stat;
2511 loff_t pos;
2512 ssize_t bytes = 0;
2513
2514 file = fget(fd);
2515 if (!file)
2516 return -ENOEXEC;
2431 2517
2432 if (hdr->e_shoff >= len || 2518 err = security_kernel_module_from_file(file);
2433 hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { 2519 if (err)
2434 err = -ENOEXEC; 2520 goto out;
2435 goto free_hdr; 2521
2522 err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat);
2523 if (err)
2524 goto out;
2525
2526 if (stat.size > INT_MAX) {
2527 err = -EFBIG;
2528 goto out;
2529 }
2530 info->hdr = vmalloc(stat.size);
2531 if (!info->hdr) {
2532 err = -ENOMEM;
2533 goto out;
2436 } 2534 }
2437 2535
2438 info->hdr = hdr; 2536 pos = 0;
2439 info->len = len; 2537 while (pos < stat.size) {
2440 return 0; 2538 bytes = kernel_read(file, pos, (char *)(info->hdr) + pos,
2539 stat.size - pos);
2540 if (bytes < 0) {
2541 vfree(info->hdr);
2542 err = bytes;
2543 goto out;
2544 }
2545 if (bytes == 0)
2546 break;
2547 pos += bytes;
2548 }
2549 info->len = pos;
2441 2550
2442free_hdr: 2551out:
2443 vfree(hdr); 2552 fput(file);
2444 return err; 2553 return err;
2445} 2554}
2446 2555
@@ -2449,7 +2558,7 @@ static void free_copy(struct load_info *info)
2449 vfree(info->hdr); 2558 vfree(info->hdr);
2450} 2559}
2451 2560
2452static int rewrite_section_headers(struct load_info *info) 2561static int rewrite_section_headers(struct load_info *info, int flags)
2453{ 2562{
2454 unsigned int i; 2563 unsigned int i;
2455 2564
@@ -2477,7 +2586,10 @@ static int rewrite_section_headers(struct load_info *info)
2477 } 2586 }
2478 2587
2479 /* Track but don't keep modinfo and version sections. */ 2588 /* Track but don't keep modinfo and version sections. */
2480 info->index.vers = find_sec(info, "__versions"); 2589 if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
2590 info->index.vers = 0; /* Pretend no __versions section! */
2591 else
2592 info->index.vers = find_sec(info, "__versions");
2481 info->index.info = find_sec(info, ".modinfo"); 2593 info->index.info = find_sec(info, ".modinfo");
2482 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; 2594 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2483 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; 2595 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -2492,7 +2604,7 @@ static int rewrite_section_headers(struct load_info *info)
2492 * Return the temporary module pointer (we'll replace it with the final 2604 * Return the temporary module pointer (we'll replace it with the final
2493 * one when we move the module sections around). 2605 * one when we move the module sections around).
2494 */ 2606 */
2495static struct module *setup_load_info(struct load_info *info) 2607static struct module *setup_load_info(struct load_info *info, int flags)
2496{ 2608{
2497 unsigned int i; 2609 unsigned int i;
2498 int err; 2610 int err;
@@ -2503,7 +2615,7 @@ static struct module *setup_load_info(struct load_info *info)
2503 info->secstrings = (void *)info->hdr 2615 info->secstrings = (void *)info->hdr
2504 + info->sechdrs[info->hdr->e_shstrndx].sh_offset; 2616 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2505 2617
2506 err = rewrite_section_headers(info); 2618 err = rewrite_section_headers(info, flags);
2507 if (err) 2619 if (err)
2508 return ERR_PTR(err); 2620 return ERR_PTR(err);
2509 2621
@@ -2541,11 +2653,14 @@ static struct module *setup_load_info(struct load_info *info)
2541 return mod; 2653 return mod;
2542} 2654}
2543 2655
2544static int check_modinfo(struct module *mod, struct load_info *info) 2656static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2545{ 2657{
2546 const char *modmagic = get_modinfo(info, "vermagic"); 2658 const char *modmagic = get_modinfo(info, "vermagic");
2547 int err; 2659 int err;
2548 2660
2661 if (flags & MODULE_INIT_IGNORE_VERMAGIC)
2662 modmagic = NULL;
2663
2549 /* This is allowed: modprobe --force will invalidate it. */ 2664 /* This is allowed: modprobe --force will invalidate it. */
2550 if (!modmagic) { 2665 if (!modmagic) {
2551 err = try_to_force_load(mod, "bad vermagic"); 2666 err = try_to_force_load(mod, "bad vermagic");
@@ -2675,20 +2790,23 @@ static int move_module(struct module *mod, struct load_info *info)
2675 memset(ptr, 0, mod->core_size); 2790 memset(ptr, 0, mod->core_size);
2676 mod->module_core = ptr; 2791 mod->module_core = ptr;
2677 2792
2678 ptr = module_alloc_update_bounds(mod->init_size); 2793 if (mod->init_size) {
2679 /* 2794 ptr = module_alloc_update_bounds(mod->init_size);
2680 * The pointer to this block is stored in the module structure 2795 /*
2681 * which is inside the block. This block doesn't need to be 2796 * The pointer to this block is stored in the module structure
2682 * scanned as it contains data and code that will be freed 2797 * which is inside the block. This block doesn't need to be
2683 * after the module is initialized. 2798 * scanned as it contains data and code that will be freed
2684 */ 2799 * after the module is initialized.
2685 kmemleak_ignore(ptr); 2800 */
2686 if (!ptr && mod->init_size) { 2801 kmemleak_ignore(ptr);
2687 module_free(mod, mod->module_core); 2802 if (!ptr) {
2688 return -ENOMEM; 2803 module_free(mod, mod->module_core);
2689 } 2804 return -ENOMEM;
2690 memset(ptr, 0, mod->init_size); 2805 }
2691 mod->module_init = ptr; 2806 memset(ptr, 0, mod->init_size);
2807 mod->module_init = ptr;
2808 } else
2809 mod->module_init = NULL;
2692 2810
2693 /* Transfer each section which specifies SHF_ALLOC */ 2811 /* Transfer each section which specifies SHF_ALLOC */
2694 pr_debug("final section addresses:\n"); 2812 pr_debug("final section addresses:\n");
@@ -2730,6 +2848,10 @@ static int check_module_license_and_versions(struct module *mod)
2730 if (strcmp(mod->name, "driverloader") == 0) 2848 if (strcmp(mod->name, "driverloader") == 0)
2731 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2849 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2732 2850
2851 /* lve claims to be GPL but upstream won't provide source */
2852 if (strcmp(mod->name, "lve") == 0)
2853 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2854
2733#ifdef CONFIG_MODVERSIONS 2855#ifdef CONFIG_MODVERSIONS
2734 if ((mod->num_syms && !mod->crcs) 2856 if ((mod->num_syms && !mod->crcs)
2735 || (mod->num_gpl_syms && !mod->gpl_crcs) 2857 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2777,18 +2899,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2777 return 0; 2899 return 0;
2778} 2900}
2779 2901
2780static struct module *layout_and_allocate(struct load_info *info) 2902static struct module *layout_and_allocate(struct load_info *info, int flags)
2781{ 2903{
2782 /* Module within temporary copy. */ 2904 /* Module within temporary copy. */
2783 struct module *mod; 2905 struct module *mod;
2784 Elf_Shdr *pcpusec; 2906 Elf_Shdr *pcpusec;
2785 int err; 2907 int err;
2786 2908
2787 mod = setup_load_info(info); 2909 mod = setup_load_info(info, flags);
2788 if (IS_ERR(mod)) 2910 if (IS_ERR(mod))
2789 return mod; 2911 return mod;
2790 2912
2791 err = check_modinfo(mod, info); 2913 err = check_modinfo(mod, info, flags);
2792 if (err) 2914 if (err)
2793 return ERR_PTR(err); 2915 return ERR_PTR(err);
2794 2916
@@ -2861,31 +2983,142 @@ static int post_relocation(struct module *mod, const struct load_info *info)
2861 return module_finalize(info->hdr, info->sechdrs, mod); 2983 return module_finalize(info->hdr, info->sechdrs, mod);
2862} 2984}
2863 2985
2986/* Is this module of this name done loading? No locks held. */
2987static bool finished_loading(const char *name)
2988{
2989 struct module *mod;
2990 bool ret;
2991
2992 mutex_lock(&module_mutex);
2993 mod = find_module(name);
2994 ret = !mod || mod->state != MODULE_STATE_COMING;
2995 mutex_unlock(&module_mutex);
2996
2997 return ret;
2998}
2999
3000/* Call module constructors. */
3001static void do_mod_ctors(struct module *mod)
3002{
3003#ifdef CONFIG_CONSTRUCTORS
3004 unsigned long i;
3005
3006 for (i = 0; i < mod->num_ctors; i++)
3007 mod->ctors[i]();
3008#endif
3009}
3010
3011/* This is where the real work happens */
3012static int do_init_module(struct module *mod)
3013{
3014 int ret = 0;
3015
3016 blocking_notifier_call_chain(&module_notify_list,
3017 MODULE_STATE_COMING, mod);
3018
3019 /* Set RO and NX regions for core */
3020 set_section_ro_nx(mod->module_core,
3021 mod->core_text_size,
3022 mod->core_ro_size,
3023 mod->core_size);
3024
3025 /* Set RO and NX regions for init */
3026 set_section_ro_nx(mod->module_init,
3027 mod->init_text_size,
3028 mod->init_ro_size,
3029 mod->init_size);
3030
3031 do_mod_ctors(mod);
3032 /* Start the module */
3033 if (mod->init != NULL)
3034 ret = do_one_initcall(mod->init);
3035 if (ret < 0) {
3036 /* Init routine failed: abort. Try to protect us from
3037 buggy refcounters. */
3038 mod->state = MODULE_STATE_GOING;
3039 synchronize_sched();
3040 module_put(mod);
3041 blocking_notifier_call_chain(&module_notify_list,
3042 MODULE_STATE_GOING, mod);
3043 free_module(mod);
3044 wake_up_all(&module_wq);
3045 return ret;
3046 }
3047 if (ret > 0) {
3048 printk(KERN_WARNING
3049"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3050"%s: loading module anyway...\n",
3051 __func__, mod->name, ret,
3052 __func__);
3053 dump_stack();
3054 }
3055
3056 /* Now it's a first class citizen! */
3057 mod->state = MODULE_STATE_LIVE;
3058 blocking_notifier_call_chain(&module_notify_list,
3059 MODULE_STATE_LIVE, mod);
3060
3061 /* We need to finish all async code before the module init sequence is done */
3062 async_synchronize_full();
3063
3064 mutex_lock(&module_mutex);
3065 /* Drop initial reference. */
3066 module_put(mod);
3067 trim_init_extable(mod);
3068#ifdef CONFIG_KALLSYMS
3069 mod->num_symtab = mod->core_num_syms;
3070 mod->symtab = mod->core_symtab;
3071 mod->strtab = mod->core_strtab;
3072#endif
3073 unset_module_init_ro_nx(mod);
3074 module_free(mod, mod->module_init);
3075 mod->module_init = NULL;
3076 mod->init_size = 0;
3077 mod->init_ro_size = 0;
3078 mod->init_text_size = 0;
3079 mutex_unlock(&module_mutex);
3080 wake_up_all(&module_wq);
3081
3082 return 0;
3083}
3084
3085static int may_init_module(void)
3086{
3087 if (!capable(CAP_SYS_MODULE) || modules_disabled)
3088 return -EPERM;
3089
3090 return 0;
3091}
3092
2864/* Allocate and load the module: note that size of section 0 is always 3093/* Allocate and load the module: note that size of section 0 is always
2865 zero, and we rely on this for optional sections. */ 3094 zero, and we rely on this for optional sections. */
2866static struct module *load_module(void __user *umod, 3095static int load_module(struct load_info *info, const char __user *uargs,
2867 unsigned long len, 3096 int flags)
2868 const char __user *uargs)
2869{ 3097{
2870 struct load_info info = { NULL, }; 3098 struct module *mod, *old;
2871 struct module *mod;
2872 long err; 3099 long err;
2873 3100
2874 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", 3101 err = module_sig_check(info);
2875 umod, len, uargs); 3102 if (err)
3103 goto free_copy;
2876 3104
2877 /* Copy in the blobs from userspace, check they are vaguely sane. */ 3105 err = elf_header_check(info);
2878 err = copy_and_check(&info, umod, len, uargs);
2879 if (err) 3106 if (err)
2880 return ERR_PTR(err); 3107 goto free_copy;
2881 3108
2882 /* Figure out module layout, and allocate all the memory. */ 3109 /* Figure out module layout, and allocate all the memory. */
2883 mod = layout_and_allocate(&info); 3110 mod = layout_and_allocate(info, flags);
2884 if (IS_ERR(mod)) { 3111 if (IS_ERR(mod)) {
2885 err = PTR_ERR(mod); 3112 err = PTR_ERR(mod);
2886 goto free_copy; 3113 goto free_copy;
2887 } 3114 }
2888 3115
3116#ifdef CONFIG_MODULE_SIG
3117 mod->sig_ok = info->sig_ok;
3118 if (!mod->sig_ok)
3119 add_taint_module(mod, TAINT_FORCED_MODULE);
3120#endif
3121
2889 /* Now module is in final location, initialize linked lists, etc. */ 3122 /* Now module is in final location, initialize linked lists, etc. */
2890 err = module_unload_init(mod); 3123 err = module_unload_init(mod);
2891 if (err) 3124 if (err)
@@ -2893,25 +3126,25 @@ static struct module *load_module(void __user *umod,
2893 3126
2894 /* Now we've got everything in the final locations, we can 3127 /* Now we've got everything in the final locations, we can
2895 * find optional sections. */ 3128 * find optional sections. */
2896 find_module_sections(mod, &info); 3129 find_module_sections(mod, info);
2897 3130
2898 err = check_module_license_and_versions(mod); 3131 err = check_module_license_and_versions(mod);
2899 if (err) 3132 if (err)
2900 goto free_unload; 3133 goto free_unload;
2901 3134
2902 /* Set up MODINFO_ATTR fields */ 3135 /* Set up MODINFO_ATTR fields */
2903 setup_modinfo(mod, &info); 3136 setup_modinfo(mod, info);
2904 3137
2905 /* Fix up syms, so that st_value is a pointer to location. */ 3138 /* Fix up syms, so that st_value is a pointer to location. */
2906 err = simplify_symbols(mod, &info); 3139 err = simplify_symbols(mod, info);
2907 if (err < 0) 3140 if (err < 0)
2908 goto free_modinfo; 3141 goto free_modinfo;
2909 3142
2910 err = apply_relocations(mod, &info); 3143 err = apply_relocations(mod, info);
2911 if (err < 0) 3144 if (err < 0)
2912 goto free_modinfo; 3145 goto free_modinfo;
2913 3146
2914 err = post_relocation(mod, &info); 3147 err = post_relocation(mod, info);
2915 if (err < 0) 3148 if (err < 0)
2916 goto free_modinfo; 3149 goto free_modinfo;
2917 3150
@@ -2934,21 +3167,31 @@ static struct module *load_module(void __user *umod,
2934 * function to insert in a way safe to concurrent readers. 3167 * function to insert in a way safe to concurrent readers.
2935 * The mutex protects against concurrent writers. 3168 * The mutex protects against concurrent writers.
2936 */ 3169 */
3170again:
2937 mutex_lock(&module_mutex); 3171 mutex_lock(&module_mutex);
2938 if (find_module(mod->name)) { 3172 if ((old = find_module(mod->name)) != NULL) {
3173 if (old->state == MODULE_STATE_COMING) {
3174 /* Wait in case it fails to load. */
3175 mutex_unlock(&module_mutex);
3176 err = wait_event_interruptible(module_wq,
3177 finished_loading(mod->name));
3178 if (err)
3179 goto free_arch_cleanup;
3180 goto again;
3181 }
2939 err = -EEXIST; 3182 err = -EEXIST;
2940 goto unlock; 3183 goto unlock;
2941 } 3184 }
2942 3185
2943 /* This has to be done once we're sure module name is unique. */ 3186 /* This has to be done once we're sure module name is unique. */
2944 dynamic_debug_setup(info.debug, info.num_debug); 3187 dynamic_debug_setup(info->debug, info->num_debug);
2945 3188
2946 /* Find duplicate symbols */ 3189 /* Find duplicate symbols */
2947 err = verify_export_symbols(mod); 3190 err = verify_export_symbols(mod);
2948 if (err < 0) 3191 if (err < 0)
2949 goto ddebug; 3192 goto ddebug;
2950 3193
2951 module_bug_finalize(info.hdr, info.sechdrs, mod); 3194 module_bug_finalize(info->hdr, info->sechdrs, mod);
2952 list_add_rcu(&mod->list, &modules); 3195 list_add_rcu(&mod->list, &modules);
2953 mutex_unlock(&module_mutex); 3196 mutex_unlock(&module_mutex);
2954 3197
@@ -2959,25 +3202,26 @@ static struct module *load_module(void __user *umod,
2959 goto unlink; 3202 goto unlink;
2960 3203
2961 /* Link in to sysfs. */ 3204
2962 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); 3205 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
2963 if (err < 0) 3206 if (err < 0)
2964 goto unlink; 3207 goto unlink;
2965 3208
2966 /* Get rid of temporary copy. */ 3209 /* Get rid of temporary copy. */
2967 free_copy(&info); 3210 free_copy(info);
2968 3211
2969 /* Done! */ 3212 /* Done! */
2970 trace_module_load(mod); 3213 trace_module_load(mod);
2971 return mod; 3214
3215 return do_init_module(mod);
2972 3216
2973 unlink: 3217 unlink:
2974 mutex_lock(&module_mutex); 3218 mutex_lock(&module_mutex);
2975 /* Unlink carefully: kallsyms could be walking list. */ 3219 /* Unlink carefully: kallsyms could be walking list. */
2976 list_del_rcu(&mod->list); 3220 list_del_rcu(&mod->list);
2977 module_bug_cleanup(mod); 3221 module_bug_cleanup(mod);
2978 3222 wake_up_all(&module_wq);
2979 ddebug: 3223 ddebug:
2980 dynamic_debug_remove(info.debug); 3224 dynamic_debug_remove(info->debug);
2981 unlock: 3225 unlock:
2982 mutex_unlock(&module_mutex); 3226 mutex_unlock(&module_mutex);
2983 synchronize_sched(); 3227 synchronize_sched();
@@ -2989,106 +3233,52 @@ static struct module *load_module(void __user *umod,
2989 free_unload: 3233 free_unload:
2990 module_unload_free(mod); 3234 module_unload_free(mod);
2991 free_module: 3235 free_module:
2992 module_deallocate(mod, &info); 3236 module_deallocate(mod, info);
2993 free_copy: 3237 free_copy:
2994 free_copy(&info); 3238 free_copy(info);
2995 return ERR_PTR(err); 3239 return err;
2996}
2997
2998/* Call module constructors. */
2999static void do_mod_ctors(struct module *mod)
3000{
3001#ifdef CONFIG_CONSTRUCTORS
3002 unsigned long i;
3003
3004 for (i = 0; i < mod->num_ctors; i++)
3005 mod->ctors[i]();
3006#endif
3007} 3240}
3008 3241
3009/* This is where the real work happens */
3010SYSCALL_DEFINE3(init_module, void __user *, umod, 3242SYSCALL_DEFINE3(init_module, void __user *, umod,
3011 unsigned long, len, const char __user *, uargs) 3243 unsigned long, len, const char __user *, uargs)
3012{ 3244{
3013 struct module *mod; 3245 int err;
3014 int ret = 0; 3246 struct load_info info = { };
3015 3247
3016 /* Must have permission */ 3248 err = may_init_module();
3017 if (!capable(CAP_SYS_MODULE) || modules_disabled) 3249 if (err)
3018 return -EPERM; 3250 return err;
3019 3251
3020 /* Do all the hard work */ 3252 pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
3021 mod = load_module(umod, len, uargs); 3253 umod, len, uargs);
3022 if (IS_ERR(mod))
3023 return PTR_ERR(mod);
3024 3254
3025 blocking_notifier_call_chain(&module_notify_list, 3255 err = copy_module_from_user(umod, len, &info);
3026 MODULE_STATE_COMING, mod); 3256 if (err)
3257 return err;
3027 3258
3028 /* Set RO and NX regions for core */ 3259 return load_module(&info, uargs, 0);
3029 set_section_ro_nx(mod->module_core, 3260}
3030 mod->core_text_size,
3031 mod->core_ro_size,
3032 mod->core_size);
3033 3261
3034 /* Set RO and NX regions for init */ 3262SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
3035 set_section_ro_nx(mod->module_init, 3263{
3036 mod->init_text_size, 3264 int err;
3037 mod->init_ro_size, 3265 struct load_info info = { };
3038 mod->init_size);
3039 3266
3040 do_mod_ctors(mod); 3267 err = may_init_module();
3041 /* Start the module */ 3268 if (err)
3042 if (mod->init != NULL) 3269 return err;
3043 ret = do_one_initcall(mod->init);
3044 if (ret < 0) {
3045 /* Init routine failed: abort. Try to protect us from
3046 buggy refcounters. */
3047 mod->state = MODULE_STATE_GOING;
3048 synchronize_sched();
3049 module_put(mod);
3050 blocking_notifier_call_chain(&module_notify_list,
3051 MODULE_STATE_GOING, mod);
3052 free_module(mod);
3053 wake_up(&module_wq);
3054 return ret;
3055 }
3056 if (ret > 0) {
3057 printk(KERN_WARNING
3058"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3059"%s: loading module anyway...\n",
3060 __func__, mod->name, ret,
3061 __func__);
3062 dump_stack();
3063 }
3064 3270
3065 /* Now it's a first class citizen! Wake up anyone waiting for it. */ 3271 pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
3066 mod->state = MODULE_STATE_LIVE;
3067 wake_up(&module_wq);
3068 blocking_notifier_call_chain(&module_notify_list,
3069 MODULE_STATE_LIVE, mod);
3070 3272
3071 /* We need to finish all async code before the module init sequence is done */ 3273 if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
3072 async_synchronize_full(); 3274 |MODULE_INIT_IGNORE_VERMAGIC))
3275 return -EINVAL;
3073 3276
3074 mutex_lock(&module_mutex); 3277 err = copy_module_from_fd(fd, &info);
3075 /* Drop initial reference. */ 3278 if (err)
3076 module_put(mod); 3279 return err;
3077 trim_init_extable(mod);
3078#ifdef CONFIG_KALLSYMS
3079 mod->num_symtab = mod->core_num_syms;
3080 mod->symtab = mod->core_symtab;
3081 mod->strtab = mod->core_strtab;
3082#endif
3083 unset_module_init_ro_nx(mod);
3084 module_free(mod, mod->module_init);
3085 mod->module_init = NULL;
3086 mod->init_size = 0;
3087 mod->init_ro_size = 0;
3088 mod->init_text_size = 0;
3089 mutex_unlock(&module_mutex);
3090 3280
3091 return 0; 3281 return load_module(&info, uargs, flags);
3092} 3282}
3093 3283
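
A userspace sketch of invoking the new syscall; it assumes the architecture
headers already define __NR_finit_module and that no libc wrapper exists
yet, so syscall(2) is used directly. A flags value of 0 keeps the normal
modversion/vermagic checks; MODULE_INIT_IGNORE_MODVERSIONS and
MODULE_INIT_IGNORE_VERMAGIC relax them as handled above.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <module.ko>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* flags == 0; the kernel reads and verifies the file itself. */
        if (syscall(__NR_finit_module, fd, "", 0) != 0) {
                perror("finit_module");
                return 1;
        }
        return 0;
}
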
3094static inline int within(unsigned long addr, void *start, unsigned long size) 3284static inline int within(unsigned long addr, void *start, unsigned long size)
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
new file mode 100644
index 000000000000..f2970bddc5ea
--- /dev/null
+++ b/kernel/module_signing.c
@@ -0,0 +1,249 @@
1/* Module signature checker
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/err.h>
14#include <crypto/public_key.h>
15#include <crypto/hash.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19/*
20 * Module signature information block.
21 *
22 * The constituents of the signature section are, in order:
23 *
24 * - Signer's name
25 * - Key identifier
26 * - Signature data
27 * - Information block
28 */
29struct module_signature {
30 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
31 u8 hash; /* Digest algorithm [enum pkey_hash_algo] */
32 u8 id_type; /* Key identifier type [enum pkey_id_type] */
33 u8 signer_len; /* Length of signer's name */
34 u8 key_id_len; /* Length of key identifier */
35 u8 __pad[3];
36 __be32 sig_len; /* Length of signature data */
37};
38
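
For illustration, a userspace sketch (not part of the patch) of the
appended layout this struct describes: payload, signer's name, key
identifier, signature data, struct module_signature, then the magic
marker, which is assumed here to be "~Module signature appended~\n".

#include <arpa/inet.h>          /* ntohl() for the big-endian sig_len */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct module_signature {
        uint8_t  algo, hash, id_type;
        uint8_t  signer_len, key_id_len;
        uint8_t  pad[3];
        uint32_t sig_len;               /* big-endian */
};

static int describe_signature(const uint8_t *ko, size_t len)
{
        static const char magic[] = "~Module signature appended~\n";
        const size_t mlen = sizeof(magic) - 1;
        struct module_signature ms;

        if (len < mlen + sizeof(ms) ||
            memcmp(ko + len - mlen, magic, mlen) != 0)
                return -1;              /* not a signed module */

        memcpy(&ms, ko + len - mlen - sizeof(ms), sizeof(ms));
        printf("signer %u bytes, key id %u bytes, signature %u bytes\n",
               (unsigned)ms.signer_len, (unsigned)ms.key_id_len,
               (unsigned)ntohl(ms.sig_len));
        return 0;
}
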
39/*
40 * Digest the module contents.
41 */
42static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
43 const void *mod,
44 unsigned long modlen)
45{
46 struct public_key_signature *pks;
47 struct crypto_shash *tfm;
48 struct shash_desc *desc;
49 size_t digest_size, desc_size;
50 int ret;
51
52 pr_devel("==>%s()\n", __func__);
53
54 /* Allocate the hashing algorithm we're going to need and find out how
55 * big the hash operational data will be.
56 */
57 tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0);
58 if (IS_ERR(tfm))
59 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
60
61 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
62 digest_size = crypto_shash_digestsize(tfm);
63
64 /* We allocate the hash operational data storage on the end of our
65 * context data and the digest output buffer on the end of that.
66 */
67 ret = -ENOMEM;
68 pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
69 if (!pks)
70 goto error_no_pks;
71
72 pks->pkey_hash_algo = hash;
73 pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
74 pks->digest_size = digest_size;
75
76 desc = (void *)pks + sizeof(*pks);
77 desc->tfm = tfm;
78 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
79
80 ret = crypto_shash_init(desc);
81 if (ret < 0)
82 goto error;
83
84 ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
85 if (ret < 0)
86 goto error;
87
88 crypto_free_shash(tfm);
89 pr_devel("<==%s() = ok\n", __func__);
90 return pks;
91
92error:
93 kfree(pks);
94error_no_pks:
95 crypto_free_shash(tfm);
96 pr_devel("<==%s() = %d\n", __func__, ret);
97 return ERR_PTR(ret);
98}
99
100/*
101 * Extract an MPI array from the signature data. This represents the actual
102 * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
103 * size of the MPI in bytes.
104 *
105 * RSA signatures only have one MPI, so currently we only read one.
106 */
107static int mod_extract_mpi_array(struct public_key_signature *pks,
108 const void *data, size_t len)
109{
110 size_t nbytes;
111 MPI mpi;
112
113 if (len < 3)
114 return -EBADMSG;
115 nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
116 data += 2;
117 len -= 2;
118 if (len != nbytes)
119 return -EBADMSG;
120
121 mpi = mpi_read_raw_data(data, nbytes);
122 if (!mpi)
123 return -ENOMEM;
124 pks->mpi[0] = mpi;
125 pks->nr_mpi = 1;
126 return 0;
127}
128
129/*
130 * Request an asymmetric key.
131 */
132static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
133 const u8 *key_id, size_t key_id_len)
134{
135 key_ref_t key;
136 size_t i;
137 char *id, *q;
138
139 pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
140
141 /* Construct an identifier. */
142 id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
143 if (!id)
144 return ERR_PTR(-ENOKEY);
145
146 memcpy(id, signer, signer_len);
147
148 q = id + signer_len;
149 *q++ = ':';
150 *q++ = ' ';
151 for (i = 0; i < key_id_len; i++) {
152 *q++ = hex_asc[*key_id >> 4];
153 *q++ = hex_asc[*key_id++ & 0x0f];
154 }
155
156 *q = 0;
157
158 pr_debug("Look up: \"%s\"\n", id);
159
160 key = keyring_search(make_key_ref(modsign_keyring, 1),
161 &key_type_asymmetric, id);
162 if (IS_ERR(key))
163 pr_warn("Request for unknown module key '%s' err %ld\n",
164 id, PTR_ERR(key));
165 kfree(id);
166
167 if (IS_ERR(key)) {
168 switch (PTR_ERR(key)) {
169 /* Hide some search errors */
170 case -EACCES:
171 case -ENOTDIR:
172 case -EAGAIN:
173 return ERR_PTR(-ENOKEY);
174 default:
175 return ERR_CAST(key);
176 }
177 }
178
179 pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
180 return key_ref_to_ptr(key);
181}
182
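
The lookup string built above is the signer's name, a colon and a space,
then the key id in lowercase hex; a hypothetical signer "Acme signing key"
with key id bytes de ad be ef would be searched for as
"Acme signing key: deadbeef". The same formatting as a small userspace
sketch:

#include <stdio.h>

static void print_lookup_id(const char *signer, const unsigned char *id,
                            size_t id_len)
{
        size_t i;

        printf("%s: ", signer);
        for (i = 0; i < id_len; i++)
                printf("%02x", id[i]);
        putchar('\n');
}
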
183/*
184 * Verify the signature on a module.
185 */
186int mod_verify_sig(const void *mod, unsigned long *_modlen)
187{
188 struct public_key_signature *pks;
189 struct module_signature ms;
190 struct key *key;
191 const void *sig;
192 size_t modlen = *_modlen, sig_len;
193 int ret;
194
195 pr_devel("==>%s(,%zu)\n", __func__, modlen);
196
197 if (modlen <= sizeof(ms))
198 return -EBADMSG;
199
200 memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
201 modlen -= sizeof(ms);
202
203 sig_len = be32_to_cpu(ms.sig_len);
204 if (sig_len >= modlen)
205 return -EBADMSG;
206 modlen -= sig_len;
207 if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
208 return -EBADMSG;
209 modlen -= (size_t)ms.signer_len + ms.key_id_len;
210
211 *_modlen = modlen;
212 sig = mod + modlen;
213
214 /* For the moment, only support RSA and X.509 identifiers */
215 if (ms.algo != PKEY_ALGO_RSA ||
216 ms.id_type != PKEY_ID_X509)
217 return -ENOPKG;
218
219 if (ms.hash >= PKEY_HASH__LAST ||
220 !pkey_hash_algo[ms.hash])
221 return -ENOPKG;
222
223 key = request_asymmetric_key(sig, ms.signer_len,
224 sig + ms.signer_len, ms.key_id_len);
225 if (IS_ERR(key))
226 return PTR_ERR(key);
227
228 pks = mod_make_digest(ms.hash, mod, modlen);
229 if (IS_ERR(pks)) {
230 ret = PTR_ERR(pks);
231 goto error_put_key;
232 }
233
234 ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
235 sig_len);
236 if (ret < 0)
237 goto error_free_pks;
238
239 ret = verify_signature(key, pks);
240 pr_devel("verify_signature() = %d\n", ret);
241
242error_free_pks:
243 mpi_free(pks->rsa.s);
244 kfree(pks);
245error_put_key:
246 key_put(key);
247 pr_devel("<==%s() = %d\n", __func__, ret);
248 return ret;
249}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index b576f7f14bc6..78e2ecb20165 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void)
57 * leave it to the caller to do proper locking and attach it to task. 57 * leave it to the caller to do proper locking and attach it to task.
58 */ 58 */
59static struct nsproxy *create_new_namespaces(unsigned long flags, 59static struct nsproxy *create_new_namespaces(unsigned long flags,
60 struct task_struct *tsk, struct fs_struct *new_fs) 60 struct task_struct *tsk, struct user_namespace *user_ns,
61 struct fs_struct *new_fs)
61{ 62{
62 struct nsproxy *new_nsp; 63 struct nsproxy *new_nsp;
63 int err; 64 int err;
@@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
66 if (!new_nsp) 67 if (!new_nsp)
67 return ERR_PTR(-ENOMEM); 68 return ERR_PTR(-ENOMEM);
68 69
69 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); 70 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
70 if (IS_ERR(new_nsp->mnt_ns)) { 71 if (IS_ERR(new_nsp->mnt_ns)) {
71 err = PTR_ERR(new_nsp->mnt_ns); 72 err = PTR_ERR(new_nsp->mnt_ns);
72 goto out_ns; 73 goto out_ns;
73 } 74 }
74 75
75 new_nsp->uts_ns = copy_utsname(flags, tsk); 76 new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
76 if (IS_ERR(new_nsp->uts_ns)) { 77 if (IS_ERR(new_nsp->uts_ns)) {
77 err = PTR_ERR(new_nsp->uts_ns); 78 err = PTR_ERR(new_nsp->uts_ns);
78 goto out_uts; 79 goto out_uts;
79 } 80 }
80 81
81 new_nsp->ipc_ns = copy_ipcs(flags, tsk); 82 new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
82 if (IS_ERR(new_nsp->ipc_ns)) { 83 if (IS_ERR(new_nsp->ipc_ns)) {
83 err = PTR_ERR(new_nsp->ipc_ns); 84 err = PTR_ERR(new_nsp->ipc_ns);
84 goto out_ipc; 85 goto out_ipc;
85 } 86 }
86 87
87 new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); 88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
88 if (IS_ERR(new_nsp->pid_ns)) { 89 if (IS_ERR(new_nsp->pid_ns)) {
89 err = PTR_ERR(new_nsp->pid_ns); 90 err = PTR_ERR(new_nsp->pid_ns);
90 goto out_pid; 91 goto out_pid;
91 } 92 }
92 93
93 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); 94 new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
94 if (IS_ERR(new_nsp->net_ns)) { 95 if (IS_ERR(new_nsp->net_ns)) {
95 err = PTR_ERR(new_nsp->net_ns); 96 err = PTR_ERR(new_nsp->net_ns);
96 goto out_net; 97 goto out_net;
@@ -122,6 +123,7 @@ out_ns:
122int copy_namespaces(unsigned long flags, struct task_struct *tsk) 123int copy_namespaces(unsigned long flags, struct task_struct *tsk)
123{ 124{
124 struct nsproxy *old_ns = tsk->nsproxy; 125 struct nsproxy *old_ns = tsk->nsproxy;
126 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
125 struct nsproxy *new_ns; 127 struct nsproxy *new_ns;
126 int err = 0; 128 int err = 0;
127 129
@@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
134 CLONE_NEWPID | CLONE_NEWNET))) 136 CLONE_NEWPID | CLONE_NEWNET)))
135 return 0; 137 return 0;
136 138
137 if (!capable(CAP_SYS_ADMIN)) { 139 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
138 err = -EPERM; 140 err = -EPERM;
139 goto out; 141 goto out;
140 } 142 }
@@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
151 goto out; 153 goto out;
152 } 154 }
153 155
154 new_ns = create_new_namespaces(flags, tsk, tsk->fs); 156 new_ns = create_new_namespaces(flags, tsk,
157 task_cred_xxx(tsk, user_ns), tsk->fs);
155 if (IS_ERR(new_ns)) { 158 if (IS_ERR(new_ns)) {
156 err = PTR_ERR(new_ns); 159 err = PTR_ERR(new_ns);
157 goto out; 160 goto out;
@@ -183,19 +186,21 @@ void free_nsproxy(struct nsproxy *ns)
183 * On success, returns the new nsproxy. 186 * On success, returns the new nsproxy.
184 */ 187 */
185int unshare_nsproxy_namespaces(unsigned long unshare_flags, 188int unshare_nsproxy_namespaces(unsigned long unshare_flags,
186 struct nsproxy **new_nsp, struct fs_struct *new_fs) 189 struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
187{ 190{
191 struct user_namespace *user_ns;
188 int err = 0; 192 int err = 0;
189 193
190 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 194 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
191 CLONE_NEWNET))) 195 CLONE_NEWNET | CLONE_NEWPID)))
192 return 0; 196 return 0;
193 197
194 if (!capable(CAP_SYS_ADMIN)) 198 user_ns = new_cred ? new_cred->user_ns : current_user_ns();
199 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
195 return -EPERM; 200 return -EPERM;
196 201
197 *new_nsp = create_new_namespaces(unshare_flags, current, 202 *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
198 new_fs ? new_fs : current->fs); 203 new_fs ? new_fs : current->fs);
199 if (IS_ERR(*new_nsp)) { 204 if (IS_ERR(*new_nsp)) {
200 err = PTR_ERR(*new_nsp); 205 err = PTR_ERR(*new_nsp);
201 goto out; 206 goto out;
@@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
241 struct file *file; 246 struct file *file;
242 int err; 247 int err;
243 248
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd); 249 file = proc_ns_fget(fd);
248 if (IS_ERR(file)) 250 if (IS_ERR(file))
249 return PTR_ERR(file); 251 return PTR_ERR(file);
@@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
254 if (nstype && (ops->type != nstype)) 256 if (nstype && (ops->type != nstype))
255 goto out; 257 goto out;
256 258
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); 259 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
258 if (IS_ERR(new_nsproxy)) { 260 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy); 261 err = PTR_ERR(new_nsproxy);
260 goto out; 262 goto out;
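
For context, a userspace sketch of setns(2), whose blanket CAP_SYS_ADMIN
check the hunk above drops (each namespace's install path is presumably
expected to apply its own ns_capable() check); it assumes a libc that
provides the setns() wrapper.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd;

        if (argc < 2) {
                fprintf(stderr, "usage: %s /proc/<pid>/ns/<ns>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || setns(fd, 0) != 0) {
                perror("setns");
                return 1;
        }
        /* Now running in the target namespace; run something there. */
        execlp("sh", "sh", (char *)NULL);
        perror("execlp");
        return 1;
}
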
diff --git a/kernel/padata.c b/kernel/padata.c
index 89fe3d1b9efb..072f4ee4eb89 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
171{ 171{
172 int cpu, num_cpus; 172 int cpu, num_cpus;
173 unsigned int next_nr, next_index; 173 unsigned int next_nr, next_index;
174 struct padata_parallel_queue *queue, *next_queue; 174 struct padata_parallel_queue *next_queue;
175 struct padata_priv *padata; 175 struct padata_priv *padata;
176 struct padata_list *reorder; 176 struct padata_list *reorder;
177 177
@@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
204 goto out; 204 goto out;
205 } 205 }
206 206
207 queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); 207 if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
208 if (queue->cpu_index == next_queue->cpu_index) {
209 padata = ERR_PTR(-ENODATA); 208 padata = ERR_PTR(-ENODATA);
210 goto out; 209 goto out;
211 } 210 }
diff --git a/kernel/pid.c b/kernel/pid.c
index e86b291ad834..36aa02ff17d6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Generic pidhash and scalable, time-bounded PID allocator 2 * Generic pidhash and scalable, time-bounded PID allocator
3 * 3 *
4 * (C) 2002-2003 William Irwin, IBM 4 * (C) 2002-2003 Nadia Yvette Chambers, IBM
5 * (C) 2004 William Irwin, Oracle 5 * (C) 2004 Nadia Yvette Chambers, Oracle
6 * (C) 2002-2004 Ingo Molnar, Red Hat 6 * (C) 2002-2004 Ingo Molnar, Red Hat
7 * 7 *
8 * pid-structures are backing objects for tasks sharing a given ID to chain 8 * pid-structures are backing objects for tasks sharing a given ID to chain
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_fs.h>
39 40
40#define pid_hashfn(nr, ns) \ 41#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = {
78 .last_pid = 0, 79 .last_pid = 0,
79 .level = 0, 80 .level = 0,
80 .child_reaper = &init_task, 81 .child_reaper = &init_task,
82 .user_ns = &init_user_ns,
83 .proc_inum = PROC_PID_INIT_INO,
81}; 84};
82EXPORT_SYMBOL_GPL(init_pid_ns); 85EXPORT_SYMBOL_GPL(init_pid_ns);
83 86
84int is_container_init(struct task_struct *tsk)
85{
86 int ret = 0;
87 struct pid *pid;
88
89 rcu_read_lock();
90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1;
93 rcu_read_unlock();
94
95 return ret;
96}
97EXPORT_SYMBOL(is_container_init);
98
99/* 87/*
100 * Note: disable interrupts while the pidmap_lock is held as an 88 * Note: disable interrupts while the pidmap_lock is held as an
101 * interrupt might come in and do read_lock(&tasklist_lock). 89 * interrupt might come in and do read_lock(&tasklist_lock).
@@ -269,8 +257,24 @@ void free_pid(struct pid *pid)
269 unsigned long flags; 257 unsigned long flags;
270 258
271 spin_lock_irqsave(&pidmap_lock, flags); 259 spin_lock_irqsave(&pidmap_lock, flags);
272 for (i = 0; i <= pid->level; i++) 260 for (i = 0; i <= pid->level; i++) {
273 hlist_del_rcu(&pid->numbers[i].pid_chain); 261 struct upid *upid = pid->numbers + i;
262 struct pid_namespace *ns = upid->ns;
263 hlist_del_rcu(&upid->pid_chain);
264 switch(--ns->nr_hashed) {
265 case 1:
 266 /* When all that is left in the pid namespace
 267 * is the reaper, wake up the reaper. The reaper
268 * may be sleeping in zap_pid_ns_processes().
269 */
270 wake_up_process(ns->child_reaper);
271 break;
272 case 0:
273 ns->nr_hashed = -1;
274 schedule_work(&ns->proc_work);
275 break;
276 }
277 }
274 spin_unlock_irqrestore(&pidmap_lock, flags); 278 spin_unlock_irqrestore(&pidmap_lock, flags);
275 279
276 for (i = 0; i <= pid->level; i++) 280 for (i = 0; i <= pid->level; i++)
@@ -292,6 +296,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
292 goto out; 296 goto out;
293 297
294 tmp = ns; 298 tmp = ns;
299 pid->level = ns->level;
295 for (i = ns->level; i >= 0; i--) { 300 for (i = ns->level; i >= 0; i--) {
296 nr = alloc_pidmap(tmp); 301 nr = alloc_pidmap(tmp);
297 if (nr < 0) 302 if (nr < 0)
@@ -302,22 +307,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
302 tmp = tmp->parent; 307 tmp = tmp->parent;
303 } 308 }
304 309
310 if (unlikely(is_child_reaper(pid))) {
311 if (pid_ns_prepare_proc(ns))
312 goto out_free;
313 }
314
305 get_pid_ns(ns); 315 get_pid_ns(ns);
306 pid->level = ns->level;
307 atomic_set(&pid->count, 1); 316 atomic_set(&pid->count, 1);
308 for (type = 0; type < PIDTYPE_MAX; ++type) 317 for (type = 0; type < PIDTYPE_MAX; ++type)
309 INIT_HLIST_HEAD(&pid->tasks[type]); 318 INIT_HLIST_HEAD(&pid->tasks[type]);
310 319
311 upid = pid->numbers + ns->level; 320 upid = pid->numbers + ns->level;
312 spin_lock_irq(&pidmap_lock); 321 spin_lock_irq(&pidmap_lock);
313 for ( ; upid >= pid->numbers; --upid) 322 if (ns->nr_hashed < 0)
323 goto out_unlock;
324 for ( ; upid >= pid->numbers; --upid) {
314 hlist_add_head_rcu(&upid->pid_chain, 325 hlist_add_head_rcu(&upid->pid_chain,
315 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 326 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
327 upid->ns->nr_hashed++;
328 }
316 spin_unlock_irq(&pidmap_lock); 329 spin_unlock_irq(&pidmap_lock);
317 330
318out: 331out:
319 return pid; 332 return pid;
320 333
334out_unlock:
335 spin_unlock(&pidmap_lock);
321out_free: 336out_free:
322 while (++i <= ns->level) 337 while (++i <= ns->level)
323 free_pidmap(pid->numbers + i); 338 free_pidmap(pid->numbers + i);
@@ -344,7 +359,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
344 359
345struct pid *find_vpid(int nr) 360struct pid *find_vpid(int nr)
346{ 361{
347 return find_pid_ns(nr, current->nsproxy->pid_ns); 362 return find_pid_ns(nr, task_active_pid_ns(current));
348} 363}
349EXPORT_SYMBOL_GPL(find_vpid); 364EXPORT_SYMBOL_GPL(find_vpid);
350 365
@@ -428,7 +443,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
428 443
429struct task_struct *find_task_by_vpid(pid_t vnr) 444struct task_struct *find_task_by_vpid(pid_t vnr)
430{ 445{
431 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); 446 return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
432} 447}
433 448
434struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 449struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -479,10 +494,11 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
479 } 494 }
480 return nr; 495 return nr;
481} 496}
497EXPORT_SYMBOL_GPL(pid_nr_ns);
482 498
483pid_t pid_vnr(struct pid *pid) 499pid_t pid_vnr(struct pid *pid)
484{ 500{
485 return pid_nr_ns(pid, current->nsproxy->pid_ns); 501 return pid_nr_ns(pid, task_active_pid_ns(current));
486} 502}
487EXPORT_SYMBOL_GPL(pid_vnr); 503EXPORT_SYMBOL_GPL(pid_vnr);
488 504
@@ -493,7 +509,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
493 509
494 rcu_read_lock(); 510 rcu_read_lock();
495 if (!ns) 511 if (!ns)
496 ns = current->nsproxy->pid_ns; 512 ns = task_active_pid_ns(current);
497 if (likely(pid_alive(task))) { 513 if (likely(pid_alive(task))) {
498 if (type != PIDTYPE_PID) 514 if (type != PIDTYPE_PID)
499 task = task->group_leader; 515 task = task->group_leader;
@@ -568,6 +584,7 @@ void __init pidmap_init(void)
568 /* Reserve PID 0. We never call free_pidmap(0) */ 584 /* Reserve PID 0. We never call free_pidmap(0) */
569 set_bit(0, init_pid_ns.pidmap[0].page); 585 set_bit(0, init_pid_ns.pidmap[0].page);
570 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 586 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
587 init_pid_ns.nr_hashed = 1;
571 588
572 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 589 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
573 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 590 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
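The free_pid()/alloc_pid() changes above hang per-namespace bookkeeping off the new nr_hashed count: dropping to 1 means only the reaper is left, dropping to 0 means the namespace is empty and its proc mount can be torn down. A minimal userspace sketch of that counting pattern follows; all names (toy_ns, toy_unhash_one, the stub callbacks) are invented for illustration and this is not the kernel code itself.

#include <stdio.h>

struct toy_ns {
	int nr_hashed;
	const char *name;
};

static void wake_reaper(struct toy_ns *ns)      { printf("%s: wake reaper\n", ns->name); }
static void schedule_cleanup(struct toy_ns *ns) { printf("%s: schedule proc cleanup\n", ns->name); }

static void toy_unhash_one(struct toy_ns *ns)
{
	switch (--ns->nr_hashed) {
	case 1:			/* only the reaper is left in the namespace */
		wake_reaper(ns);
		break;
	case 0:			/* namespace is empty: mark it dead, clean up */
		ns->nr_hashed = -1;
		schedule_cleanup(ns);
		break;
	}
}

int main(void)
{
	struct toy_ns ns = { .nr_hashed = 3, .name = "pidns" };

	toy_unhash_one(&ns);	/* 3 -> 2: nothing special */
	toy_unhash_one(&ns);	/* 2 -> 1: wake the reaper */
	toy_unhash_one(&ns);	/* 1 -> 0: schedule cleanup, mark dead */
	return 0;
}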
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6144bab8fd8e..fdbd0cdf271a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,12 +10,14 @@
 
 #include <linux/pid.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/syscalls.h>
 #include <linux/err.h>
 #include <linux/acct.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/reboot.h>
+#include <linux/export.h>
 
 #define BITS_PER_PAGE		(PAGE_SIZE*8)
 
@@ -70,12 +72,29 @@ err_alloc:
 	return NULL;
 }
 
-static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
+static void proc_cleanup_work(struct work_struct *work)
+{
+	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
+	pid_ns_release_proc(ns);
+}
+
+/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
+#define MAX_PID_NS_LEVEL 32
+
+static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
+	struct pid_namespace *parent_pid_ns)
 {
 	struct pid_namespace *ns;
 	unsigned int level = parent_pid_ns->level + 1;
-	int i, err = -ENOMEM;
+	int i;
+	int err;
 
+	if (level > MAX_PID_NS_LEVEL) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = -ENOMEM;
 	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
 	if (ns == NULL)
 		goto out;
@@ -88,9 +107,15 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
 	if (ns->pid_cachep == NULL)
 		goto out_free_map;
 
+	err = proc_alloc_inum(&ns->proc_inum);
+	if (err)
+		goto out_free_map;
+
 	kref_init(&ns->kref);
 	ns->level = level;
 	ns->parent = get_pid_ns(parent_pid_ns);
+	ns->user_ns = get_user_ns(user_ns);
+	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
 	set_bit(0, ns->pidmap[0].page);
 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -98,14 +123,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
 	for (i = 1; i < PIDMAP_ENTRIES; i++)
 		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
 
-	err = pid_ns_prepare_proc(ns);
-	if (err)
-		goto out_put_parent_pid_ns;
-
 	return ns;
 
-out_put_parent_pid_ns:
-	put_pid_ns(parent_pid_ns);
 out_free_map:
 	kfree(ns->pidmap[0].page);
 out_free:
@@ -118,32 +137,43 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
 {
 	int i;
 
+	proc_free_inum(ns->proc_inum);
 	for (i = 0; i < PIDMAP_ENTRIES; i++)
 		kfree(ns->pidmap[i].page);
+	put_user_ns(ns->user_ns);
 	kmem_cache_free(pid_ns_cachep, ns);
 }
 
-struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
+struct pid_namespace *copy_pid_ns(unsigned long flags,
+	struct user_namespace *user_ns, struct pid_namespace *old_ns)
 {
 	if (!(flags & CLONE_NEWPID))
 		return get_pid_ns(old_ns);
-	if (flags & (CLONE_THREAD|CLONE_PARENT))
+	if (task_active_pid_ns(current) != old_ns)
 		return ERR_PTR(-EINVAL);
-	return create_pid_namespace(old_ns);
+	return create_pid_namespace(user_ns, old_ns);
 }
 
-void free_pid_ns(struct kref *kref)
+static void free_pid_ns(struct kref *kref)
 {
-	struct pid_namespace *ns, *parent;
+	struct pid_namespace *ns;
 
 	ns = container_of(kref, struct pid_namespace, kref);
-
-	parent = ns->parent;
 	destroy_pid_namespace(ns);
+}
 
-	if (parent != NULL)
-		put_pid_ns(parent);
+void put_pid_ns(struct pid_namespace *ns)
+{
+	struct pid_namespace *parent;
+
+	while (ns != &init_pid_ns) {
+		parent = ns->parent;
+		if (!kref_put(&ns->kref, free_pid_ns))
+			break;
+		ns = parent;
+	}
 }
+EXPORT_SYMBOL_GPL(put_pid_ns);
 
 void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 {
@@ -192,22 +222,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 
 	/*
 	 * sys_wait4() above can't reap the TASK_DEAD children.
-	 * Make sure they all go away, see __unhash_process().
+	 * Make sure they all go away, see free_pid().
 	 */
 	for (;;) {
-		bool need_wait = false;
-
-		read_lock(&tasklist_lock);
-		if (!list_empty(&current->children)) {
-			__set_current_state(TASK_UNINTERRUPTIBLE);
-			need_wait = true;
-		}
-		read_unlock(&tasklist_lock);
-
-		if (!need_wait)
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (pid_ns->nr_hashed == 1)
 			break;
 		schedule();
 	}
+	__set_current_state(TASK_RUNNING);
 
 	if (pid_ns->reboot)
 		current->signal->group_exit_code = pid_ns->reboot;
@@ -220,9 +243,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos)
 {
+	struct pid_namespace *pid_ns = task_active_pid_ns(current);
 	struct ctl_table tmp = *table;
 
-	if (write && !capable(CAP_SYS_ADMIN))
+	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
 	/*
@@ -231,7 +255,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	 * it should synchronize its usage with external means.
 	 */
 
-	tmp.data = &current->nsproxy->pid_ns->last_pid;
+	tmp.data = &pid_ns->last_pid;
 	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 }
 
@@ -280,6 +304,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
 	return 0;
 }
 
+static void *pidns_get(struct task_struct *task)
+{
+	struct pid_namespace *ns;
+
+	rcu_read_lock();
+	ns = get_pid_ns(task_active_pid_ns(task));
+	rcu_read_unlock();
+
+	return ns;
+}
+
+static void pidns_put(void *ns)
+{
+	put_pid_ns(ns);
+}
+
+static int pidns_install(struct nsproxy *nsproxy, void *ns)
+{
+	struct pid_namespace *active = task_active_pid_ns(current);
+	struct pid_namespace *ancestor, *new = ns;
+
+	if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
+	    !nsown_capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/*
+	 * Only allow entering the current active pid namespace
+	 * or a child of the current active pid namespace.
+	 *
+	 * This is required for fork to return a usable pid value and
+	 * this maintains the property that processes and their
+	 * children can not escape their current pid namespace.
+	 */
+	if (new->level < active->level)
+		return -EINVAL;
+
+	ancestor = new;
+	while (ancestor->level > active->level)
+		ancestor = ancestor->parent;
+	if (ancestor != active)
+		return -EINVAL;
+
+	put_pid_ns(nsproxy->pid_ns);
+	nsproxy->pid_ns = get_pid_ns(new);
+	return 0;
+}
+
+static unsigned int pidns_inum(void *ns)
+{
+	struct pid_namespace *pid_ns = ns;
+	return pid_ns->proc_inum;
+}
+
+const struct proc_ns_operations pidns_operations = {
+	.name		= "pid",
+	.type		= CLONE_NEWPID,
+	.get		= pidns_get,
+	.put		= pidns_put,
+	.install	= pidns_install,
+	.inum		= pidns_inum,
+};
+
 static __init int pid_namespaces_init(void)
 {
 	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
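put_pid_ns() above now walks toward the initial namespace iteratively, dropping one reference per level, instead of recursing from the kref release callback. A compressed userspace model of that loop follows, with plain integers standing in for krefs; the names (toy_pid_ns, toy_put_pid_ns) are invented and malloc failure handling is omitted for brevity.

#include <stdlib.h>

struct toy_pid_ns {
	int refcount;
	struct toy_pid_ns *parent;	/* NULL-parent chain ends at init_ns */
};

static struct toy_pid_ns init_ns = { .refcount = 1, .parent = NULL };

static void toy_put_pid_ns(struct toy_pid_ns *ns)
{
	while (ns != &init_ns) {
		struct toy_pid_ns *parent = ns->parent;

		if (--ns->refcount > 0)
			break;		/* still referenced: stop walking up */
		free(ns);		/* last reference: destroy, then drop */
		ns = parent;		/*   the reference this level held on */
	}				/*   its parent, one loop iteration up */
}

int main(void)
{
	struct toy_pid_ns *child = malloc(sizeof(*child));
	struct toy_pid_ns *grandchild = malloc(sizeof(*grandchild));

	child->refcount = 1;		child->parent = &init_ns;
	grandchild->refcount = 1;	grandchild->parent = child;

	toy_put_pid_ns(grandchild);	/* frees grandchild, then child */
	return 0;
}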
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 125cb67daa21..a278cad1d5d6 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,6 +9,7 @@
 #include <asm/uaccess.h>
 #include <linux/kernel_stat.h>
 #include <trace/events/timer.h>
+#include <linux/random.h>
 
 /*
  * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -217,30 +218,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
-	struct signal_struct *sig = tsk->signal;
-	struct task_struct *t;
-
-	times->utime = sig->utime;
-	times->stime = sig->stime;
-	times->sum_exec_runtime = sig->sum_sched_runtime;
-
-	rcu_read_lock();
-	/* make sure we can trust tsk->thread_group list */
-	if (!likely(pid_alive(tsk)))
-		goto out;
-
-	t = tsk;
-	do {
-		times->utime += t->utime;
-		times->stime += t->stime;
-		times->sum_exec_runtime += task_sched_runtime(t);
-	} while_each_thread(tsk, t);
-out:
-	rcu_read_unlock();
-}
-
 static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
 {
 	if (b->utime > a->utime)
@@ -494,6 +471,8 @@ static void cleanup_timers(struct list_head *head,
  */
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
+	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+						sizeof(unsigned long long));
 	cleanup_timers(tsk->cpu_timers,
 		       tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
 
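posix_cpu_timers_exit() now feeds the exiting task's sum_exec_runtime into the device-randomness pool, on the theory that exact per-task runtimes are hard for an outsider to predict. The toy below only illustrates the idea of mixing such samples into a pool; the splitmix64-style mixer and all names are invented for the example and have nothing to do with the kernel's actual entropy pool.

#include <stdint.h>
#include <stdio.h>

static uint64_t pool;

static void mix_into_pool(uint64_t v)
{
	/* splitmix64-style diffusion of one 64-bit sample into the pool */
	pool ^= v + 0x9e3779b97f4a7c15ULL;
	pool ^= pool >> 30;
	pool *= 0xbf58476d1ce4e5b9ULL;
	pool ^= pool >> 27;
}

int main(void)
{
	/* pretend these are sum_exec_runtime values sampled at task exit */
	uint64_t runtimes[] = { 123456789ULL, 987654321ULL, 42ULL };
	unsigned int i;

	for (i = 0; i < sizeof(runtimes) / sizeof(runtimes[0]); i++)
		mix_into_pool(runtimes[i]);

	printf("pool = %016llx\n", (unsigned long long)pool);
	return 0;
}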
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a70518c9d82f..5dfdc9ea180b 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -263,6 +263,10 @@ config PM_GENERIC_DOMAINS
 	bool
 	depends on PM
 
+config PM_GENERIC_DOMAINS_SLEEP
+	def_bool y
+	depends on PM_SLEEP && PM_GENERIC_DOMAINS
+
 config PM_GENERIC_DOMAINS_RUNTIME
 	def_bool y
 	depends on PM_RUNTIME && PM_GENERIC_DOMAINS
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f458238109cc..1c16f9167de1 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -59,7 +59,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
 {
 	unsigned long val;
 
-	if (strict_strtoul(buf, 10, &val))
+	if (kstrtoul(buf, 10, &val))
 		return -EINVAL;
 
 	if (val > 1)
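The strict_strtoul() to kstrtoul() conversions in this series keep the stricter contract: reject empty input, trailing garbage and overflow instead of silently truncating. A userspace approximation of that contract, using only the standard C library (kstrtoul itself is kernel-only, so this helper is an illustrative stand-in):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_ulong_strict(const char *s, unsigned int base, unsigned long *res)
{
	char *end;

	errno = 0;
	*res = strtoul(s, &end, base);
	if (errno == ERANGE)
		return -ERANGE;		/* value did not fit */
	if (end == s)
		return -EINVAL;		/* no digits at all */
	if (*end == '\n')
		end++;			/* tolerate one trailing newline, like kstrtoul */
	if (*end != '\0')
		return -EINVAL;		/* trailing garbage */
	return 0;
}

int main(void)
{
	unsigned long val;

	printf("\"1\"  -> %d\n", parse_ulong_strict("1", 10, &val));	/* 0 */
	printf("\"1x\" -> %d\n", parse_ulong_strict("1x", 10, &val));	/* -EINVAL */
	return 0;
}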
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index d52359374e85..68197a4e8fc9 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
 	.enable_mask = SYSRQ_ENABLE_BOOT,
 };
 
-static int pm_sysrq_init(void)
+static int __init pm_sysrq_init(void)
 {
 	register_sysrq_key('o', &sysrq_poweroff_op);
 	return 0;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 19db29f67558..d5a258b60c6f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only)
 		if (p == current || !freeze_task(p))
 			continue;
 
-		/*
-		 * Now that we've done set_freeze_flag, don't
-		 * perturb a task in TASK_STOPPED or TASK_TRACED.
-		 * It is "frozen enough". If the task does wake
-		 * up, it will immediately call try_to_freeze.
-		 *
-		 * Because freeze_task() goes through p's scheduler lock, it's
-		 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
-		 * transition can't race with task state testing here.
-		 */
-		if (!task_is_stopped_or_traced(p) &&
-		    !freezer_should_skip(p))
+		if (!freezer_should_skip(p))
 			todo++;
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
@@ -79,7 +68,7 @@ static int try_to_freeze_tasks(bool user_only)
 
 		/*
 		 * We need to retry, but first give the freezing tasks some
-		 * time to enter the regrigerator.
+		 * time to enter the refrigerator.
 		 */
 		msleep(10);
 	}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 6a031e684026..9322ff7eaad6 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -139,6 +139,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
 	default:
 		/* runtime check for not using enum */
 		BUG();
+		return PM_QOS_DEFAULT_VALUE;
 	}
 }
 
@@ -212,6 +213,69 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
 }
 
 /**
+ * pm_qos_flags_remove_req - Remove device PM QoS flags request.
+ * @pqf: Device PM QoS flags set to remove the request from.
+ * @req: Request to remove from the set.
+ */
+static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf,
+				    struct pm_qos_flags_request *req)
+{
+	s32 val = 0;
+
+	list_del(&req->node);
+	list_for_each_entry(req, &pqf->list, node)
+		val |= req->flags;
+
+	pqf->effective_flags = val;
+}
+
+/**
+ * pm_qos_update_flags - Update a set of PM QoS flags.
+ * @pqf: Set of flags to update.
+ * @req: Request to add to the set, to modify, or to remove from the set.
+ * @action: Action to take on the set.
+ * @val: Value of the request to add or modify.
+ *
+ * Update the given set of PM QoS flags and call notifiers if the aggregate
+ * value has changed. Returns 1 if the aggregate constraint value has changed,
+ * 0 otherwise.
+ */
+bool pm_qos_update_flags(struct pm_qos_flags *pqf,
+			 struct pm_qos_flags_request *req,
+			 enum pm_qos_req_action action, s32 val)
+{
+	unsigned long irqflags;
+	s32 prev_value, curr_value;
+
+	spin_lock_irqsave(&pm_qos_lock, irqflags);
+
+	prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
+
+	switch (action) {
+	case PM_QOS_REMOVE_REQ:
+		pm_qos_flags_remove_req(pqf, req);
+		break;
+	case PM_QOS_UPDATE_REQ:
+		pm_qos_flags_remove_req(pqf, req);
+	case PM_QOS_ADD_REQ:
+		req->flags = val;
+		INIT_LIST_HEAD(&req->node);
+		list_add_tail(&req->node, &pqf->list);
+		pqf->effective_flags |= val;
+		break;
+	default:
+		/* no action */
+		;
+	}
+
+	curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
+
+	spin_unlock_irqrestore(&pm_qos_lock, irqflags);
+
+	return prev_value != curr_value;
+}
+
+/**
  * pm_qos_request - returns current system wide qos expectation
  * @pm_qos_class: identification of which qos value is requested
  *
@@ -499,7 +563,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
 	} else {
 		ascii_value[count] = '\0';
 	}
-	ret = strict_strtoul(ascii_value, 16, &ulval);
+	ret = kstrtoul(ascii_value, 16, &ulval);
 	if (ret) {
 		pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
 		return -EINVAL;
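pm_qos_update_flags() above aggregates per-request flags by OR-ing everything still on the list, so additions can just OR in the new bits while removals need a rescan of the survivors. A self-contained sketch of that aggregation rule, using a plain singly linked list and invented names rather than the kernel's list API:

#include <stdint.h>
#include <stdio.h>

struct flags_req {
	uint32_t flags;
	struct flags_req *next;
};

struct flags_set {
	struct flags_req *head;
	uint32_t effective;
};

static void flags_add(struct flags_set *set, struct flags_req *req, uint32_t val)
{
	req->flags = val;
	req->next = set->head;
	set->head = req;
	set->effective |= val;		/* adding a request can only set bits */
}

static void flags_remove(struct flags_set *set, struct flags_req *req)
{
	struct flags_req **pp = &set->head;
	uint32_t val = 0;

	while (*pp) {
		if (*pp == req) {
			*pp = req->next;	/* unlink the departing request */
			continue;
		}
		val |= (*pp)->flags;		/* recompute from what is left */
		pp = &(*pp)->next;
	}
	set->effective = val;
}

int main(void)
{
	struct flags_set set = { 0 };
	struct flags_req a, b;

	flags_add(&set, &a, 0x1);
	flags_add(&set, &b, 0x6);
	flags_remove(&set, &b);
	printf("effective = 0x%x\n", set.effective);	/* 0x1 */
	return 0;
}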
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3c9d764eb0d8..7c33ed200410 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -126,7 +126,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
 
 	/* Figure out where to put the new node */
 	while (*new) {
-		ext = container_of(*new, struct swsusp_extent, node);
+		ext = rb_entry(*new, struct swsusp_extent, node);
 		parent = *new;
 		if (swap_offset < ext->start) {
 			/* Try to merge */
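The swap.c hunk swaps container_of() for the equivalent rb_entry() helper; both recover the enclosing structure from a pointer to an embedded member via the same offsetof arithmetic. A standalone demonstration of that arithmetic (the struct names here are stand-ins, not the kernel types):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node {				/* stand-in for struct rb_node */
	struct node *left, *right;
};

struct extent {				/* stand-in for struct swsusp_extent */
	unsigned long start, end;
	struct node node;		/* embedded tree linkage */
};

int main(void)
{
	struct extent e = { .start = 10, .end = 20 };
	struct node *n = &e.node;	/* what a tree walk would hand back */
	struct extent *back = container_of(n, struct extent, node);

	printf("start=%lu end=%lu\n", back->start, back->end);
	return 0;
}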
diff --git a/kernel/printk.c b/kernel/printk.c
index 66a2ea37b576..19c0d7bcf24a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -87,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem);
 struct console *console_drivers;
 EXPORT_SYMBOL_GPL(console_drivers);
 
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map console_lock_dep_map = {
+	.name = "console_lock"
+};
+#endif
+
 /*
  * This is used for debugging the mess that is the VT code by
  * keeping track if we have the console semaphore held. It's
@@ -741,6 +747,21 @@ void __init setup_log_buf(int early)
 		free, (free * 100) / __LOG_BUF_LEN);
 }
 
+static bool __read_mostly ignore_loglevel;
+
+static int __init ignore_loglevel_setup(char *str)
+{
+	ignore_loglevel = 1;
+	printk(KERN_INFO "debug: ignoring loglevel setting.\n");
+
+	return 0;
+}
+
+early_param("ignore_loglevel", ignore_loglevel_setup);
+module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
+	"print all kernel messages to the console.");
+
 #ifdef CONFIG_BOOT_PRINTK_DELAY
 
 static int boot_delay; /* msecs delay after each printk during bootup */
@@ -764,13 +785,15 @@ static int __init boot_delay_setup(char *str)
 }
 __setup("boot_delay=", boot_delay_setup);
 
-static void boot_delay_msec(void)
+static void boot_delay_msec(int level)
 {
 	unsigned long long k;
 	unsigned long timeout;
 
-	if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
+	if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
+		|| (level >= console_loglevel && !ignore_loglevel)) {
 		return;
+	}
 
 	k = (unsigned long long)loops_per_msec * boot_delay;
 
@@ -789,7 +812,7 @@ static void boot_delay_msec(void)
 	}
 }
 #else
-static inline void boot_delay_msec(void)
+static inline void boot_delay_msec(int level)
 {
 }
 #endif
@@ -1232,21 +1255,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
 	return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
 }
 
-static bool __read_mostly ignore_loglevel;
-
-static int __init ignore_loglevel_setup(char *str)
-{
-	ignore_loglevel = 1;
-	printk(KERN_INFO "debug: ignoring loglevel setting.\n");
-
-	return 0;
-}
-
-early_param("ignore_loglevel", ignore_loglevel_setup);
-module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
-	"print all kernel messages to the console.");
-
 /*
  * Call the console drivers, asking them to write out
  * log_buf[start] to log_buf[end - 1].
@@ -1492,7 +1500,7 @@ asmlinkage int vprintk_emit(int facility, int level,
 	int this_cpu;
 	int printed_len = 0;
 
-	boot_delay_msec();
+	boot_delay_msec(level);
 	printk_delay();
 
 	/* This stops the holder of console_sem just where we want him */
@@ -1890,7 +1898,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_DEAD:
-	case CPU_DYING:
 	case CPU_DOWN_FAILED:
 	case CPU_UP_CANCELED:
 		console_lock();
@@ -1909,12 +1916,14 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
  */
 void console_lock(void)
 {
-	BUG_ON(in_interrupt());
+	might_sleep();
+
 	down(&console_sem);
 	if (console_suspended)
 		return;
 	console_locked = 1;
 	console_may_schedule = 1;
+	mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
 }
 EXPORT_SYMBOL(console_lock);
 
@@ -1936,6 +1945,7 @@ int console_trylock(void)
 	}
 	console_locked = 1;
 	console_may_schedule = 0;
+	mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
 	return 1;
 }
 EXPORT_SYMBOL(console_trylock);
@@ -2096,6 +2106,7 @@ skip:
 		local_irq_restore(flags);
 	}
 	console_locked = 0;
+	mutex_release(&console_lock_dep_map, 1, _RET_IP_);
 
 	/* Release the exclusive_console once it is used */
 	if (unlikely(exclusive_console))
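boot_delay_msec() now takes the message level so the artificial per-message boot delay is only paid for messages that will actually reach the console. A compressed userspace model of that gate follows; console_loglevel, ignore_loglevel and the delay itself are simulated here, and the names mirror the kernel's only for readability.

#include <stdbool.h>
#include <stdio.h>

static int console_loglevel = 4;	/* print messages with level < 4 */
static bool ignore_loglevel;
static int boot_delay_ms = 100;

static void boot_delay_msec(int level)
{
	if (boot_delay_ms == 0)
		return;
	if (level >= console_loglevel && !ignore_loglevel)
		return;			/* message is filtered: don't stall boot */
	printf("delaying %d ms for level-%d message\n", boot_delay_ms, level);
}

int main(void)
{
	boot_delay_msec(7);	/* KERN_DEBUG: filtered, no delay */
	boot_delay_msec(1);	/* KERN_ALERT: printed, delay applies */
	return 0;
}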
diff --git a/kernel/profile.c b/kernel/profile.c
index 76b8e77773ee..1f391819c42f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -8,9 +8,10 @@
  *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
  *	Red Hat, July 2004
  *  Consolidation of architecture support code for profiling,
- *	William Irwin, Oracle, July 2004
+ *	Nadia Yvette Chambers, Oracle, July 2004
  *  Amortized hit count accounting via per-cpu open-addressed hashtables
- *	to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
+ *	to resolve timer interrupt livelocks, Nadia Yvette Chambers,
+ *	Oracle, 2004
  */
 
 #include <linux/export.h>
@@ -256,7 +257,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook);
  * pagetable hash functions, but uses a full hashtable full of finite
  * collision chains, not just pairs of them.
  *
- * -- wli
+ * -- nyc
 */
static void __profile_flip_buffers(void *unused)
{
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a232bb59d93f..1599157336a6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -180,7 +180,8 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
 	return has_ns_capability(current, ns, CAP_SYS_PTRACE);
 }
 
-int __ptrace_may_access(struct task_struct *task, unsigned int mode)
+/* Returns 0 on success, -errno on denial. */
+static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
 	const struct cred *cred = current_cred(), *tcred;
 
@@ -214,8 +215,12 @@ ok:
 	smp_rmb();
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
-	if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode))
+	rcu_read_lock();
+	if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+		rcu_read_unlock();
 		return -EPERM;
+	}
+	rcu_read_unlock();
 
 	return security_ptrace_access_check(task, mode);
 }
@@ -279,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request,
 
 	if (seize)
 		flags |= PT_SEIZED;
-	if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
+	rcu_read_lock();
+	if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
 		flags |= PT_PTRACE_CAP;
+	rcu_read_unlock();
 	task->ptrace = flags;
 
 	__ptrace_link(task, current);
@@ -456,6 +463,9 @@ void exit_ptrace(struct task_struct *tracer)
 		return;
 
 	list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
+		if (unlikely(p->ptrace & PT_EXITKILL))
+			send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
+
 		if (__ptrace_detach(tracer, p))
 			list_add(&p->ptrace_entry, &ptrace_dead);
 	}
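The PT_EXITKILL handling added to exit_ptrace() is the kernel side of the PTRACE_O_EXITKILL option: if the tracer dies, its tracees get SIGKILL instead of silently running un-traced. A minimal Linux tracer sketch that requests this behaviour is shown below; it is an illustration with trimmed error handling, and the fallback #define only covers headers that predate the option (available since Linux 3.8).

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PTRACE_O_EXITKILL
#define PTRACE_O_EXITKILL 0x00100000	/* fallback for older headers */
#endif

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);			/* wait for the tracer to attach options */
		execlp("sleep", "sleep", "60", (char *)NULL);
		_exit(1);
	}

	waitpid(child, NULL, 0);		/* child stopped itself */
	ptrace(PTRACE_SETOPTIONS, child, NULL, (void *)(long)PTRACE_O_EXITKILL);
	ptrace(PTRACE_CONT, child, NULL, NULL);

	printf("tracer exiting; tracee %d will be SIGKILLed\n", child);
	return 0;				/* exit_ptrace() runs for this task now */
}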
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 8ba99cdc6515..20dfba576c2b 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
 	}
 }
 
+extern int rcu_expedited;
+
 #endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4e6a61b15e86..a2cf76177b44 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,12 +45,16 @@
 #include <linux/mutex.h>
 #include <linux/export.h>
 #include <linux/hardirq.h>
+#include <linux/delay.h>
+#include <linux/module.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/rcu.h>
 
 #include "rcu.h"
 
+module_param(rcu_expedited, int, 0);
+
 #ifdef CONFIG_PREEMPT_RCU
 
 /*
@@ -81,6 +85,9 @@ void __rcu_read_unlock(void)
 	} else {
 		barrier();  /* critical section before exit code. */
 		t->rcu_read_lock_nesting = INT_MIN;
+#ifdef CONFIG_PROVE_RCU_DELAY
+		udelay(10); /* Make preemption more probable. */
+#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
 		barrier();  /* assign before ->rcu_read_unlock_special load */
 		if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
 			rcu_read_unlock_special(t);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 547b1fe5b052..e7dce58f9c2a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -56,25 +56,28 @@ static void __call_rcu(struct rcu_head *head,
 static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
 
 /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
-static void rcu_idle_enter_common(long long oldval)
+static void rcu_idle_enter_common(long long newval)
 {
-	if (rcu_dynticks_nesting) {
+	if (newval) {
 		RCU_TRACE(trace_rcu_dyntick("--=",
-					    oldval, rcu_dynticks_nesting));
+					    rcu_dynticks_nesting, newval));
+		rcu_dynticks_nesting = newval;
 		return;
 	}
-	RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
+	RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval));
 	if (!is_idle_task(current)) {
 		struct task_struct *idle = idle_task(smp_processor_id());
 
 		RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
-			    oldval, rcu_dynticks_nesting));
+			    rcu_dynticks_nesting, newval));
 		ftrace_dump(DUMP_ALL);
 		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
 			  current->pid, current->comm,
 			  idle->pid, idle->comm); /* must be idle task! */
 	}
 	rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
+	barrier();
+	rcu_dynticks_nesting = newval;
 }
 
 /*
@@ -84,17 +87,16 @@ static void rcu_idle_enter_common(long long oldval)
 void rcu_idle_enter(void)
 {
 	unsigned long flags;
-	long long oldval;
+	long long newval;
 
 	local_irq_save(flags);
-	oldval = rcu_dynticks_nesting;
 	WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
 	if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
 	    DYNTICK_TASK_NEST_VALUE)
-		rcu_dynticks_nesting = 0;
+		newval = 0;
 	else
-		rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
-	rcu_idle_enter_common(oldval);
+		newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
+	rcu_idle_enter_common(newval);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -105,15 +107,15 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
 void rcu_irq_exit(void)
 {
 	unsigned long flags;
-	long long oldval;
+	long long newval;
 
 	local_irq_save(flags);
-	oldval = rcu_dynticks_nesting;
-	rcu_dynticks_nesting--;
-	WARN_ON_ONCE(rcu_dynticks_nesting < 0);
-	rcu_idle_enter_common(oldval);
+	newval = rcu_dynticks_nesting - 1;
+	WARN_ON_ONCE(newval < 0);
+	rcu_idle_enter_common(newval);
 	local_irq_restore(flags);
 }
+EXPORT_SYMBOL_GPL(rcu_irq_exit);
 
 /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
 static void rcu_idle_exit_common(long long oldval)
@@ -171,6 +173,7 @@ void rcu_irq_enter(void)
 	rcu_idle_exit_common(oldval);
 	local_irq_restore(flags);
 }
+EXPORT_SYMBOL_GPL(rcu_irq_enter);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
@@ -192,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
 */
int rcu_is_cpu_rrupt_from_idle(void)
{
-	return rcu_dynticks_nesting <= 0;
+	return rcu_dynticks_nesting <= 1;
}

/*
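The rcutiny rework above passes the new nesting value into rcu_idle_enter_common() and only publishes it after the quiescent-state work, with a barrier in between. A schematic of that compute-then-publish ordering follows; it is a sketch only, and the barrier() here is the usual GCC inline-asm idiom rather than the kernel macro.

#include <stdio.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

static long long dynticks_nesting = 1;

static void trace_transition(const char *what, long long oldval, long long newval)
{
	printf("%s: %lld -> %lld\n", what, oldval, newval);
}

static void idle_enter_common(long long newval)
{
	if (newval) {				/* still nested: just count down */
		trace_transition("--=", dynticks_nesting, newval);
		dynticks_nesting = newval;
		return;
	}
	trace_transition("Start", dynticks_nesting, newval);
	/* ... report the quiescent state while the old value is still live ... */
	barrier();				/* order the QS work before the store */
	dynticks_nesting = newval;
}

int main(void)
{
	idle_enter_common(dynticks_nesting - 1);	/* 1 -> 0: enter idle */
	return 0;
}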
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 918fd1e8509c..f85016a2309b 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -278,7 +278,7 @@ static int rcu_boost(void)
 	    rcu_preempt_ctrlblk.exp_tasks == NULL)
 		return 0;  /* Nothing to boost. */
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 
 	/*
 	 * Recheck with irqs disabled: all tasks in need of boosting
@@ -287,7 +287,7 @@ static int rcu_boost(void)
 	 */
 	if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
 	    rcu_preempt_ctrlblk.exp_tasks == NULL) {
-		raw_local_irq_restore(flags);
+		local_irq_restore(flags);
 		return 0;
 	}
 
@@ -317,7 +317,7 @@ static int rcu_boost(void)
 	t = container_of(tb, struct task_struct, rcu_node_entry);
 	rt_mutex_init_proxy_locked(&mtx, t);
 	t->rcu_boost_mutex = &mtx;
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 	rt_mutex_lock(&mtx);
 	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
 
@@ -706,7 +706,10 @@ void synchronize_rcu(void)
 		return;
 
 	/* Once we get past the fastpath checks, same code as rcu_barrier(). */
-	rcu_barrier();
+	if (rcu_expedited)
+		synchronize_rcu_expedited();
+	else
+		rcu_barrier();
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
@@ -991,9 +994,9 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
 {
 	unsigned long flags;
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	rcp->qlen -= n;
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 }
 
 /*
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 25b15033c61f..31dea01c85fd 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -53,10 +53,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@fre
53 53
54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
55static int nfakewriters = 4; /* # fake writer threads */ 55static int nfakewriters = 4; /* # fake writer threads */
56static int stat_interval; /* Interval between stats, in seconds. */ 56static int stat_interval = 60; /* Interval between stats, in seconds. */
57 /* Defaults to "only at end of test". */ 57 /* Zero means "only at end of test". */
58static bool verbose; /* Print more debug info. */ 58static bool verbose; /* Print more debug info. */
59static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 59static bool test_no_idle_hz = true;
60 /* Test RCU support for tickless idle CPUs. */
60static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
61static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
62static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
@@ -119,11 +120,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
119 120
120#define TORTURE_FLAG "-torture:" 121#define TORTURE_FLAG "-torture:"
121#define PRINTK_STRING(s) \ 122#define PRINTK_STRING(s) \
122 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) 123 do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
123#define VERBOSE_PRINTK_STRING(s) \ 124#define VERBOSE_PRINTK_STRING(s) \
124 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) 125 do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
125#define VERBOSE_PRINTK_ERRSTRING(s) \ 126#define VERBOSE_PRINTK_ERRSTRING(s) \
126 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) 127 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
127 128
128static char printk_buf[4096]; 129static char printk_buf[4096];
129 130
@@ -176,8 +177,14 @@ static long n_rcu_torture_boosts;
176static long n_rcu_torture_timers; 177static long n_rcu_torture_timers;
177static long n_offline_attempts; 178static long n_offline_attempts;
178static long n_offline_successes; 179static long n_offline_successes;
180static unsigned long sum_offline;
181static int min_offline = -1;
182static int max_offline;
179static long n_online_attempts; 183static long n_online_attempts;
180static long n_online_successes; 184static long n_online_successes;
185static unsigned long sum_online;
186static int min_online = -1;
187static int max_online;
181static long n_barrier_attempts; 188static long n_barrier_attempts;
182static long n_barrier_successes; 189static long n_barrier_successes;
183static struct list_head rcu_torture_removed; 190static struct list_head rcu_torture_removed;
@@ -235,7 +242,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
235 if (fullstop == FULLSTOP_DONTSTOP) 242 if (fullstop == FULLSTOP_DONTSTOP)
236 fullstop = FULLSTOP_SHUTDOWN; 243 fullstop = FULLSTOP_SHUTDOWN;
237 else 244 else
238 printk(KERN_WARNING /* but going down anyway, so... */ 245 pr_warn(/* but going down anyway, so... */
239 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 246 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
240 mutex_unlock(&fullstop_mutex); 247 mutex_unlock(&fullstop_mutex);
241 return NOTIFY_DONE; 248 return NOTIFY_DONE;
@@ -248,7 +255,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
248static void rcutorture_shutdown_absorb(char *title) 255static void rcutorture_shutdown_absorb(char *title)
249{ 256{
250 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 257 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
251 printk(KERN_NOTICE 258 pr_notice(
252 "rcutorture thread %s parking due to system shutdown\n", 259 "rcutorture thread %s parking due to system shutdown\n",
253 title); 260 title);
254 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); 261 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -332,7 +339,6 @@ rcu_stutter_wait(char *title)
332 339
333struct rcu_torture_ops { 340struct rcu_torture_ops {
334 void (*init)(void); 341 void (*init)(void);
335 void (*cleanup)(void);
336 int (*readlock)(void); 342 int (*readlock)(void);
337 void (*read_delay)(struct rcu_random_state *rrsp); 343 void (*read_delay)(struct rcu_random_state *rrsp);
338 void (*readunlock)(int idx); 344 void (*readunlock)(int idx);
@@ -424,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
424 430
425static struct rcu_torture_ops rcu_ops = { 431static struct rcu_torture_ops rcu_ops = {
426 .init = NULL, 432 .init = NULL,
427 .cleanup = NULL,
428 .readlock = rcu_torture_read_lock, 433 .readlock = rcu_torture_read_lock,
429 .read_delay = rcu_read_delay, 434 .read_delay = rcu_read_delay,
430 .readunlock = rcu_torture_read_unlock, 435 .readunlock = rcu_torture_read_unlock,
@@ -468,7 +473,6 @@ static void rcu_sync_torture_init(void)
468 473
469static struct rcu_torture_ops rcu_sync_ops = { 474static struct rcu_torture_ops rcu_sync_ops = {
470 .init = rcu_sync_torture_init, 475 .init = rcu_sync_torture_init,
471 .cleanup = NULL,
472 .readlock = rcu_torture_read_lock, 476 .readlock = rcu_torture_read_lock,
473 .read_delay = rcu_read_delay, 477 .read_delay = rcu_read_delay,
474 .readunlock = rcu_torture_read_unlock, 478 .readunlock = rcu_torture_read_unlock,
@@ -486,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = {
486 490
487static struct rcu_torture_ops rcu_expedited_ops = { 491static struct rcu_torture_ops rcu_expedited_ops = {
488 .init = rcu_sync_torture_init, 492 .init = rcu_sync_torture_init,
489 .cleanup = NULL,
490 .readlock = rcu_torture_read_lock, 493 .readlock = rcu_torture_read_lock,
491 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 494 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
492 .readunlock = rcu_torture_read_unlock, 495 .readunlock = rcu_torture_read_unlock,
@@ -529,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
529 532
530static struct rcu_torture_ops rcu_bh_ops = { 533static struct rcu_torture_ops rcu_bh_ops = {
531 .init = NULL, 534 .init = NULL,
532 .cleanup = NULL,
533 .readlock = rcu_bh_torture_read_lock, 535 .readlock = rcu_bh_torture_read_lock,
534 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 536 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
535 .readunlock = rcu_bh_torture_read_unlock, 537 .readunlock = rcu_bh_torture_read_unlock,
@@ -546,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
546 548
547static struct rcu_torture_ops rcu_bh_sync_ops = { 549static struct rcu_torture_ops rcu_bh_sync_ops = {
548 .init = rcu_sync_torture_init, 550 .init = rcu_sync_torture_init,
549 .cleanup = NULL,
550 .readlock = rcu_bh_torture_read_lock, 551 .readlock = rcu_bh_torture_read_lock,
551 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 552 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
552 .readunlock = rcu_bh_torture_read_unlock, 553 .readunlock = rcu_bh_torture_read_unlock,
@@ -563,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
563 564
564static struct rcu_torture_ops rcu_bh_expedited_ops = { 565static struct rcu_torture_ops rcu_bh_expedited_ops = {
565 .init = rcu_sync_torture_init, 566 .init = rcu_sync_torture_init,
566 .cleanup = NULL,
567 .readlock = rcu_bh_torture_read_lock, 567 .readlock = rcu_bh_torture_read_lock,
568 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 568 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
569 .readunlock = rcu_bh_torture_read_unlock, 569 .readunlock = rcu_bh_torture_read_unlock,
@@ -582,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
582 * Definitions for srcu torture testing. 582 * Definitions for srcu torture testing.
583 */ 583 */
584 584
585static struct srcu_struct srcu_ctl; 585DEFINE_STATIC_SRCU(srcu_ctl);
586
587static void srcu_torture_init(void)
588{
589 init_srcu_struct(&srcu_ctl);
590 rcu_sync_torture_init();
591}
592
593static void srcu_torture_cleanup(void)
594{
595 synchronize_srcu(&srcu_ctl);
596 cleanup_srcu_struct(&srcu_ctl);
597}
598 586
599static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) 587static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
600{ 588{
@@ -665,8 +653,7 @@ static int srcu_torture_stats(char *page)
665} 653}
666 654
667static struct rcu_torture_ops srcu_ops = { 655static struct rcu_torture_ops srcu_ops = {
668 .init = srcu_torture_init, 656 .init = rcu_sync_torture_init,
669 .cleanup = srcu_torture_cleanup,
670 .readlock = srcu_torture_read_lock, 657 .readlock = srcu_torture_read_lock,
671 .read_delay = srcu_read_delay, 658 .read_delay = srcu_read_delay,
672 .readunlock = srcu_torture_read_unlock, 659 .readunlock = srcu_torture_read_unlock,
@@ -680,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = {
680}; 667};
681 668
682static struct rcu_torture_ops srcu_sync_ops = { 669static struct rcu_torture_ops srcu_sync_ops = {
683 .init = srcu_torture_init, 670 .init = rcu_sync_torture_init,
684 .cleanup = srcu_torture_cleanup,
685 .readlock = srcu_torture_read_lock, 671 .readlock = srcu_torture_read_lock,
686 .read_delay = srcu_read_delay, 672 .read_delay = srcu_read_delay,
687 .readunlock = srcu_torture_read_unlock, 673 .readunlock = srcu_torture_read_unlock,
@@ -705,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
705} 691}
706 692
707static struct rcu_torture_ops srcu_raw_ops = { 693static struct rcu_torture_ops srcu_raw_ops = {
708 .init = srcu_torture_init, 694 .init = rcu_sync_torture_init,
709 .cleanup = srcu_torture_cleanup,
710 .readlock = srcu_torture_read_lock_raw, 695 .readlock = srcu_torture_read_lock_raw,
711 .read_delay = srcu_read_delay, 696 .read_delay = srcu_read_delay,
712 .readunlock = srcu_torture_read_unlock_raw, 697 .readunlock = srcu_torture_read_unlock_raw,
@@ -720,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = {
720}; 705};
721 706
722static struct rcu_torture_ops srcu_raw_sync_ops = { 707static struct rcu_torture_ops srcu_raw_sync_ops = {
723 .init = srcu_torture_init, 708 .init = rcu_sync_torture_init,
724 .cleanup = srcu_torture_cleanup,
725 .readlock = srcu_torture_read_lock_raw, 709 .readlock = srcu_torture_read_lock_raw,
726 .read_delay = srcu_read_delay, 710 .read_delay = srcu_read_delay,
727 .readunlock = srcu_torture_read_unlock_raw, 711 .readunlock = srcu_torture_read_unlock_raw,
@@ -740,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void)
740} 724}
741 725
742static struct rcu_torture_ops srcu_expedited_ops = { 726static struct rcu_torture_ops srcu_expedited_ops = {
743 .init = srcu_torture_init, 727 .init = rcu_sync_torture_init,
744 .cleanup = srcu_torture_cleanup,
745 .readlock = srcu_torture_read_lock, 728 .readlock = srcu_torture_read_lock,
746 .read_delay = srcu_read_delay, 729 .read_delay = srcu_read_delay,
747 .readunlock = srcu_torture_read_unlock, 730 .readunlock = srcu_torture_read_unlock,
@@ -776,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
776 759
777static struct rcu_torture_ops sched_ops = { 760static struct rcu_torture_ops sched_ops = {
778 .init = rcu_sync_torture_init, 761 .init = rcu_sync_torture_init,
779 .cleanup = NULL,
780 .readlock = sched_torture_read_lock, 762 .readlock = sched_torture_read_lock,
781 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 763 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
782 .readunlock = sched_torture_read_unlock, 764 .readunlock = sched_torture_read_unlock,
@@ -792,7 +774,6 @@ static struct rcu_torture_ops sched_ops = {
792 774
793static struct rcu_torture_ops sched_sync_ops = { 775static struct rcu_torture_ops sched_sync_ops = {
794 .init = rcu_sync_torture_init, 776 .init = rcu_sync_torture_init,
795 .cleanup = NULL,
796 .readlock = sched_torture_read_lock, 777 .readlock = sched_torture_read_lock,
797 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 778 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
798 .readunlock = sched_torture_read_unlock, 779 .readunlock = sched_torture_read_unlock,
@@ -807,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = {
807 788
808static struct rcu_torture_ops sched_expedited_ops = { 789static struct rcu_torture_ops sched_expedited_ops = {
809 .init = rcu_sync_torture_init, 790 .init = rcu_sync_torture_init,
810 .cleanup = NULL,
811 .readlock = sched_torture_read_lock, 791 .readlock = sched_torture_read_lock,
812 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 792 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
813 .readunlock = sched_torture_read_unlock, 793 .readunlock = sched_torture_read_unlock,
@@ -1214,11 +1194,13 @@ rcu_torture_printk(char *page)
1214 n_rcu_torture_boost_failure, 1194 n_rcu_torture_boost_failure,
1215 n_rcu_torture_boosts, 1195 n_rcu_torture_boosts,
1216 n_rcu_torture_timers); 1196 n_rcu_torture_timers);
1217 cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", 1197 cnt += sprintf(&page[cnt],
1218 n_online_successes, 1198 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
1219 n_online_attempts, 1199 n_online_successes, n_online_attempts,
1220 n_offline_successes, 1200 n_offline_successes, n_offline_attempts,
1221 n_offline_attempts); 1201 min_online, max_online,
1202 min_offline, max_offline,
1203 sum_online, sum_offline, HZ);
1222 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", 1204 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
1223 n_barrier_successes, 1205 n_barrier_successes,
1224 n_barrier_attempts, 1206 n_barrier_attempts,
@@ -1267,7 +1249,7 @@ rcu_torture_stats_print(void)
1267 int cnt; 1249 int cnt;
1268 1250
1269 cnt = rcu_torture_printk(printk_buf); 1251 cnt = rcu_torture_printk(printk_buf);
1270 printk(KERN_ALERT "%s", printk_buf); 1252 pr_alert("%s", printk_buf);
1271} 1253}
1272 1254
1273/* 1255/*
@@ -1380,20 +1362,24 @@ rcu_torture_stutter(void *arg)
1380static inline void 1362static inline void
1381rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1363rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1382{ 1364{
1383 printk(KERN_ALERT "%s" TORTURE_FLAG 1365 pr_alert("%s" TORTURE_FLAG
1384 "--- %s: nreaders=%d nfakewriters=%d " 1366 "--- %s: nreaders=%d nfakewriters=%d "
1385 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1367 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1386 "shuffle_interval=%d stutter=%d irqreader=%d " 1368 "shuffle_interval=%d stutter=%d irqreader=%d "
1387 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1369 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1388 "test_boost=%d/%d test_boost_interval=%d " 1370 "test_boost=%d/%d test_boost_interval=%d "
1389 "test_boost_duration=%d shutdown_secs=%d " 1371 "test_boost_duration=%d shutdown_secs=%d "
1390 "onoff_interval=%d onoff_holdoff=%d\n", 1372 "stall_cpu=%d stall_cpu_holdoff=%d "
1391 torture_type, tag, nrealreaders, nfakewriters, 1373 "n_barrier_cbs=%d "
1392 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1374 "onoff_interval=%d onoff_holdoff=%d\n",
1393 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1375 torture_type, tag, nrealreaders, nfakewriters,
1394 test_boost, cur_ops->can_boost, 1376 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1395 test_boost_interval, test_boost_duration, shutdown_secs, 1377 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1396 onoff_interval, onoff_holdoff); 1378 test_boost, cur_ops->can_boost,
1379 test_boost_interval, test_boost_duration, shutdown_secs,
1380 stall_cpu, stall_cpu_holdoff,
1381 n_barrier_cbs,
1382 onoff_interval, onoff_holdoff);
1397} 1383}
1398 1384
1399static struct notifier_block rcutorture_shutdown_nb = { 1385static struct notifier_block rcutorture_shutdown_nb = {
@@ -1460,9 +1446,9 @@ rcu_torture_shutdown(void *arg)
1460 !kthread_should_stop()) { 1446 !kthread_should_stop()) {
1461 delta = shutdown_time - jiffies_snap; 1447 delta = shutdown_time - jiffies_snap;
1462 if (verbose) 1448 if (verbose)
1463 printk(KERN_ALERT "%s" TORTURE_FLAG 1449 pr_alert("%s" TORTURE_FLAG
1464 "rcu_torture_shutdown task: %lu jiffies remaining\n", 1450 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1465 torture_type, delta); 1451 torture_type, delta);
1466 schedule_timeout_interruptible(delta); 1452 schedule_timeout_interruptible(delta);
1467 jiffies_snap = ACCESS_ONCE(jiffies); 1453 jiffies_snap = ACCESS_ONCE(jiffies);
1468 } 1454 }
@@ -1490,8 +1476,11 @@ static int __cpuinit
1490rcu_torture_onoff(void *arg) 1476rcu_torture_onoff(void *arg)
1491{ 1477{
1492 int cpu; 1478 int cpu;
1479 unsigned long delta;
1493 int maxcpu = -1; 1480 int maxcpu = -1;
1494 DEFINE_RCU_RANDOM(rand); 1481 DEFINE_RCU_RANDOM(rand);
1482 int ret;
1483 unsigned long starttime;
1495 1484
1496 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); 1485 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1497 for_each_online_cpu(cpu) 1486 for_each_online_cpu(cpu)
@@ -1506,29 +1495,57 @@ rcu_torture_onoff(void *arg)
1506 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); 1495 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1507 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { 1496 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1508 if (verbose) 1497 if (verbose)
1509 printk(KERN_ALERT "%s" TORTURE_FLAG 1498 pr_alert("%s" TORTURE_FLAG
1510 "rcu_torture_onoff task: offlining %d\n", 1499 "rcu_torture_onoff task: offlining %d\n",
1511 torture_type, cpu); 1500 torture_type, cpu);
1501 starttime = jiffies;
1512 n_offline_attempts++; 1502 n_offline_attempts++;
1513 if (cpu_down(cpu) == 0) { 1503 ret = cpu_down(cpu);
1504 if (ret) {
1505 if (verbose)
1506 pr_alert("%s" TORTURE_FLAG
1507 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1508 torture_type, cpu, ret);
1509 } else {
1514 if (verbose) 1510 if (verbose)
1515 printk(KERN_ALERT "%s" TORTURE_FLAG 1511 pr_alert("%s" TORTURE_FLAG
1516 "rcu_torture_onoff task: offlined %d\n", 1512 "rcu_torture_onoff task: offlined %d\n",
1517 torture_type, cpu); 1513 torture_type, cpu);
1518 n_offline_successes++; 1514 n_offline_successes++;
1515 delta = jiffies - starttime;
1516 sum_offline += delta;
1517 if (min_offline < 0) {
1518 min_offline = delta;
1519 max_offline = delta;
1520 }
1521 if (min_offline > delta)
1522 min_offline = delta;
1523 if (max_offline < delta)
1524 max_offline = delta;
1519 } 1525 }
1520 } else if (cpu_is_hotpluggable(cpu)) { 1526 } else if (cpu_is_hotpluggable(cpu)) {
1521 if (verbose) 1527 if (verbose)
1522 printk(KERN_ALERT "%s" TORTURE_FLAG 1528 pr_alert("%s" TORTURE_FLAG
1523 "rcu_torture_onoff task: onlining %d\n", 1529 "rcu_torture_onoff task: onlining %d\n",
1524 torture_type, cpu); 1530 torture_type, cpu);
1531 starttime = jiffies;
1525 n_online_attempts++; 1532 n_online_attempts++;
1526 if (cpu_up(cpu) == 0) { 1533 if (cpu_up(cpu) == 0) {
1527 if (verbose) 1534 if (verbose)
1528 printk(KERN_ALERT "%s" TORTURE_FLAG 1535 pr_alert("%s" TORTURE_FLAG
1529 "rcu_torture_onoff task: onlined %d\n", 1536 "rcu_torture_onoff task: onlined %d\n",
1530 torture_type, cpu); 1537 torture_type, cpu);
1531 n_online_successes++; 1538 n_online_successes++;
1539 delta = jiffies - starttime;
1540 sum_online += delta;
1541 if (min_online < 0) {
1542 min_online = delta;
1543 max_online = delta;
1544 }
1545 if (min_online > delta)
1546 min_online = delta;
1547 if (max_online < delta)
1548 max_online = delta;
1532 } 1549 }
1533 } 1550 }
1534 schedule_timeout_interruptible(onoff_interval * HZ); 1551 schedule_timeout_interruptible(onoff_interval * HZ);
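The added timing code measures how long each offline/online attempt takes and folds the result into sum/min/max statistics, using a negative minimum as the "no sample yet" sentinel (the sum_offline/min_offline/max_offline counters and their online counterparts are defined earlier in this patch). A small userspace sketch of the same accumulation pattern, with invented names:

#include <stdio.h>

/* Hypothetical stand-ins for sum_offline/min_offline/max_offline. */
static unsigned long sum_ms;
static long min_ms = -1;                /* -1 means "no sample seen yet" */
static long max_ms;

static void record_sample(long delta)
{
        sum_ms += delta;
        if (min_ms < 0) {               /* first sample primes both bounds */
                min_ms = delta;
                max_ms = delta;
        }
        if (min_ms > delta)
                min_ms = delta;
        if (max_ms < delta)
                max_ms = delta;
}

int main(void)
{
        long samples[] = { 12, 7, 31 };

        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                record_sample(samples[i]);
        printf("sum=%lu min=%ld max=%ld\n", sum_ms, min_ms, max_ms);
        return 0;
}
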
@@ -1593,14 +1610,14 @@ static int __cpuinit rcu_torture_stall(void *args)
1593 if (!kthread_should_stop()) { 1610 if (!kthread_should_stop()) {
1594 stop_at = get_seconds() + stall_cpu; 1611 stop_at = get_seconds() + stall_cpu;
1595 /* RCU CPU stall is expected behavior in following code. */ 1612 /* RCU CPU stall is expected behavior in following code. */
1596 printk(KERN_ALERT "rcu_torture_stall start.\n"); 1613 pr_alert("rcu_torture_stall start.\n");
1597 rcu_read_lock(); 1614 rcu_read_lock();
1598 preempt_disable(); 1615 preempt_disable();
1599 while (ULONG_CMP_LT(get_seconds(), stop_at)) 1616 while (ULONG_CMP_LT(get_seconds(), stop_at))
1600 continue; /* Induce RCU CPU stall warning. */ 1617 continue; /* Induce RCU CPU stall warning. */
1601 preempt_enable(); 1618 preempt_enable();
1602 rcu_read_unlock(); 1619 rcu_read_unlock();
1603 printk(KERN_ALERT "rcu_torture_stall end.\n"); 1620 pr_alert("rcu_torture_stall end.\n");
1604 } 1621 }
1605 rcutorture_shutdown_absorb("rcu_torture_stall"); 1622 rcutorture_shutdown_absorb("rcu_torture_stall");
1606 while (!kthread_should_stop()) 1623 while (!kthread_should_stop())
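rcu_torture_stall() deliberately spins for stall_cpu seconds with rcu_read_lock() held and preemption disabled, so the only thing under test is whether the stall-warning machinery notices. A rough userspace model of the bounded busy-wait (time(NULL) standing in for get_seconds(); the kernel compares with ULONG_CMP_LT() so the loop survives counter wrap):

#include <stdio.h>
#include <time.h>

int main(void)
{
        const int stall_seconds = 2;    /* plays the role of stall_cpu */
        time_t stop_at = time(NULL) + stall_seconds;

        printf("stall start\n");
        while (time(NULL) < stop_at)
                ;                       /* burn CPU until the deadline passes */
        printf("stall end\n");
        return 0;
}
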
@@ -1716,12 +1733,12 @@ static int rcu_torture_barrier_init(void)
1716 if (n_barrier_cbs == 0) 1733 if (n_barrier_cbs == 0)
1717 return 0; 1734 return 0;
1718 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { 1735 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1719 printk(KERN_ALERT "%s" TORTURE_FLAG 1736 pr_alert("%s" TORTURE_FLAG
1720 " Call or barrier ops missing for %s,\n", 1737 " Call or barrier ops missing for %s,\n",
1721 torture_type, cur_ops->name); 1738 torture_type, cur_ops->name);
1722 printk(KERN_ALERT "%s" TORTURE_FLAG 1739 pr_alert("%s" TORTURE_FLAG
1723 " RCU barrier testing omitted from run.\n", 1740 " RCU barrier testing omitted from run.\n",
1724 torture_type); 1741 torture_type);
1725 return 0; 1742 return 0;
1726 } 1743 }
1727 atomic_set(&barrier_cbs_count, 0); 1744 atomic_set(&barrier_cbs_count, 0);
@@ -1814,7 +1831,7 @@ rcu_torture_cleanup(void)
1814 mutex_lock(&fullstop_mutex); 1831 mutex_lock(&fullstop_mutex);
1815 rcutorture_record_test_transition(); 1832 rcutorture_record_test_transition();
1816 if (fullstop == FULLSTOP_SHUTDOWN) { 1833 if (fullstop == FULLSTOP_SHUTDOWN) {
1817 printk(KERN_WARNING /* but going down anyway, so... */ 1834 pr_warn(/* but going down anyway, so... */
1818 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1835 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
1819 mutex_unlock(&fullstop_mutex); 1836 mutex_unlock(&fullstop_mutex);
1820 schedule_timeout_uninterruptible(10); 1837 schedule_timeout_uninterruptible(10);
@@ -1903,8 +1920,6 @@ rcu_torture_cleanup(void)
1903 1920
1904 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 1921 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
1905 1922
1906 if (cur_ops->cleanup)
1907 cur_ops->cleanup();
1908 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1923 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1909 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1924 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1910 else if (n_online_successes != n_online_attempts || 1925 else if (n_online_successes != n_online_attempts ||
@@ -1938,17 +1953,17 @@ rcu_torture_init(void)
1938 break; 1953 break;
1939 } 1954 }
1940 if (i == ARRAY_SIZE(torture_ops)) { 1955 if (i == ARRAY_SIZE(torture_ops)) {
1941 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", 1956 pr_alert("rcu-torture: invalid torture type: \"%s\"\n",
1942 torture_type); 1957 torture_type);
1943 printk(KERN_ALERT "rcu-torture types:"); 1958 pr_alert("rcu-torture types:");
1944 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1959 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1945 printk(KERN_ALERT " %s", torture_ops[i]->name); 1960 pr_alert(" %s", torture_ops[i]->name);
1946 printk(KERN_ALERT "\n"); 1961 pr_alert("\n");
1947 mutex_unlock(&fullstop_mutex); 1962 mutex_unlock(&fullstop_mutex);
1948 return -EINVAL; 1963 return -EINVAL;
1949 } 1964 }
1950 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1965 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1951 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); 1966 pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
1952 fqs_duration = 0; 1967 fqs_duration = 0;
1953 } 1968 }
1954 if (cur_ops->init) 1969 if (cur_ops->init)
@@ -1996,14 +2011,15 @@ rcu_torture_init(void)
1996 /* Start up the kthreads. */ 2011 /* Start up the kthreads. */
1997 2012
1998 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); 2013 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
1999 writer_task = kthread_run(rcu_torture_writer, NULL, 2014 writer_task = kthread_create(rcu_torture_writer, NULL,
2000 "rcu_torture_writer"); 2015 "rcu_torture_writer");
2001 if (IS_ERR(writer_task)) { 2016 if (IS_ERR(writer_task)) {
2002 firsterr = PTR_ERR(writer_task); 2017 firsterr = PTR_ERR(writer_task);
2003 VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); 2018 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
2004 writer_task = NULL; 2019 writer_task = NULL;
2005 goto unwind; 2020 goto unwind;
2006 } 2021 }
2022 wake_up_process(writer_task);
2007 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 2023 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
2008 GFP_KERNEL); 2024 GFP_KERNEL);
2009 if (fakewriter_tasks == NULL) { 2025 if (fakewriter_tasks == NULL) {
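Replacing kthread_run() with kthread_create() plus an explicit wake_up_process() means the new task cannot start running until the caller has finished whatever setup it depends on; kthread_run() is simply kthread_create() followed by an immediate wake-up. A hedged sketch of the pattern in isolation (my_worker/my_task/start_worker are illustrative names, not from rcutorture):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *my_task;     /* illustrative only */

static int my_worker(void *arg)
{
        while (!kthread_should_stop())
                schedule_timeout_interruptible(HZ);
        return 0;
}

static int start_worker(void)
{
        my_task = kthread_create(my_worker, NULL, "my_worker");
        if (IS_ERR(my_task)) {
                int err = PTR_ERR(my_task);

                my_task = NULL;
                return err;             /* thread never ran; nothing to undo */
        }
        /* ...finish any state the thread will rely on... */
        wake_up_process(my_task);       /* only now can my_worker() execute */
        return 0;
}
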
@@ -2118,14 +2134,15 @@ rcu_torture_init(void)
2118 } 2134 }
2119 if (shutdown_secs > 0) { 2135 if (shutdown_secs > 0) {
2120 shutdown_time = jiffies + shutdown_secs * HZ; 2136 shutdown_time = jiffies + shutdown_secs * HZ;
2121 shutdown_task = kthread_run(rcu_torture_shutdown, NULL, 2137 shutdown_task = kthread_create(rcu_torture_shutdown, NULL,
2122 "rcu_torture_shutdown"); 2138 "rcu_torture_shutdown");
2123 if (IS_ERR(shutdown_task)) { 2139 if (IS_ERR(shutdown_task)) {
2124 firsterr = PTR_ERR(shutdown_task); 2140 firsterr = PTR_ERR(shutdown_task);
2125 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); 2141 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
2126 shutdown_task = NULL; 2142 shutdown_task = NULL;
2127 goto unwind; 2143 goto unwind;
2128 } 2144 }
2145 wake_up_process(shutdown_task);
2129 } 2146 }
2130 i = rcu_torture_onoff_init(); 2147 i = rcu_torture_onoff_init();
2131 if (i != 0) { 2148 if (i != 0) {
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f280e542e3e9..e441b77b614e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -52,6 +52,7 @@
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/delay.h> 53#include <linux/delay.h>
54#include <linux/stop_machine.h> 54#include <linux/stop_machine.h>
55#include <linux/random.h>
55 56
56#include "rcutree.h" 57#include "rcutree.h"
57#include <trace/events/rcu.h> 58#include <trace/events/rcu.h>
@@ -61,18 +62,19 @@
61/* Data structures. */ 62/* Data structures. */
62 63
63static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
64 66
65#define RCU_STATE_INITIALIZER(sname, cr) { \ 67#define RCU_STATE_INITIALIZER(sname, cr) { \
66 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
67 .call = cr, \ 69 .call = cr, \
68 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
69 .gpnum = -300, \ 71 .gpnum = 0UL - 300UL, \
70 .completed = -300, \ 72 .completed = 0UL - 300UL, \
71 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ 73 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
72 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
73 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
74 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
75 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ 77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
76 .name = #sname, \ 78 .name = #sname, \
77} 79}
78 80
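Spelling the initializers as 0UL - 300UL instead of -300 makes it explicit that ->gpnum and ->completed are unsigned long counters deliberately started 300 counts before wraparound, so the wrap path gets exercised soon after boot; comparisons elsewhere use the wraparound-safe ULONG_CMP_*() macros. A runnable demonstration of the idea; the macro bodies below are assumed to mirror kernel/rcu.h and should be checked against the tree:

#include <limits.h>
#include <stdio.h>

/* Assumed to match the kernel/rcu.h helpers. */
#define ULONG_CMP_GE(a, b)  (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)  (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned long gpnum = 0UL - 300UL;      /* i.e. ULONG_MAX - 299 */
        unsigned long later = gpnum + 400;      /* has wrapped through zero */

        printf("start=%lu later=%lu\n", gpnum, later);
        printf("later still compares as newer: %d\n",
               ULONG_CMP_LT(gpnum, later));
        return 0;
}
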
@@ -88,7 +90,7 @@ LIST_HEAD(rcu_struct_flavors);
88 90
89/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ 91/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
90static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; 92static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
91module_param(rcu_fanout_leaf, int, 0); 93module_param(rcu_fanout_leaf, int, 0444);
92int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 94int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
93static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ 95static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
94 NUM_RCU_LVL_0, 96 NUM_RCU_LVL_0,
@@ -133,13 +135,12 @@ static int rcu_scheduler_fully_active __read_mostly;
133 */ 135 */
134static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); 136static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
135DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 137DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
136DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
137DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 138DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
138DEFINE_PER_CPU(char, rcu_cpu_has_work); 139DEFINE_PER_CPU(char, rcu_cpu_has_work);
139 140
140#endif /* #ifdef CONFIG_RCU_BOOST */ 141#endif /* #ifdef CONFIG_RCU_BOOST */
141 142
142static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 143static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
143static void invoke_rcu_core(void); 144static void invoke_rcu_core(void);
144static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 145static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
145 146
@@ -175,8 +176,6 @@ void rcu_sched_qs(int cpu)
175{ 176{
176 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 177 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
177 178
178 rdp->passed_quiesce_gpnum = rdp->gpnum;
179 barrier();
180 if (rdp->passed_quiesce == 0) 179 if (rdp->passed_quiesce == 0)
181 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); 180 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
182 rdp->passed_quiesce = 1; 181 rdp->passed_quiesce = 1;
@@ -186,8 +185,6 @@ void rcu_bh_qs(int cpu)
186{ 185{
187 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 186 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
188 187
189 rdp->passed_quiesce_gpnum = rdp->gpnum;
190 barrier();
191 if (rdp->passed_quiesce == 0) 188 if (rdp->passed_quiesce == 0)
192 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); 189 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
193 rdp->passed_quiesce = 1; 190 rdp->passed_quiesce = 1;
@@ -212,13 +209,13 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
212 .dynticks = ATOMIC_INIT(1), 209 .dynticks = ATOMIC_INIT(1),
213}; 210};
214 211
215static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 212static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
216static int qhimark = 10000; /* If this many pending, ignore blimit. */ 213static long qhimark = 10000; /* If this many pending, ignore blimit. */
217static int qlowmark = 100; /* Once only this many pending, use blimit. */ 214static long qlowmark = 100; /* Once only this many pending, use blimit. */
218 215
219module_param(blimit, int, 0); 216module_param(blimit, long, 0444);
220module_param(qhimark, int, 0); 217module_param(qhimark, long, 0444);
221module_param(qlowmark, int, 0); 218module_param(qlowmark, long, 0444);
222 219
223int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 220int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
224int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 221int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
@@ -226,7 +223,14 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
226module_param(rcu_cpu_stall_suppress, int, 0644); 223module_param(rcu_cpu_stall_suppress, int, 0644);
227module_param(rcu_cpu_stall_timeout, int, 0644); 224module_param(rcu_cpu_stall_timeout, int, 0644);
228 225
229static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 226static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
227static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
228
229module_param(jiffies_till_first_fqs, ulong, 0644);
230module_param(jiffies_till_next_fqs, ulong, 0644);
231
232static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
233static void force_quiescent_state(struct rcu_state *rsp);
230static int rcu_pending(int cpu); 234static int rcu_pending(int cpu);
231 235
232/* 236/*
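jiffies_till_first_fqs and jiffies_till_next_fqs turn the fixed RCU_JIFFIES_TILL_FORCE_QS delay into runtime-tunable knobs; mode 0644 exposes them writable under /sys/module/rcutree/parameters/, unlike the 0444 (read-only) conversions above. A hedged, standalone-module sketch of the same declaration pattern (demo_interval is an invented name, not an RCU parameter):

#include <linux/module.h>
#include <linux/moduleparam.h>

static ulong demo_interval = 3;             /* invented knob for illustration */
module_param(demo_interval, ulong, 0644);   /* writable via sysfs by root */
MODULE_PARM_DESC(demo_interval, "Delay in jiffies between demo actions");

static int __init demo_init(void)
{
        pr_info("demo_interval=%lu\n", demo_interval);
        return 0;
}
module_init(demo_init);

static void __exit demo_exit(void)
{
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");
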
@@ -252,7 +256,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
252 */ 256 */
253void rcu_bh_force_quiescent_state(void) 257void rcu_bh_force_quiescent_state(void)
254{ 258{
255 force_quiescent_state(&rcu_bh_state, 0); 259 force_quiescent_state(&rcu_bh_state);
256} 260}
257EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 261EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
258 262
@@ -286,7 +290,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress);
286 */ 290 */
287void rcu_sched_force_quiescent_state(void) 291void rcu_sched_force_quiescent_state(void)
288{ 292{
289 force_quiescent_state(&rcu_sched_state, 0); 293 force_quiescent_state(&rcu_sched_state);
290} 294}
291EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); 295EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
292 296
@@ -296,7 +300,8 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
296static int 300static int
297cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) 301cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
298{ 302{
299 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; 303 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
304 rdp->nxttail[RCU_DONE_TAIL] != NULL;
300} 305}
301 306
302/* 307/*
@@ -305,7 +310,12 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
305static int 310static int
306cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 311cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
307{ 312{
308 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); 313 struct rcu_head **ntp;
314
315 ntp = rdp->nxttail[RCU_DONE_TAIL +
316 (ACCESS_ONCE(rsp->completed) != rdp->completed)];
317 return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
318 !rcu_gp_in_progress(rsp);
309} 319}
310 320
311/* 321/*
@@ -317,45 +327,17 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
317} 327}
318 328
319/* 329/*
320 * If the specified CPU is offline, tell the caller that it is in 330 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
321 * a quiescent state. Otherwise, whack it with a reschedule IPI.
322 * Grace periods can end up waiting on an offline CPU when that
323 * CPU is in the process of coming online -- it will be added to the
324 * rcu_node bitmasks before it actually makes it online. The same thing
325 * can happen while a CPU is in the process of coming online. Because this
326 * race is quite rare, we check for it after detecting that the grace
327 * period has been delayed rather than checking each and every CPU
328 * each and every time we start a new grace period.
329 */
330static int rcu_implicit_offline_qs(struct rcu_data *rdp)
331{
332 /*
333 * If the CPU is offline for more than a jiffy, it is in a quiescent
334 * state. We can trust its state not to change because interrupts
335 * are disabled. The reason for the jiffy's worth of slack is to
336 * handle CPUs initializing on the way up and finding their way
337 * to the idle loop on the way down.
338 */
339 if (cpu_is_offline(rdp->cpu) &&
340 ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {
341 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
342 rdp->offline_fqs++;
343 return 1;
344 }
345 return 0;
346}
347
348/*
349 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
350 * 331 *
351 * If the new value of the ->dynticks_nesting counter now is zero, 332 * If the new value of the ->dynticks_nesting counter now is zero,
352 * we really have entered idle, and must do the appropriate accounting. 333 * we really have entered idle, and must do the appropriate accounting.
353 * The caller must have disabled interrupts. 334 * The caller must have disabled interrupts.
354 */ 335 */
355static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) 336static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
337 bool user)
356{ 338{
357 trace_rcu_dyntick("Start", oldval, 0); 339 trace_rcu_dyntick("Start", oldval, 0);
358 if (!is_idle_task(current)) { 340 if (!user && !is_idle_task(current)) {
359 struct task_struct *idle = idle_task(smp_processor_id()); 341 struct task_struct *idle = idle_task(smp_processor_id());
360 342
361 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); 343 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
@@ -372,7 +354,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
372 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 354 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
373 355
374 /* 356 /*
375 * The idle task is not permitted to enter the idle loop while 357 * It is illegal to enter an extended quiescent state while
376 * in an RCU read-side critical section. 358 * in an RCU read-side critical section.
377 */ 359 */
378 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), 360 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
@@ -383,6 +365,25 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
383 "Illegal idle entry in RCU-sched read-side critical section."); 365 "Illegal idle entry in RCU-sched read-side critical section.");
384} 366}
385 367
368/*
369 * Enter an RCU extended quiescent state, which can be either the
370 * idle loop or adaptive-tickless usermode execution.
371 */
372static void rcu_eqs_enter(bool user)
373{
374 long long oldval;
375 struct rcu_dynticks *rdtp;
376
377 rdtp = &__get_cpu_var(rcu_dynticks);
378 oldval = rdtp->dynticks_nesting;
379 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
380 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
381 rdtp->dynticks_nesting = 0;
382 else
383 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
384 rcu_eqs_enter_common(rdtp, oldval, user);
385}
386
386/** 387/**
387 * rcu_idle_enter - inform RCU that current CPU is entering idle 388 * rcu_idle_enter - inform RCU that current CPU is entering idle
388 * 389 *
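rcu_eqs_enter() factors the old rcu_idle_enter() bookkeeping into a helper shared by the idle path and the new user-mode path. The ->dynticks_nesting counter keeps process-level nesting in high-order units of DYNTICK_TASK_NEST_VALUE and irq nesting in ones, so dropping the last process-level reference zeroes the counter. A toy userspace model of that arithmetic using made-up constants (the real DYNTICK_TASK_* values live in rcupdate.h and differ):

#include <stdio.h>

/* Made-up stand-ins for the DYNTICK_TASK_* constants. */
#define NEST_VALUE  0x100LL             /* one process-level nesting unit */
#define NEST_MASK   (~0xffLL)           /* selects the process-level bits */

static long long nesting = NEST_VALUE;  /* one process level, no irq nesting */

static void eqs_enter(void)
{
        if ((nesting & NEST_MASK) == NEST_VALUE)
                nesting = 0;            /* last level: extended quiescent state */
        else
                nesting -= NEST_VALUE;  /* still nested: drop one level */
}

int main(void)
{
        printf("before: %#llx\n", (unsigned long long)nesting);
        eqs_enter();
        printf("after:  %#llx (0 == extended quiescent state)\n",
               (unsigned long long)nesting);
        return 0;
}
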
@@ -398,21 +399,48 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
398void rcu_idle_enter(void) 399void rcu_idle_enter(void)
399{ 400{
400 unsigned long flags; 401 unsigned long flags;
401 long long oldval; 402
403 local_irq_save(flags);
404 rcu_eqs_enter(false);
405 local_irq_restore(flags);
406}
407EXPORT_SYMBOL_GPL(rcu_idle_enter);
408
409#ifdef CONFIG_RCU_USER_QS
410/**
411 * rcu_user_enter - inform RCU that we are resuming userspace.
412 *
413 * Enter RCU idle mode right before resuming userspace. No use of RCU
414 * is permitted between this call and rcu_user_exit(). This way the
415 * CPU doesn't need to maintain the tick for RCU maintenance purposes
416 * when the CPU runs in userspace.
417 */
418void rcu_user_enter(void)
419{
420 rcu_eqs_enter(1);
421}
422
423/**
424 * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
425 * after the current irq returns.
426 *
427 * This is similar to rcu_user_enter() but in the context of a non-nesting
428 * irq. After this call, RCU enters into idle mode when the interrupt
429 * returns.
430 */
431void rcu_user_enter_after_irq(void)
432{
433 unsigned long flags;
402 struct rcu_dynticks *rdtp; 434 struct rcu_dynticks *rdtp;
403 435
404 local_irq_save(flags); 436 local_irq_save(flags);
405 rdtp = &__get_cpu_var(rcu_dynticks); 437 rdtp = &__get_cpu_var(rcu_dynticks);
406 oldval = rdtp->dynticks_nesting; 438 /* Ensure this irq is interrupting a non-idle RCU state. */
407 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 439 WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
408 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 440 rdtp->dynticks_nesting = 1;
409 rdtp->dynticks_nesting = 0;
410 else
411 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
412 rcu_idle_enter_common(rdtp, oldval);
413 local_irq_restore(flags); 441 local_irq_restore(flags);
414} 442}
415EXPORT_SYMBOL_GPL(rcu_idle_enter); 443#endif /* CONFIG_RCU_USER_QS */
416 444
417/** 445/**
418 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 446 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -444,18 +472,19 @@ void rcu_irq_exit(void)
444 if (rdtp->dynticks_nesting) 472 if (rdtp->dynticks_nesting)
445 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); 473 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
446 else 474 else
447 rcu_idle_enter_common(rdtp, oldval); 475 rcu_eqs_enter_common(rdtp, oldval, true);
448 local_irq_restore(flags); 476 local_irq_restore(flags);
449} 477}
450 478
451/* 479/*
452 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle 480 * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
453 * 481 *
454 * If the new value of the ->dynticks_nesting counter was previously zero, 482 * If the new value of the ->dynticks_nesting counter was previously zero,
455 * we really have exited idle, and must do the appropriate accounting. 483 * we really have exited idle, and must do the appropriate accounting.
456 * The caller must have disabled interrupts. 484 * The caller must have disabled interrupts.
457 */ 485 */
458static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) 486static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
487 int user)
459{ 488{
460 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 489 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
461 atomic_inc(&rdtp->dynticks); 490 atomic_inc(&rdtp->dynticks);
@@ -464,7 +493,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
464 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 493 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
465 rcu_cleanup_after_idle(smp_processor_id()); 494 rcu_cleanup_after_idle(smp_processor_id());
466 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); 495 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
467 if (!is_idle_task(current)) { 496 if (!user && !is_idle_task(current)) {
468 struct task_struct *idle = idle_task(smp_processor_id()); 497 struct task_struct *idle = idle_task(smp_processor_id());
469 498
470 trace_rcu_dyntick("Error on exit: not idle task", 499 trace_rcu_dyntick("Error on exit: not idle task",
@@ -476,6 +505,25 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
476 } 505 }
477} 506}
478 507
508/*
509 * Exit an RCU extended quiescent state, which can be either the
510 * idle loop or adaptive-tickless usermode execution.
511 */
512static void rcu_eqs_exit(bool user)
513{
514 struct rcu_dynticks *rdtp;
515 long long oldval;
516
517 rdtp = &__get_cpu_var(rcu_dynticks);
518 oldval = rdtp->dynticks_nesting;
519 WARN_ON_ONCE(oldval < 0);
520 if (oldval & DYNTICK_TASK_NEST_MASK)
521 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
522 else
523 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
524 rcu_eqs_exit_common(rdtp, oldval, user);
525}
526
479/** 527/**
480 * rcu_idle_exit - inform RCU that current CPU is leaving idle 528 * rcu_idle_exit - inform RCU that current CPU is leaving idle
481 * 529 *
@@ -490,21 +538,47 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
490void rcu_idle_exit(void) 538void rcu_idle_exit(void)
491{ 539{
492 unsigned long flags; 540 unsigned long flags;
541
542 local_irq_save(flags);
543 rcu_eqs_exit(false);
544 local_irq_restore(flags);
545}
546EXPORT_SYMBOL_GPL(rcu_idle_exit);
547
548#ifdef CONFIG_RCU_USER_QS
549/**
550 * rcu_user_exit - inform RCU that we are exiting userspace.
551 *
552 * Exit RCU idle mode while entering the kernel because it can
553 * run a RCU read side critical section anytime.
554 */
555void rcu_user_exit(void)
556{
557 rcu_eqs_exit(1);
558}
559
560/**
561 * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
562 * idle mode after the current non-nesting irq returns.
563 *
564 * This is similar to rcu_user_exit() but in the context of an irq.
565 * This is called when the irq has interrupted a userspace RCU idle mode
566 * context. When the current non-nesting interrupt returns after this call,
567 * the CPU won't restore the RCU idle mode.
568 */
569void rcu_user_exit_after_irq(void)
570{
571 unsigned long flags;
493 struct rcu_dynticks *rdtp; 572 struct rcu_dynticks *rdtp;
494 long long oldval;
495 573
496 local_irq_save(flags); 574 local_irq_save(flags);
497 rdtp = &__get_cpu_var(rcu_dynticks); 575 rdtp = &__get_cpu_var(rcu_dynticks);
498 oldval = rdtp->dynticks_nesting; 576 /* Ensure we are interrupting an RCU idle mode. */
499 WARN_ON_ONCE(oldval < 0); 577 WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
500 if (oldval & DYNTICK_TASK_NEST_MASK) 578 rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
501 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
502 else
503 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
504 rcu_idle_exit_common(rdtp, oldval);
505 local_irq_restore(flags); 579 local_irq_restore(flags);
506} 580}
507EXPORT_SYMBOL_GPL(rcu_idle_exit); 581#endif /* CONFIG_RCU_USER_QS */
508 582
509/** 583/**
510 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 584 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -539,7 +613,7 @@ void rcu_irq_enter(void)
539 if (oldval) 613 if (oldval)
540 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); 614 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
541 else 615 else
542 rcu_idle_exit_common(rdtp, oldval); 616 rcu_eqs_exit_common(rdtp, oldval, true);
543 local_irq_restore(flags); 617 local_irq_restore(flags);
544} 618}
545 619
@@ -673,7 +747,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
673 * Return true if the specified CPU has passed through a quiescent 747 * Return true if the specified CPU has passed through a quiescent
674 * state by virtue of being in or having passed through an dynticks 748 * state by virtue of being in or having passed through an dynticks
675 * idle state since the last call to dyntick_save_progress_counter() 749 * idle state since the last call to dyntick_save_progress_counter()
676 * for this same CPU. 750 * for this same CPU, or by virtue of having been offline.
677 */ 751 */
678static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 752static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
679{ 753{
@@ -697,8 +771,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
697 return 1; 771 return 1;
698 } 772 }
699 773
700 /* Go check for the CPU being offline. */ 774 /*
701 return rcu_implicit_offline_qs(rdp); 775 * Check for the CPU being offline, but only if the grace period
776 * is old enough. We don't need to worry about the CPU changing
777 * state: If we see it offline even once, it has been through a
778 * quiescent state.
779 *
780 * The reason for insisting that the grace period be at least
781 * one jiffy old is that CPUs that are not quite online and that
782 * have just gone offline can still execute RCU read-side critical
783 * sections.
784 */
785 if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
786 return 0; /* Grace period is not old enough. */
787 barrier();
788 if (cpu_is_offline(rdp->cpu)) {
789 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
790 rdp->offline_fqs++;
791 return 1;
792 }
793 return 0;
702} 794}
703 795
704static int jiffies_till_stall_check(void) 796static int jiffies_till_stall_check(void)
@@ -725,6 +817,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
725 rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); 817 rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
726} 818}
727 819
820/*
821 * Dump stacks of all tasks running on stalled CPUs. This is a fallback
822 * for architectures that do not implement trigger_all_cpu_backtrace().
823 * The NMI-triggered stack traces are more accurate because they are
824 * printed by the target CPU.
825 */
826static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
827{
828 int cpu;
829 unsigned long flags;
830 struct rcu_node *rnp;
831
832 rcu_for_each_leaf_node(rsp, rnp) {
833 raw_spin_lock_irqsave(&rnp->lock, flags);
834 if (rnp->qsmask != 0) {
835 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
836 if (rnp->qsmask & (1UL << cpu))
837 dump_cpu_task(rnp->grplo + cpu);
838 }
839 raw_spin_unlock_irqrestore(&rnp->lock, flags);
840 }
841}
842
728static void print_other_cpu_stall(struct rcu_state *rsp) 843static void print_other_cpu_stall(struct rcu_state *rsp)
729{ 844{
730 int cpu; 845 int cpu;
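rcu_dump_cpu_stacks() recovers CPU numbers from each leaf rcu_node by offsetting the set bits of ->qsmask from ->grplo, and the stall printout below reuses exactly the same mapping. A runnable userspace sketch of that bit-to-CPU translation (the grplo/qsmask values are invented for the example):

#include <stdio.h>

int main(void)
{
        /* Invented leaf node covering CPUs 8..15; three holdout CPUs. */
        unsigned long grplo = 8, grphi = 15;
        unsigned long qsmask = 0x29;    /* bits 0, 3, 5 -> CPUs 8, 11, 13 */

        for (unsigned long bit = 0; bit <= grphi - grplo; bit++)
                if (qsmask & (1UL << bit))
                        printf("CPU %lu has not yet reported a quiescent state\n",
                               grplo + bit);
        return 0;
}
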
@@ -732,6 +847,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
732 unsigned long flags; 847 unsigned long flags;
733 int ndetected = 0; 848 int ndetected = 0;
734 struct rcu_node *rnp = rcu_get_root(rsp); 849 struct rcu_node *rnp = rcu_get_root(rsp);
850 long totqlen = 0;
735 851
736 /* Only let one CPU complain about others per time interval. */ 852 /* Only let one CPU complain about others per time interval. */
737 853
@@ -755,14 +871,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
755 rcu_for_each_leaf_node(rsp, rnp) { 871 rcu_for_each_leaf_node(rsp, rnp) {
756 raw_spin_lock_irqsave(&rnp->lock, flags); 872 raw_spin_lock_irqsave(&rnp->lock, flags);
757 ndetected += rcu_print_task_stall(rnp); 873 ndetected += rcu_print_task_stall(rnp);
874 if (rnp->qsmask != 0) {
875 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
876 if (rnp->qsmask & (1UL << cpu)) {
877 print_cpu_stall_info(rsp,
878 rnp->grplo + cpu);
879 ndetected++;
880 }
881 }
758 raw_spin_unlock_irqrestore(&rnp->lock, flags); 882 raw_spin_unlock_irqrestore(&rnp->lock, flags);
759 if (rnp->qsmask == 0)
760 continue;
761 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
762 if (rnp->qsmask & (1UL << cpu)) {
763 print_cpu_stall_info(rsp, rnp->grplo + cpu);
764 ndetected++;
765 }
766 } 883 }
767 884
768 /* 885 /*
@@ -775,24 +892,29 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
775 raw_spin_unlock_irqrestore(&rnp->lock, flags); 892 raw_spin_unlock_irqrestore(&rnp->lock, flags);
776 893
777 print_cpu_stall_info_end(); 894 print_cpu_stall_info_end();
778 printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", 895 for_each_possible_cpu(cpu)
779 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 896 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
897 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
898 smp_processor_id(), (long)(jiffies - rsp->gp_start),
899 rsp->gpnum, rsp->completed, totqlen);
780 if (ndetected == 0) 900 if (ndetected == 0)
781 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 901 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
782 else if (!trigger_all_cpu_backtrace()) 902 else if (!trigger_all_cpu_backtrace())
783 dump_stack(); 903 rcu_dump_cpu_stacks(rsp);
784 904
785 /* If so configured, complain about tasks blocking the grace period. */ 905 /* Complain about tasks blocking the grace period. */
786 906
787 rcu_print_detail_task_stall(rsp); 907 rcu_print_detail_task_stall(rsp);
788 908
789 force_quiescent_state(rsp, 0); /* Kick them all. */ 909 force_quiescent_state(rsp); /* Kick them all. */
790} 910}
791 911
792static void print_cpu_stall(struct rcu_state *rsp) 912static void print_cpu_stall(struct rcu_state *rsp)
793{ 913{
914 int cpu;
794 unsigned long flags; 915 unsigned long flags;
795 struct rcu_node *rnp = rcu_get_root(rsp); 916 struct rcu_node *rnp = rcu_get_root(rsp);
917 long totqlen = 0;
796 918
797 /* 919 /*
798 * OK, time to rat on ourselves... 920 * OK, time to rat on ourselves...
@@ -803,7 +925,10 @@ static void print_cpu_stall(struct rcu_state *rsp)
803 print_cpu_stall_info_begin(); 925 print_cpu_stall_info_begin();
804 print_cpu_stall_info(rsp, smp_processor_id()); 926 print_cpu_stall_info(rsp, smp_processor_id());
805 print_cpu_stall_info_end(); 927 print_cpu_stall_info_end();
806 printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); 928 for_each_possible_cpu(cpu)
929 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
930 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
931 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
807 if (!trigger_all_cpu_backtrace()) 932 if (!trigger_all_cpu_backtrace())
808 dump_stack(); 933 dump_stack();
809 934
@@ -827,7 +952,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
827 j = ACCESS_ONCE(jiffies); 952 j = ACCESS_ONCE(jiffies);
828 js = ACCESS_ONCE(rsp->jiffies_stall); 953 js = ACCESS_ONCE(rsp->jiffies_stall);
829 rnp = rdp->mynode; 954 rnp = rdp->mynode;
830 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { 955 if (rcu_gp_in_progress(rsp) &&
956 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
831 957
832 /* We haven't checked in, so go dump stack. */ 958 /* We haven't checked in, so go dump stack. */
833 print_cpu_stall(rsp); 959 print_cpu_stall(rsp);
@@ -889,12 +1015,8 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
889 */ 1015 */
890 rdp->gpnum = rnp->gpnum; 1016 rdp->gpnum = rnp->gpnum;
891 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); 1017 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
892 if (rnp->qsmask & rdp->grpmask) { 1018 rdp->passed_quiesce = 0;
893 rdp->qs_pending = 1; 1019 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
894 rdp->passed_quiesce = 0;
895 } else {
896 rdp->qs_pending = 0;
897 }
898 zero_cpu_stall_ticks(rdp); 1020 zero_cpu_stall_ticks(rdp);
899 } 1021 }
900} 1022}
@@ -945,6 +1067,7 @@ static void init_callback_list(struct rcu_data *rdp)
945 rdp->nxtlist = NULL; 1067 rdp->nxtlist = NULL;
946 for (i = 0; i < RCU_NEXT_SIZE; i++) 1068 for (i = 0; i < RCU_NEXT_SIZE; i++)
947 rdp->nxttail[i] = &rdp->nxtlist; 1069 rdp->nxttail[i] = &rdp->nxtlist;
1070 init_nocb_callback_list(rdp);
948} 1071}
949 1072
950/* 1073/*
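init_callback_list() resets the per-CPU callback queue: ->nxtlist is one singly linked list, and ->nxttail[] holds a pointer to the tail 'next' field of each segment (done, waiting, next-ready, next), all aimed at the empty list to start with; the new init_nocb_callback_list() call lets no-callbacks CPUs opt out of this scheme entirely. A compact userspace model of the tail-pointer-array layout (segment set simplified, callback type faked):

#include <stdio.h>
#include <stddef.h>

struct cb {                             /* stand-in for struct rcu_head */
        struct cb *next;
        int id;
};

enum { SEG_DONE, SEG_WAIT, SEG_NEXT, NSEGS };   /* simplified segments */

static struct cb *list;                 /* like rdp->nxtlist */
static struct cb **tail[NSEGS];         /* like rdp->nxttail[] */

static void init_list(void)
{
        list = NULL;
        for (int i = 0; i < NSEGS; i++)
                tail[i] = &list;        /* every segment starts empty */
}

static void enqueue(struct cb *cb)      /* new callbacks land in the last segment */
{
        cb->next = NULL;
        *tail[NSEGS - 1] = cb;
        tail[NSEGS - 1] = &cb->next;
}

int main(void)
{
        struct cb a = { .id = 1 }, b = { .id = 2 };

        init_list();
        enqueue(&a);
        enqueue(&b);
        for (struct cb *p = list; p; p = p->next)
                printf("queued callback %d\n", p->id);
        return 0;
}
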
@@ -974,10 +1097,13 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
974 * our behalf. Catch up with this state to avoid noting 1097 * our behalf. Catch up with this state to avoid noting
975 * spurious new grace periods. If another grace period 1098 * spurious new grace periods. If another grace period
976 * has started, then rnp->gpnum will have advanced, so 1099 * has started, then rnp->gpnum will have advanced, so
977 * we will detect this later on. 1100 * we will detect this later on. Of course, any quiescent
1101 * states we found for the old GP are now invalid.
978 */ 1102 */
979 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) 1103 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) {
980 rdp->gpnum = rdp->completed; 1104 rdp->gpnum = rdp->completed;
1105 rdp->passed_quiesce = 0;
1106 }
981 1107
982 /* 1108 /*
983 * If RCU does not need a quiescent state from this CPU, 1109 * If RCU does not need a quiescent state from this CPU,
@@ -1021,97 +1147,56 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1021 /* Prior grace period ended, so advance callbacks for current CPU. */ 1147 /* Prior grace period ended, so advance callbacks for current CPU. */
1022 __rcu_process_gp_end(rsp, rnp, rdp); 1148 __rcu_process_gp_end(rsp, rnp, rdp);
1023 1149
1024 /*
1025 * Because this CPU just now started the new grace period, we know
1026 * that all of its callbacks will be covered by this upcoming grace
1027 * period, even the ones that were registered arbitrarily recently.
1028 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
1029 *
1030 * Other CPUs cannot be sure exactly when the grace period started.
1031 * Therefore, their recently registered callbacks must pass through
1032 * an additional RCU_NEXT_READY stage, so that they will be handled
1033 * by the next RCU grace period.
1034 */
1035 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1036 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1037
1038 /* Set state so that this CPU will detect the next quiescent state. */ 1150 /* Set state so that this CPU will detect the next quiescent state. */
1039 __note_new_gpnum(rsp, rnp, rdp); 1151 __note_new_gpnum(rsp, rnp, rdp);
1040} 1152}
1041 1153
1042/* 1154/*
1043 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1155 * Initialize a new grace period.
1044 * in preparation for detecting the next grace period. The caller must hold
1045 * the root node's ->lock, which is released before return. Hard irqs must
1046 * be disabled.
1047 *
1048 * Note that it is legal for a dying CPU (which is marked as offline) to
1049 * invoke this function. This can happen when the dying CPU reports its
1050 * quiescent state.
1051 */ 1156 */
1052static void 1157static int rcu_gp_init(struct rcu_state *rsp)
1053rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1054 __releases(rcu_get_root(rsp)->lock)
1055{ 1158{
1056 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1159 struct rcu_data *rdp;
1057 struct rcu_node *rnp = rcu_get_root(rsp); 1160 struct rcu_node *rnp = rcu_get_root(rsp);
1058 1161
1059 if (!rcu_scheduler_fully_active || 1162 raw_spin_lock_irq(&rnp->lock);
1060 !cpu_needs_another_gp(rsp, rdp)) { 1163 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1061 /*
1062 * Either the scheduler hasn't yet spawned the first
1063 * non-idle task or this CPU does not need another
1064 * grace period. Either way, don't start a new grace
1065 * period.
1066 */
1067 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1068 return;
1069 }
1070 1164
1071 if (rsp->fqs_active) { 1165 if (rcu_gp_in_progress(rsp)) {
1072 /* 1166 /* Grace period already in progress, don't start another. */
1073 * This CPU needs a grace period, but force_quiescent_state() 1167 raw_spin_unlock_irq(&rnp->lock);
1074 * is running. Tell it to start one on this CPU's behalf. 1168 return 0;
1075 */
1076 rsp->fqs_need_gp = 1;
1077 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1078 return;
1079 } 1169 }
1080 1170
1081 /* Advance to a new grace period and initialize state. */ 1171 /* Advance to a new grace period and initialize state. */
1082 rsp->gpnum++; 1172 rsp->gpnum++;
1083 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 1173 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
1084 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
1085 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
1086 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1087 record_gp_stall_check_time(rsp); 1174 record_gp_stall_check_time(rsp);
1088 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ 1175 raw_spin_unlock_irq(&rnp->lock);
1089 1176
1090 /* Exclude any concurrent CPU-hotplug operations. */ 1177 /* Exclude any concurrent CPU-hotplug operations. */
1091 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1178 mutex_lock(&rsp->onoff_mutex);
1092 1179
1093 /* 1180 /*
1094 * Set the quiescent-state-needed bits in all the rcu_node 1181 * Set the quiescent-state-needed bits in all the rcu_node
1095 * structures for all currently online CPUs in breadth-first 1182 * structures for all currently online CPUs in breadth-first order,
1096 * order, starting from the root rcu_node structure. This 1183 * starting from the root rcu_node structure, relying on the layout
1097 * operation relies on the layout of the hierarchy within the 1184 * of the tree within the rsp->node[] array. Note that other CPUs
1098 * rsp->node[] array. Note that other CPUs will access only 1185 * will access only the leaves of the hierarchy, thus seeing that no
1099 * the leaves of the hierarchy, which still indicate that no
1100 * grace period is in progress, at least until the corresponding 1186 * grace period is in progress, at least until the corresponding
1101 * leaf node has been initialized. In addition, we have excluded 1187 * leaf node has been initialized. In addition, we have excluded
1102 * CPU-hotplug operations. 1188 * CPU-hotplug operations.
1103 * 1189 *
1104 * Note that the grace period cannot complete until we finish 1190 * The grace period cannot complete until the initialization
1105 * the initialization process, as there will be at least one 1191 * process finishes, because this kthread handles both.
1106 * qsmask bit set in the root node until that time, namely the
1107 * one corresponding to this CPU, due to the fact that we have
1108 * irqs disabled.
1109 */ 1192 */
1110 rcu_for_each_node_breadth_first(rsp, rnp) { 1193 rcu_for_each_node_breadth_first(rsp, rnp) {
1111 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1194 raw_spin_lock_irq(&rnp->lock);
1195 rdp = this_cpu_ptr(rsp->rda);
1112 rcu_preempt_check_blocked_tasks(rnp); 1196 rcu_preempt_check_blocked_tasks(rnp);
1113 rnp->qsmask = rnp->qsmaskinit; 1197 rnp->qsmask = rnp->qsmaskinit;
1114 rnp->gpnum = rsp->gpnum; 1198 rnp->gpnum = rsp->gpnum;
1199 WARN_ON_ONCE(rnp->completed != rsp->completed);
1115 rnp->completed = rsp->completed; 1200 rnp->completed = rsp->completed;
1116 if (rnp == rdp->mynode) 1201 if (rnp == rdp->mynode)
1117 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1202 rcu_start_gp_per_cpu(rsp, rnp, rdp);
@@ -1119,37 +1204,54 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1119 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1204 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1120 rnp->level, rnp->grplo, 1205 rnp->level, rnp->grplo,
1121 rnp->grphi, rnp->qsmask); 1206 rnp->grphi, rnp->qsmask);
1122 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1207 raw_spin_unlock_irq(&rnp->lock);
1208#ifdef CONFIG_PROVE_RCU_DELAY
1209 if ((random32() % (rcu_num_nodes * 8)) == 0)
1210 schedule_timeout_uninterruptible(2);
1211#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1212 cond_resched();
1123 } 1213 }
1124 1214
1125 rnp = rcu_get_root(rsp); 1215 mutex_unlock(&rsp->onoff_mutex);
1126 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1216 return 1;
1127 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
1128 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1129 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1130} 1217}
1131 1218
1132/* 1219/*
1133 * Report a full set of quiescent states to the specified rcu_state 1220 * Do one round of quiescent-state forcing.
1134 * data structure. This involves cleaning up after the prior grace
1135 * period and letting rcu_start_gp() start up the next grace period
1136 * if one is needed. Note that the caller must hold rnp->lock, as
1137 * required by rcu_start_gp(), which will release it.
1138 */ 1221 */
1139static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 1222int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1140 __releases(rcu_get_root(rsp)->lock)
1141{ 1223{
1142 unsigned long gp_duration; 1224 int fqs_state = fqs_state_in;
1143 struct rcu_node *rnp = rcu_get_root(rsp); 1225 struct rcu_node *rnp = rcu_get_root(rsp);
1144 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1145 1226
1146 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 1227 rsp->n_force_qs++;
1228 if (fqs_state == RCU_SAVE_DYNTICK) {
1229 /* Collect dyntick-idle snapshots. */
1230 force_qs_rnp(rsp, dyntick_save_progress_counter);
1231 fqs_state = RCU_FORCE_QS;
1232 } else {
1233 /* Handle dyntick-idle and offline CPUs. */
1234 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1235 }
1236 /* Clear flag to prevent immediate re-entry. */
1237 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1238 raw_spin_lock_irq(&rnp->lock);
1239 rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
1240 raw_spin_unlock_irq(&rnp->lock);
1241 }
1242 return fqs_state;
1243}
1147 1244
1148 /* 1245/*
1149 * Ensure that all grace-period and pre-grace-period activity 1246 * Clean up after the old grace period.
1150 * is seen before the assignment to rsp->completed. 1247 */
1151 */ 1248static void rcu_gp_cleanup(struct rcu_state *rsp)
1152 smp_mb(); /* See above block comment. */ 1249{
1250 unsigned long gp_duration;
1251 struct rcu_data *rdp;
1252 struct rcu_node *rnp = rcu_get_root(rsp);
1253
1254 raw_spin_lock_irq(&rnp->lock);
1153 gp_duration = jiffies - rsp->gp_start; 1255 gp_duration = jiffies - rsp->gp_start;
1154 if (gp_duration > rsp->gp_max) 1256 if (gp_duration > rsp->gp_max)
1155 rsp->gp_max = gp_duration; 1257 rsp->gp_max = gp_duration;
@@ -1161,35 +1263,171 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1161 * they can do to advance the grace period. It is therefore 1263 * they can do to advance the grace period. It is therefore
1162 * safe for us to drop the lock in order to mark the grace 1264 * safe for us to drop the lock in order to mark the grace
1163 * period as completed in all of the rcu_node structures. 1265 * period as completed in all of the rcu_node structures.
1164 *
1165 * But if this CPU needs another grace period, it will take
1166 * care of this while initializing the next grace period.
1167 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
1168 * because the callbacks have not yet been advanced: Those
1169 * callbacks are waiting on the grace period that just now
1170 * completed.
1171 */ 1266 */
1172 if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { 1267 raw_spin_unlock_irq(&rnp->lock);
1173 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1174 1268
1175 /* 1269 /*
1176 * Propagate new ->completed value to rcu_node structures 1270 * Propagate new ->completed value to rcu_node structures so
1177 * so that other CPUs don't have to wait until the start 1271 * that other CPUs don't have to wait until the start of the next
1178 * of the next grace period to process their callbacks. 1272 * grace period to process their callbacks. This also avoids
1179 */ 1273 * some nasty RCU grace-period initialization races by forcing
1180 rcu_for_each_node_breadth_first(rsp, rnp) { 1274 * the end of the current grace period to be completely recorded in
1181 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1275 * all of the rcu_node structures before the beginning of the next
1182 rnp->completed = rsp->gpnum; 1276 * grace period is recorded in any of the rcu_node structures.
1183 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1277 */
1184 } 1278 rcu_for_each_node_breadth_first(rsp, rnp) {
1185 rnp = rcu_get_root(rsp); 1279 raw_spin_lock_irq(&rnp->lock);
1186 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1280 rnp->completed = rsp->gpnum;
1281 raw_spin_unlock_irq(&rnp->lock);
1282 cond_resched();
1187 } 1283 }
1284 rnp = rcu_get_root(rsp);
1285 raw_spin_lock_irq(&rnp->lock);
1188 1286
1189 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ 1287 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1190 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1288 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1191 rsp->fqs_state = RCU_GP_IDLE; 1289 rsp->fqs_state = RCU_GP_IDLE;
1192 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1290 rdp = this_cpu_ptr(rsp->rda);
1291 if (cpu_needs_another_gp(rsp, rdp))
1292 rsp->gp_flags = 1;
1293 raw_spin_unlock_irq(&rnp->lock);
1294}
1295
1296/*
1297 * Body of kthread that handles grace periods.
1298 */
1299static int __noreturn rcu_gp_kthread(void *arg)
1300{
1301 int fqs_state;
1302 unsigned long j;
1303 int ret;
1304 struct rcu_state *rsp = arg;
1305 struct rcu_node *rnp = rcu_get_root(rsp);
1306
1307 for (;;) {
1308
1309 /* Handle grace-period start. */
1310 for (;;) {
1311 wait_event_interruptible(rsp->gp_wq,
1312 rsp->gp_flags &
1313 RCU_GP_FLAG_INIT);
1314 if ((rsp->gp_flags & RCU_GP_FLAG_INIT) &&
1315 rcu_gp_init(rsp))
1316 break;
1317 cond_resched();
1318 flush_signals(current);
1319 }
1320
1321 /* Handle quiescent-state forcing. */
1322 fqs_state = RCU_SAVE_DYNTICK;
1323 j = jiffies_till_first_fqs;
1324 if (j > HZ) {
1325 j = HZ;
1326 jiffies_till_first_fqs = HZ;
1327 }
1328 for (;;) {
1329 rsp->jiffies_force_qs = jiffies + j;
1330 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1331 (rsp->gp_flags & RCU_GP_FLAG_FQS) ||
1332 (!ACCESS_ONCE(rnp->qsmask) &&
1333 !rcu_preempt_blocked_readers_cgp(rnp)),
1334 j);
1335 /* If grace period done, leave loop. */
1336 if (!ACCESS_ONCE(rnp->qsmask) &&
1337 !rcu_preempt_blocked_readers_cgp(rnp))
1338 break;
1339 /* If time for quiescent-state forcing, do it. */
1340 if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) {
1341 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1342 cond_resched();
1343 } else {
1344 /* Deal with stray signal. */
1345 cond_resched();
1346 flush_signals(current);
1347 }
1348 j = jiffies_till_next_fqs;
1349 if (j > HZ) {
1350 j = HZ;
1351 jiffies_till_next_fqs = HZ;
1352 } else if (j < 1) {
1353 j = 1;
1354 jiffies_till_next_fqs = 1;
1355 }
1356 }
1357
1358 /* Handle grace-period end. */
1359 rcu_gp_cleanup(rsp);
1360 }
1361}
1362
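rcu_gp_kthread() moves grace-period initialization, quiescent-state forcing, and cleanup out of the softirq path and into a per-flavor kthread that sleeps on rsp->gp_wq until a bit is set in rsp->gp_flags (or, in the forcing loop, until the fqs timeout expires). A hedged, self-contained sketch of that wait-for-flag loop in a dummy module; all names are invented, and the real code additionally rechecks its conditions under the root rcu_node lock and copes with stray signals:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/err.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);        /* miniature rsp->gp_wq */
static unsigned long demo_flags;                /* miniature rsp->gp_flags */
#define DEMO_FLAG_WORK 0x1

static struct task_struct *demo_task;

static int demo_kthread(void *arg)
{
        while (!kthread_should_stop()) {
                /* Sleep until asked to work, or for at most one second. */
                wait_event_interruptible_timeout(demo_wq,
                                demo_flags & DEMO_FLAG_WORK, HZ);
                if (demo_flags & DEMO_FLAG_WORK) {
                        demo_flags &= ~DEMO_FLAG_WORK;
                        pr_info("demo_kthread: handled one request\n");
                }
        }
        return 0;
}

/* Requester side, analogous to rcu_start_gp() setting RCU_GP_FLAG_INIT. */
static void demo_request_work(void)
{
        demo_flags |= DEMO_FLAG_WORK;
        wake_up(&demo_wq);
}

static int __init demo_init(void)
{
        demo_task = kthread_run(demo_kthread, NULL, "demo_kthread");
        if (IS_ERR(demo_task))
                return PTR_ERR(demo_task);
        demo_request_work();
        return 0;
}
module_init(demo_init);

static void __exit demo_exit(void)
{
        kthread_stop(demo_task);
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");
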
1363/*
1364 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1365 * in preparation for detecting the next grace period. The caller must hold
1366 * the root node's ->lock, which is released before return. Hard irqs must
1367 * be disabled.
1368 *
1369 * Note that it is legal for a dying CPU (which is marked as offline) to
1370 * invoke this function. This can happen when the dying CPU reports its
1371 * quiescent state.
1372 */
1373static void
1374rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1375 __releases(rcu_get_root(rsp)->lock)
1376{
1377 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1378 struct rcu_node *rnp = rcu_get_root(rsp);
1379
1380 if (!rsp->gp_kthread ||
1381 !cpu_needs_another_gp(rsp, rdp)) {
1382 /*
1383 * Either we have not yet spawned the grace-period
1384 * task, this CPU does not need another grace period,
1385 * or a grace period is already in progress.
1386 * Either way, don't start a new grace period.
1387 */
1388 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1389 return;
1390 }
1391
1392 /*
1393 * Because there is no grace period in progress right now,
1394 * any callbacks we have up to this point will be satisfied
1395 * by the next grace period. So promote all callbacks to be
1396 * handled after the end of the next grace period. If the
1397 * CPU is not yet aware of the end of the previous grace period,
1398 * we need to allow for the callback advancement that will
1399 * occur when it does become aware. Deadlock prevents us from
1400 * making it aware at this point: We cannot acquire a leaf
1401 * rcu_node ->lock while holding the root rcu_node ->lock.
1402 */
1403 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1404 if (rdp->completed == rsp->completed)
1405 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1406
1407 rsp->gp_flags = RCU_GP_FLAG_INIT;
1408 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1409
1410 /* Ensure that CPU is aware of completion of last grace period. */
1411 rcu_process_gp_end(rsp, rdp);
1412 local_irq_restore(flags);
1413
1414 /* Wake up rcu_gp_kthread() to start the grace period. */
1415 wake_up(&rsp->gp_wq);
1416}
1417
1418/*
1419 * Report a full set of quiescent states to the specified rcu_state
1420 * data structure. This involves cleaning up after the prior grace
1421 * period and letting rcu_start_gp() start up the next grace period
1422 * if one is needed. Note that the caller must hold rnp->lock, as
1423 * required by rcu_start_gp(), which will release it.
1424 */
1425static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1426 __releases(rcu_get_root(rsp)->lock)
1427{
1428 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
1429 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
1430 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
1193} 1431}
1194 1432
1195/* 1433/*
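Quiescent-state forcing now alternates between two phases driven from the same kthread: an RCU_SAVE_DYNTICK pass snapshots each holdout CPU's dynticks counter, and subsequent RCU_FORCE_QS passes check whether those counters have advanced (or the CPU has gone offline). A tiny userspace model of the phase alternation; only the state names come from the patch, everything else is invented:

#include <stdio.h>

enum fqs_state { SAVE_DYNTICK, FORCE_QS };  /* after RCU_SAVE_DYNTICK/RCU_FORCE_QS */

static enum fqs_state do_fqs(enum fqs_state state)
{
        if (state == SAVE_DYNTICK) {
                printf("pass 1: snapshot per-CPU dynticks counters\n");
                return FORCE_QS;        /* later passes compare against snapshots */
        }
        printf("pass 2+: report QS for CPUs whose counters advanced\n");
        return state;                   /* stay in FORCE_QS until the GP ends */
}

int main(void)
{
        enum fqs_state state = SAVE_DYNTICK;

        for (int pass = 0; pass < 3; pass++)
                state = do_fqs(state);
        return 0;
}
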
@@ -1258,7 +1496,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1258 * based on quiescent states detected in an earlier grace period! 1496 * based on quiescent states detected in an earlier grace period!
1259 */ 1497 */
1260static void 1498static void
1261rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) 1499rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1262{ 1500{
1263 unsigned long flags; 1501 unsigned long flags;
1264 unsigned long mask; 1502 unsigned long mask;
@@ -1266,7 +1504,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
1266 1504
1267 rnp = rdp->mynode; 1505 rnp = rdp->mynode;
1268 raw_spin_lock_irqsave(&rnp->lock, flags); 1506 raw_spin_lock_irqsave(&rnp->lock, flags);
1269 if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { 1507 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
1508 rnp->completed == rnp->gpnum) {
1270 1509
1271 /* 1510 /*
1272 * The grace period in which this quiescent state was 1511 * The grace period in which this quiescent state was
@@ -1325,7 +1564,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1325 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 1564 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1326 * judge of that). 1565 * judge of that).
1327 */ 1566 */
1328 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); 1567 rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
1329} 1568}
1330 1569
1331#ifdef CONFIG_HOTPLUG_CPU 1570#ifdef CONFIG_HOTPLUG_CPU
@@ -1333,16 +1572,20 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1333/* 1572/*
1334 * Send the specified CPU's RCU callbacks to the orphanage. The 1573 * Send the specified CPU's RCU callbacks to the orphanage. The
1335 * specified CPU must be offline, and the caller must hold the 1574 * specified CPU must be offline, and the caller must hold the
1336 * ->onofflock. 1575 * ->orphan_lock.
1337 */ 1576 */
1338static void 1577static void
1339rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 1578rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1340 struct rcu_node *rnp, struct rcu_data *rdp) 1579 struct rcu_node *rnp, struct rcu_data *rdp)
1341{ 1580{
1581 /* No-CBs CPUs do not have orphanable callbacks. */
1582 if (is_nocb_cpu(rdp->cpu))
1583 return;
1584
1342 /* 1585 /*
1343 * Orphan the callbacks. First adjust the counts. This is safe 1586 * Orphan the callbacks. First adjust the counts. This is safe
1344 * because ->onofflock excludes _rcu_barrier()'s adoption of 1587 * because _rcu_barrier() excludes CPU-hotplug operations, so it
1345 * the callbacks, thus no memory barrier is required. 1588 * cannot be running now. Thus no memory barrier is required.
1346 */ 1589 */
1347 if (rdp->nxtlist != NULL) { 1590 if (rdp->nxtlist != NULL) {
1348 rsp->qlen_lazy += rdp->qlen_lazy; 1591 rsp->qlen_lazy += rdp->qlen_lazy;
@@ -1383,22 +1626,15 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1383 1626
1384/* 1627/*
1385 * Adopt the RCU callbacks from the specified rcu_state structure's 1628 * Adopt the RCU callbacks from the specified rcu_state structure's
1386 * orphanage. The caller must hold the ->onofflock. 1629 * orphanage. The caller must hold the ->orphan_lock.
1387 */ 1630 */
1388static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) 1631static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1389{ 1632{
1390 int i; 1633 int i;
1391 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1634 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1392 1635
1393 /* 1636 /* No-CBs CPUs are handled specially. */
1394 * If there is an rcu_barrier() operation in progress, then 1637 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp))
1395 * only the task doing that operation is permitted to adopt
1396 * callbacks. To do otherwise breaks rcu_barrier() and friends
1397 * by causing them to fail to wait for the callbacks in the
1398 * orphanage.
1399 */
1400 if (rsp->rcu_barrier_in_progress &&
1401 rsp->rcu_barrier_in_progress != current)
1402 return; 1638 return;
1403 1639
1404 /* Do the accounting first. */ 1640 /* Do the accounting first. */
@@ -1455,9 +1691,8 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1455 * The CPU has been completely removed, and some other CPU is reporting 1691 * The CPU has been completely removed, and some other CPU is reporting
1456 * this fact from process context. Do the remainder of the cleanup, 1692 * this fact from process context. Do the remainder of the cleanup,
1457 * including orphaning the outgoing CPU's RCU callbacks, and also 1693 * including orphaning the outgoing CPU's RCU callbacks, and also
1458 * adopting them, if there is no _rcu_barrier() instance running. 1694 * adopting them. There can only be one CPU hotplug operation at a time,
1459 * There can only be one CPU hotplug operation at a time, so no other 1695 * so no other CPU can be attempting to update rcu_cpu_kthread_task.
1460 * CPU can be attempting to update rcu_cpu_kthread_task.
1461 */ 1696 */
1462static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 1697static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1463{ 1698{
@@ -1468,13 +1703,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1468 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 1703 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
1469 1704
1470 /* Adjust any no-longer-needed kthreads. */ 1705 /* Adjust any no-longer-needed kthreads. */
1471 rcu_stop_cpu_kthread(cpu); 1706 rcu_boost_kthread_setaffinity(rnp, -1);
1472 rcu_node_kthread_setaffinity(rnp, -1);
1473 1707
1474 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ 1708 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1475 1709
1476 /* Exclude any attempts to start a new grace period. */ 1710 /* Exclude any attempts to start a new grace period. */
1477 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1711 mutex_lock(&rsp->onoff_mutex);
1712 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
1478 1713
1479 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 1714 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1480 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 1715 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
@@ -1501,10 +1736,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1501 /* 1736 /*
1502 * We still hold the leaf rcu_node structure lock here, and 1737 * We still hold the leaf rcu_node structure lock here, and
1503 * irqs are still disabled. The reason for this subterfuge is 1738 * irqs are still disabled. The reason for this subterfuge is
1504 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1739 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
1505 * held leads to deadlock. 1740 * held leads to deadlock.
1506 */ 1741 */
1507 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1742 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
1508 rnp = rdp->mynode; 1743 rnp = rdp->mynode;
1509 if (need_report & RCU_OFL_TASKS_NORM_GP) 1744 if (need_report & RCU_OFL_TASKS_NORM_GP)
1510 rcu_report_unblock_qs_rnp(rnp, flags); 1745 rcu_report_unblock_qs_rnp(rnp, flags);
@@ -1515,14 +1750,14 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1515 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 1750 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
1516 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 1751 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
1517 cpu, rdp->qlen, rdp->nxtlist); 1752 cpu, rdp->qlen, rdp->nxtlist);
1753 init_callback_list(rdp);
1754 /* Disallow further callbacks on this CPU. */
1755 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
1756 mutex_unlock(&rsp->onoff_mutex);
1518} 1757}
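The hotplug path above now takes onoff_mutex to hold off grace-period initialization, and the narrower orphan_lock while splicing the dead CPU's callbacks onto the orphanage for immediate adoption. The splice itself is ordinary tail-pointer list surgery; a simplified userspace model (invented names, locking omitted):

	#include <stddef.h>

	struct cb {
		struct cb *next;
		void (*func)(struct cb *);
	};

	struct cb_list {
		struct cb *head;
		struct cb **tail;              /* points at the terminating NULL pointer */
	};

	static void cb_list_init(struct cb_list *l)
	{
		l->head = NULL;
		l->tail = &l->head;
	}

	/* Orphan: move everything from the dead CPU's list onto the orphanage. */
	static void orphan_callbacks(struct cb_list *dead_cpu, struct cb_list *orphanage)
	{
		if (!dead_cpu->head)
			return;
		*orphanage->tail = dead_cpu->head;
		orphanage->tail = dead_cpu->tail;
		cb_list_init(dead_cpu);
	}

	/* Adopt: the surviving CPU appends the orphanage to its own list. */
	static void adopt_callbacks(struct cb_list *orphanage, struct cb_list *survivor)
	{
		if (!orphanage->head)
			return;
		*survivor->tail = orphanage->head;
		survivor->tail = orphanage->tail;
		cb_list_init(orphanage);
	}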
1519 1758
1520#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1759#else /* #ifdef CONFIG_HOTPLUG_CPU */
1521 1760
1522static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1523{
1524}
1525
1526static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1761static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1527{ 1762{
1528} 1763}
@@ -1541,7 +1776,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1541{ 1776{
1542 unsigned long flags; 1777 unsigned long flags;
1543 struct rcu_head *next, *list, **tail; 1778 struct rcu_head *next, *list, **tail;
1544 int bl, count, count_lazy, i; 1779 long bl, count, count_lazy;
1780 int i;
1545 1781
1546 /* If no callbacks are ready, just return.*/ 1782 /* If no callbacks are ready, just return.*/
1547 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1783 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -1687,6 +1923,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1687 struct rcu_node *rnp; 1923 struct rcu_node *rnp;
1688 1924
1689 rcu_for_each_leaf_node(rsp, rnp) { 1925 rcu_for_each_leaf_node(rsp, rnp) {
1926 cond_resched();
1690 mask = 0; 1927 mask = 0;
1691 raw_spin_lock_irqsave(&rnp->lock, flags); 1928 raw_spin_lock_irqsave(&rnp->lock, flags);
1692 if (!rcu_gp_in_progress(rsp)) { 1929 if (!rcu_gp_in_progress(rsp)) {
@@ -1723,72 +1960,39 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1723 * Force quiescent states on reluctant CPUs, and also detect which 1960 * Force quiescent states on reluctant CPUs, and also detect which
1724 * CPUs are in dyntick-idle mode. 1961 * CPUs are in dyntick-idle mode.
1725 */ 1962 */
1726static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1963static void force_quiescent_state(struct rcu_state *rsp)
1727{ 1964{
1728 unsigned long flags; 1965 unsigned long flags;
1729 struct rcu_node *rnp = rcu_get_root(rsp); 1966 bool ret;
1730 1967 struct rcu_node *rnp;
1731 trace_rcu_utilization("Start fqs"); 1968 struct rcu_node *rnp_old = NULL;
1732 if (!rcu_gp_in_progress(rsp)) { 1969
1733 trace_rcu_utilization("End fqs"); 1970 /* Funnel through hierarchy to reduce memory contention. */
1734 return; /* No grace period in progress, nothing to force. */ 1971 rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
1735 } 1972 for (; rnp != NULL; rnp = rnp->parent) {
1736 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { 1973 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
1737 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1974 !raw_spin_trylock(&rnp->fqslock);
1738 trace_rcu_utilization("End fqs"); 1975 if (rnp_old != NULL)
1739 return; /* Someone else is already on the job. */ 1976 raw_spin_unlock(&rnp_old->fqslock);
1740 } 1977 if (ret) {
1741 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) 1978 rsp->n_force_qs_lh++;
1742 goto unlock_fqs_ret; /* no emergency and done recently. */ 1979 return;
1743 rsp->n_force_qs++; 1980 }
1744 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1981 rnp_old = rnp;
1745 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1746 if(!rcu_gp_in_progress(rsp)) {
1747 rsp->n_force_qs_ngp++;
1748 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1749 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1750 }
1751 rsp->fqs_active = 1;
1752 switch (rsp->fqs_state) {
1753 case RCU_GP_IDLE:
1754 case RCU_GP_INIT:
1755
1756 break; /* grace period idle or initializing, ignore. */
1757
1758 case RCU_SAVE_DYNTICK:
1759
1760 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1761
1762 /* Record dyntick-idle state. */
1763 force_qs_rnp(rsp, dyntick_save_progress_counter);
1764 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1765 if (rcu_gp_in_progress(rsp))
1766 rsp->fqs_state = RCU_FORCE_QS;
1767 break;
1768
1769 case RCU_FORCE_QS:
1770
1771 /* Check dyntick-idle state, send IPI to laggarts. */
1772 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1773 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1774
1775 /* Leave state in case more forcing is required. */
1776
1777 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1778 break;
1779 } 1982 }
1780 rsp->fqs_active = 0; 1983 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
1781 if (rsp->fqs_need_gp) { 1984
1782 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ 1985 /* Reached the root of the rcu_node tree, acquire lock. */
1783 rsp->fqs_need_gp = 0; 1986 raw_spin_lock_irqsave(&rnp_old->lock, flags);
1784 rcu_start_gp(rsp, flags); /* releases rnp->lock */ 1987 raw_spin_unlock(&rnp_old->fqslock);
1785 trace_rcu_utilization("End fqs"); 1988 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1786 return; 1989 rsp->n_force_qs_lh++;
1990 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
1991 return; /* Someone beat us to it. */
1787 } 1992 }
1788 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1993 rsp->gp_flags |= RCU_GP_FLAG_FQS;
1789unlock_fqs_ret: 1994 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
1790 raw_spin_unlock_irqrestore(&rsp->fqslock, flags); 1995 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
1791 trace_rcu_utilization("End fqs");
1792} 1996}
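The rewritten force_quiescent_state() above funnels from the caller's leaf rcu_node toward the root, holding at most one fqslock at a time and dropping out as soon as either a trylock fails or RCU_GP_FLAG_FQS is already set, so most contending callers give up near the leaves instead of piling onto one global lock. A compact userspace model of that funnel, with an array of mutexes standing in for the rcu_node levels and an atomic flag for the FQS request (all names invented):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	#define LEVELS 3                       /* leaf .. root */

	static pthread_mutex_t funnel[LEVELS] = {
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
	};
	static atomic_int fqs_requested;       /* stands in for RCU_GP_FLAG_FQS */

	/* Returns true if this caller ended up posting the request. */
	static bool funnel_request_fqs(void)
	{
		int held = -1;                 /* level whose lock we currently hold */

		for (int level = 0; level < LEVELS; level++) {
			bool lose = atomic_load(&fqs_requested) ||
				    pthread_mutex_trylock(&funnel[level]) != 0;
			if (held >= 0)
				pthread_mutex_unlock(&funnel[held]);
			if (lose)
				return false;  /* someone else is already on the job */
			held = level;
		}
		/* We hold the root-level lock; post the request exactly once. */
		if (atomic_exchange(&fqs_requested, 1)) {
			pthread_mutex_unlock(&funnel[held]);
			return false;          /* someone beat us to it */
		}
		pthread_mutex_unlock(&funnel[held]);
		return true;
	}

In the kernel the final check and flag update happen under the root rcu_node's ->lock with interrupts disabled, followed by a wake_up() of the grace-period kthread; the atomic exchange here is only a stand-in for that step.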
1793 1997
1794/* 1998/*
@@ -1805,13 +2009,6 @@ __rcu_process_callbacks(struct rcu_state *rsp)
1805 WARN_ON_ONCE(rdp->beenonline == 0); 2009 WARN_ON_ONCE(rdp->beenonline == 0);
1806 2010
1807 /* 2011 /*
1808 * If an RCU GP has gone long enough, go check for dyntick
1809 * idle CPUs and, if needed, send resched IPIs.
1810 */
1811 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1812 force_quiescent_state(rsp, 1);
1813
1814 /*
1815 * Advance callbacks in response to end of earlier grace 2012 * Advance callbacks in response to end of earlier grace
1816 * period that some other CPU ended. 2013 * period that some other CPU ended.
1817 */ 2014 */
@@ -1838,6 +2035,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1838{ 2035{
1839 struct rcu_state *rsp; 2036 struct rcu_state *rsp;
1840 2037
2038 if (cpu_is_offline(smp_processor_id()))
2039 return;
1841 trace_rcu_utilization("Start RCU core"); 2040 trace_rcu_utilization("Start RCU core");
1842 for_each_rcu_flavor(rsp) 2041 for_each_rcu_flavor(rsp)
1843 __rcu_process_callbacks(rsp); 2042 __rcu_process_callbacks(rsp);
@@ -1909,17 +2108,22 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
1909 rdp->blimit = LONG_MAX; 2108 rdp->blimit = LONG_MAX;
1910 if (rsp->n_force_qs == rdp->n_force_qs_snap && 2109 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1911 *rdp->nxttail[RCU_DONE_TAIL] != head) 2110 *rdp->nxttail[RCU_DONE_TAIL] != head)
1912 force_quiescent_state(rsp, 0); 2111 force_quiescent_state(rsp);
1913 rdp->n_force_qs_snap = rsp->n_force_qs; 2112 rdp->n_force_qs_snap = rsp->n_force_qs;
1914 rdp->qlen_last_fqs_check = rdp->qlen; 2113 rdp->qlen_last_fqs_check = rdp->qlen;
1915 } 2114 }
1916 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 2115 }
1917 force_quiescent_state(rsp, 1);
1918} 2116}
1919 2117
2118/*
2119 * Helper function for call_rcu() and friends. The cpu argument will
2120 * normally be -1, indicating "currently running CPU". It may specify
2121 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
2122 * is expected to specify a CPU.
2123 */
1920static void 2124static void
1921__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 2125__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1922 struct rcu_state *rsp, bool lazy) 2126 struct rcu_state *rsp, int cpu, bool lazy)
1923{ 2127{
1924 unsigned long flags; 2128 unsigned long flags;
1925 struct rcu_data *rdp; 2129 struct rcu_data *rdp;
@@ -1929,8 +2133,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1929 head->func = func; 2133 head->func = func;
1930 head->next = NULL; 2134 head->next = NULL;
1931 2135
1932 smp_mb(); /* Ensure RCU update seen before callback registry. */
1933
1934 /* 2136 /*
1935 * Opportunistically note grace-period endings and beginnings. 2137 * Opportunistically note grace-period endings and beginnings.
1936 * Note that we might see a beginning right after we see an 2138 * Note that we might see a beginning right after we see an
@@ -1941,6 +2143,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1941 rdp = this_cpu_ptr(rsp->rda); 2143 rdp = this_cpu_ptr(rsp->rda);
1942 2144
1943 /* Add the callback to our list. */ 2145 /* Add the callback to our list. */
2146 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
2147 int offline;
2148
2149 if (cpu != -1)
2150 rdp = per_cpu_ptr(rsp->rda, cpu);
2151 offline = !__call_rcu_nocb(rdp, head, lazy);
2152 WARN_ON_ONCE(offline);
2153 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2154 local_irq_restore(flags);
2155 return;
2156 }
1944 ACCESS_ONCE(rdp->qlen)++; 2157 ACCESS_ONCE(rdp->qlen)++;
1945 if (lazy) 2158 if (lazy)
1946 rdp->qlen_lazy++; 2159 rdp->qlen_lazy++;
@@ -1966,7 +2179,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1966 */ 2179 */
1967void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2180void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1968{ 2181{
1969 __call_rcu(head, func, &rcu_sched_state, 0); 2182 __call_rcu(head, func, &rcu_sched_state, -1, 0);
1970} 2183}
1971EXPORT_SYMBOL_GPL(call_rcu_sched); 2184EXPORT_SYMBOL_GPL(call_rcu_sched);
1972 2185
@@ -1975,7 +2188,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
1975 */ 2188 */
1976void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2189void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1977{ 2190{
1978 __call_rcu(head, func, &rcu_bh_state, 0); 2191 __call_rcu(head, func, &rcu_bh_state, -1, 0);
1979} 2192}
1980EXPORT_SYMBOL_GPL(call_rcu_bh); 2193EXPORT_SYMBOL_GPL(call_rcu_bh);
1981 2194
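For reference, callers of these primitives embed a struct rcu_head in the object being protected and reclaim it from the callback; the new cpu argument to __call_rcu() is internal plumbing for no-CBs CPUs and _rcu_barrier() and does not change that usage. A typical caller, illustrative only and not part of this patch:

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {
		int data;
		struct rcu_head rh;            /* storage for the deferred-free callback */
	};

	static void foo_reclaim(struct rcu_head *head)
	{
		struct foo *fp = container_of(head, struct foo, rh);

		kfree(fp);                     /* safe: a grace period has elapsed */
	}

	static void foo_retire(struct foo *fp)
	{
		/* Readers may still hold references; free only after a grace period. */
		call_rcu_sched(&fp->rh, foo_reclaim);
	}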
@@ -2011,10 +2224,28 @@ static inline int rcu_blocking_is_gp(void)
2011 * rcu_read_lock_sched(). 2224 * rcu_read_lock_sched().
2012 * 2225 *
2013 * This means that all preempt_disable code sequences, including NMI and 2226 * This means that all preempt_disable code sequences, including NMI and
2014 * hardware-interrupt handlers, in progress on entry will have completed 2227 * non-threaded hardware-interrupt handlers, in progress on entry will
2015 * before this primitive returns. However, this does not guarantee that 2228 * have completed before this primitive returns. However, this does not
2016 * softirq handlers will have completed, since in some kernels, these 2229 * guarantee that softirq handlers will have completed, since in some
2017 * handlers can run in process context, and can block. 2230 * kernels, these handlers can run in process context, and can block.
2231 *
2232 * Note that this guarantee implies further memory-ordering guarantees.
2233 * On systems with more than one CPU, when synchronize_sched() returns,
2234 * each CPU is guaranteed to have executed a full memory barrier since the
2235 * end of its last RCU-sched read-side critical section whose beginning
2236 * preceded the call to synchronize_sched(). In addition, each CPU having
2237 * an RCU read-side critical section that extends beyond the return from
2238 * synchronize_sched() is guaranteed to have executed a full memory barrier
2239 * after the beginning of synchronize_sched() and before the beginning of
2240 * that RCU read-side critical section. Note that these guarantees include
2241 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2242 * that are executing in the kernel.
2243 *
2244 * Furthermore, if CPU A invoked synchronize_sched(), which returned
2245 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2246 * to have executed a full memory barrier during the execution of
2247 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
2248 * again only if the system has more than one CPU).
2018 * 2249 *
2019 * This primitive provides the guarantees made by the (now removed) 2250 * This primitive provides the guarantees made by the (now removed)
2020 * synchronize_kernel() API. In contrast, synchronize_rcu() only 2251 * synchronize_kernel() API. In contrast, synchronize_rcu() only
@@ -2030,7 +2261,10 @@ void synchronize_sched(void)
2030 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 2261 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2031 if (rcu_blocking_is_gp()) 2262 if (rcu_blocking_is_gp())
2032 return; 2263 return;
2033 wait_rcu_gp(call_rcu_sched); 2264 if (rcu_expedited)
2265 synchronize_sched_expedited();
2266 else
2267 wait_rcu_gp(call_rcu_sched);
2034} 2268}
2035EXPORT_SYMBOL_GPL(synchronize_sched); 2269EXPORT_SYMBOL_GPL(synchronize_sched);
2036 2270
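The expanded comment spells out the memory-ordering guarantees; in use, synchronize_sched() is the writer-side wait in the usual publish-then-reclaim pattern, and the new rcu_expedited switch only changes how quickly that wait finishes, not its semantics. An illustrative kernel-style updater, assuming an invented cfg_mutex that serializes writers:

	#include <linux/mutex.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct cfg {
		int threshold;
	};

	static struct cfg __rcu *cur_cfg;      /* readers: rcu_read_lock_sched() + rcu_dereference_sched() */
	static DEFINE_MUTEX(cfg_mutex);        /* serializes updaters */

	static void cfg_replace(struct cfg *newc)
	{
		struct cfg *oldc;

		mutex_lock(&cfg_mutex);
		oldc = rcu_dereference_protected(cur_cfg,
						 lockdep_is_held(&cfg_mutex));
		rcu_assign_pointer(cur_cfg, newc);  /* publish the new version */
		mutex_unlock(&cfg_mutex);
		synchronize_sched();                /* wait out pre-existing readers */
		kfree(oldc);                        /* nobody can still reference it */
	}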
@@ -2042,6 +2276,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
2042 * read-side critical sections have completed. RCU read-side critical 2276 * read-side critical sections have completed. RCU read-side critical
2043 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 2277 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
2044 * and may be nested. 2278 * and may be nested.
2279 *
2280 * See the description of synchronize_sched() for more detailed information
2281 * on memory ordering guarantees.
2045 */ 2282 */
2046void synchronize_rcu_bh(void) 2283void synchronize_rcu_bh(void)
2047{ 2284{
@@ -2051,13 +2288,13 @@ void synchronize_rcu_bh(void)
2051 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 2288 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2052 if (rcu_blocking_is_gp()) 2289 if (rcu_blocking_is_gp())
2053 return; 2290 return;
2054 wait_rcu_gp(call_rcu_bh); 2291 if (rcu_expedited)
2292 synchronize_rcu_bh_expedited();
2293 else
2294 wait_rcu_gp(call_rcu_bh);
2055} 2295}
2056EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 2296EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2057 2297
2058static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
2059static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
2060
2061static int synchronize_sched_expedited_cpu_stop(void *data) 2298static int synchronize_sched_expedited_cpu_stop(void *data)
2062{ 2299{
2063 /* 2300 /*
@@ -2114,10 +2351,32 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2114 */ 2351 */
2115void synchronize_sched_expedited(void) 2352void synchronize_sched_expedited(void)
2116{ 2353{
2117 int firstsnap, s, snap, trycount = 0; 2354 long firstsnap, s, snap;
2355 int trycount = 0;
2356 struct rcu_state *rsp = &rcu_sched_state;
2118 2357
2119 /* Note that atomic_inc_return() implies full memory barrier. */ 2358 /*
2120 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); 2359 * If we are in danger of counter wrap, just do synchronize_sched().
2360 * By allowing sync_sched_expedited_started to advance no more than
2361 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
2362 * that more than 3.5 billion CPUs would be required to force a
2363 * counter wrap on a 32-bit system. Quite a few more CPUs would of
2364 * course be required on a 64-bit system.
2365 */
2366 if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
2367 (ulong)atomic_long_read(&rsp->expedited_done) +
2368 ULONG_MAX / 8)) {
2369 synchronize_sched();
2370 atomic_long_inc(&rsp->expedited_wrap);
2371 return;
2372 }
2373
2374 /*
2375 * Take a ticket. Note that atomic_inc_return() implies a
2376 * full memory barrier.
2377 */
2378 snap = atomic_long_inc_return(&rsp->expedited_start);
2379 firstsnap = snap;
2121 get_online_cpus(); 2380 get_online_cpus();
2122 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2381 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2123 2382
@@ -2129,48 +2388,65 @@ void synchronize_sched_expedited(void)
2129 synchronize_sched_expedited_cpu_stop, 2388 synchronize_sched_expedited_cpu_stop,
2130 NULL) == -EAGAIN) { 2389 NULL) == -EAGAIN) {
2131 put_online_cpus(); 2390 put_online_cpus();
2391 atomic_long_inc(&rsp->expedited_tryfail);
2392
2393 /* Check to see if someone else did our work for us. */
2394 s = atomic_long_read(&rsp->expedited_done);
2395 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2396 /* ensure test happens before caller kfree */
2397 smp_mb__before_atomic_inc(); /* ^^^ */
2398 atomic_long_inc(&rsp->expedited_workdone1);
2399 return;
2400 }
2132 2401
2133 /* No joy, try again later. Or just synchronize_sched(). */ 2402 /* No joy, try again later. Or just synchronize_sched(). */
2134 if (trycount++ < 10) { 2403 if (trycount++ < 10) {
2135 udelay(trycount * num_online_cpus()); 2404 udelay(trycount * num_online_cpus());
2136 } else { 2405 } else {
2137 synchronize_sched(); 2406 wait_rcu_gp(call_rcu_sched);
2407 atomic_long_inc(&rsp->expedited_normal);
2138 return; 2408 return;
2139 } 2409 }
2140 2410
2141 /* Check to see if someone else did our work for us. */ 2411 /* Recheck to see if someone else did our work for us. */
2142 s = atomic_read(&sync_sched_expedited_done); 2412 s = atomic_long_read(&rsp->expedited_done);
2143 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { 2413 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2144 smp_mb(); /* ensure test happens before caller kfree */ 2414 /* ensure test happens before caller kfree */
2415 smp_mb__before_atomic_inc(); /* ^^^ */
2416 atomic_long_inc(&rsp->expedited_workdone2);
2145 return; 2417 return;
2146 } 2418 }
2147 2419
2148 /* 2420 /*
2149 * Refetching sync_sched_expedited_started allows later 2421 * Refetching sync_sched_expedited_started allows later
2150 * callers to piggyback on our grace period. We subtract 2422 * callers to piggyback on our grace period. We retry
2151 * 1 to get the same token that the last incrementer got. 2423 * after they started, so our grace period works for them,
2152 * We retry after they started, so our grace period works 2424 * and they started after our first try, so their grace
2153 * for them, and they started after our first try, so their 2425 * period works for us.
2154 * grace period works for us.
2155 */ 2426 */
2156 get_online_cpus(); 2427 get_online_cpus();
2157 snap = atomic_read(&sync_sched_expedited_started); 2428 snap = atomic_long_read(&rsp->expedited_start);
2158 smp_mb(); /* ensure read is before try_stop_cpus(). */ 2429 smp_mb(); /* ensure read is before try_stop_cpus(). */
2159 } 2430 }
2431 atomic_long_inc(&rsp->expedited_stoppedcpus);
2160 2432
2161 /* 2433 /*
2162 * Everyone up to our most recent fetch is covered by our grace 2434 * Everyone up to our most recent fetch is covered by our grace
2163 * period. Update the counter, but only if our work is still 2435 * period. Update the counter, but only if our work is still
2164 * relevant -- which it won't be if someone who started later 2436 * relevant -- which it won't be if someone who started later
2165 * than we did beat us to the punch. 2437 * than we did already did their update.
2166 */ 2438 */
2167 do { 2439 do {
2168 s = atomic_read(&sync_sched_expedited_done); 2440 atomic_long_inc(&rsp->expedited_done_tries);
2169 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { 2441 s = atomic_long_read(&rsp->expedited_done);
2170 smp_mb(); /* ensure test happens before caller kfree */ 2442 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2443 /* ensure test happens before caller kfree */
2444 smp_mb__before_atomic_inc(); /* ^^^ */
2445 atomic_long_inc(&rsp->expedited_done_lost);
2171 break; 2446 break;
2172 } 2447 }
2173 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); 2448 } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
2449 atomic_long_inc(&rsp->expedited_done_exit);
2174 2450
2175 put_online_cpus(); 2451 put_online_cpus();
2176} 2452}
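The expedited path now runs on two free-running counters in the rcu_state structure: expedited_start hands out tickets and expedited_done records the most recent completed ticket, with the ULONG_MAX/8 test falling back to a normal grace period when the pair threatens to wrap. A userspace model of just the ticket arithmetic, omitting the try_stop_cpus() retry loop (names invented):

	#include <stdatomic.h>
	#include <limits.h>
	#include <stdbool.h>

	static atomic_long start_ticket;       /* ~ rsp->expedited_start */
	static atomic_long done_ticket;        /* ~ rsp->expedited_done  */

	/* True if "a" is at or beyond "b", allowing for counter wrap. */
	static bool ticket_ge(unsigned long a, unsigned long b)
	{
		return ULONG_MAX / 2 >= a - b;
	}

	static void do_full_grace_period(void) { /* stand-in for synchronize_sched() */ }
	static void expedite_all_cpus(void)    { /* stand-in for the stop-CPUs forcing work */ }

	static void expedited_grace_period(void)
	{
		long snap, s;

		/* Counters in danger of wrapping?  Fall back to the slow path. */
		if (ticket_ge((unsigned long)atomic_load(&start_ticket),
			      (unsigned long)atomic_load(&done_ticket) + ULONG_MAX / 8)) {
			do_full_grace_period();
			return;
		}

		snap = atomic_fetch_add(&start_ticket, 1) + 1;   /* take a ticket */

		expedite_all_cpus();

		/*
		 * Advance done_ticket to our ticket, but never move it backwards:
		 * a caller who finished later already covers every earlier ticket.
		 * (The real code also rechecks done_ticket whenever try_stop_cpus()
		 * fails, so concurrent callers can piggyback on our grace period.)
		 */
		do {
			s = atomic_load(&done_ticket);
			if (ticket_ge((unsigned long)s, (unsigned long)snap))
				break;
		} while (!atomic_compare_exchange_weak(&done_ticket, &s, snap));
	}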
@@ -2195,17 +2471,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2195 /* Is the RCU core waiting for a quiescent state from this CPU? */ 2471 /* Is the RCU core waiting for a quiescent state from this CPU? */
2196 if (rcu_scheduler_fully_active && 2472 if (rcu_scheduler_fully_active &&
2197 rdp->qs_pending && !rdp->passed_quiesce) { 2473 rdp->qs_pending && !rdp->passed_quiesce) {
2198
2199 /*
2200 * If force_quiescent_state() coming soon and this CPU
2201 * needs a quiescent state, and this is either RCU-sched
2202 * or RCU-bh, force a local reschedule.
2203 */
2204 rdp->n_rp_qs_pending++; 2474 rdp->n_rp_qs_pending++;
2205 if (!rdp->preemptible &&
2206 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
2207 jiffies))
2208 set_need_resched();
2209 } else if (rdp->qs_pending && rdp->passed_quiesce) { 2475 } else if (rdp->qs_pending && rdp->passed_quiesce) {
2210 rdp->n_rp_report_qs++; 2476 rdp->n_rp_report_qs++;
2211 return 1; 2477 return 1;
@@ -2235,13 +2501,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2235 return 1; 2501 return 1;
2236 } 2502 }
2237 2503
2238 /* Has an RCU GP gone long enough to send resched IPIs &c? */
2239 if (rcu_gp_in_progress(rsp) &&
2240 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
2241 rdp->n_rp_need_fqs++;
2242 return 1;
2243 }
2244
2245 /* nothing to do */ 2504 /* nothing to do */
2246 rdp->n_rp_need_nothing++; 2505 rdp->n_rp_need_nothing++;
2247 return 0; 2506 return 0;
@@ -2326,13 +2585,10 @@ static void rcu_barrier_func(void *type)
2326static void _rcu_barrier(struct rcu_state *rsp) 2585static void _rcu_barrier(struct rcu_state *rsp)
2327{ 2586{
2328 int cpu; 2587 int cpu;
2329 unsigned long flags;
2330 struct rcu_data *rdp; 2588 struct rcu_data *rdp;
2331 struct rcu_data rd;
2332 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); 2589 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
2333 unsigned long snap_done; 2590 unsigned long snap_done;
2334 2591
2335 init_rcu_head_on_stack(&rd.barrier_head);
2336 _rcu_barrier_trace(rsp, "Begin", -1, snap); 2592 _rcu_barrier_trace(rsp, "Begin", -1, snap);
2337 2593
2338 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2594 /* Take mutex to serialize concurrent rcu_barrier() requests. */
@@ -2372,70 +2628,38 @@ static void _rcu_barrier(struct rcu_state *rsp)
2372 /* 2628 /*
2373 * Initialize the count to one rather than to zero in order to 2629 * Initialize the count to one rather than to zero in order to
2374 * avoid a too-soon return to zero in case of a short grace period 2630 * avoid a too-soon return to zero in case of a short grace period
2375 * (or preemption of this task). Also flag this task as doing 2631 * (or preemption of this task). Exclude CPU-hotplug operations
2376 * an rcu_barrier(). This will prevent anyone else from adopting 2632 * to ensure that no offline CPU has callbacks queued.
2377 * orphaned callbacks, which could cause otherwise failure if a
2378 * CPU went offline and quickly came back online. To see this,
2379 * consider the following sequence of events:
2380 *
2381 * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
2382 * 2. CPU 1 goes offline, orphaning its callbacks.
2383 * 3. CPU 0 adopts CPU 1's orphaned callbacks.
2384 * 4. CPU 1 comes back online.
2385 * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
2386 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2387 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2388 */ 2633 */
2389 init_completion(&rsp->barrier_completion); 2634 init_completion(&rsp->barrier_completion);
2390 atomic_set(&rsp->barrier_cpu_count, 1); 2635 atomic_set(&rsp->barrier_cpu_count, 1);
2391 raw_spin_lock_irqsave(&rsp->onofflock, flags); 2636 get_online_cpus();
2392 rsp->rcu_barrier_in_progress = current;
2393 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2394 2637
2395 /* 2638 /*
2396 * Force every CPU with callbacks to register a new callback 2639 * Force each CPU with callbacks to register a new callback.
2397 * that will tell us when all the preceding callbacks have 2640 * When that callback is invoked, we will know that all of the
2398 * been invoked. If an offline CPU has callbacks, wait for 2641 * corresponding CPU's preceding callbacks have been invoked.
2399 * it to either come back online or to finish orphaning those
2400 * callbacks.
2401 */ 2642 */
2402 for_each_possible_cpu(cpu) { 2643 for_each_possible_cpu(cpu) {
2403 preempt_disable(); 2644 if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
2645 continue;
2404 rdp = per_cpu_ptr(rsp->rda, cpu); 2646 rdp = per_cpu_ptr(rsp->rda, cpu);
2405 if (cpu_is_offline(cpu)) { 2647 if (is_nocb_cpu(cpu)) {
2406 _rcu_barrier_trace(rsp, "Offline", cpu, 2648 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2407 rsp->n_barrier_done); 2649 rsp->n_barrier_done);
2408 preempt_enable(); 2650 atomic_inc(&rsp->barrier_cpu_count);
2409 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) 2651 __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
2410 schedule_timeout_interruptible(1); 2652 rsp, cpu, 0);
2411 } else if (ACCESS_ONCE(rdp->qlen)) { 2653 } else if (ACCESS_ONCE(rdp->qlen)) {
2412 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 2654 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2413 rsp->n_barrier_done); 2655 rsp->n_barrier_done);
2414 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 2656 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
2415 preempt_enable();
2416 } else { 2657 } else {
2417 _rcu_barrier_trace(rsp, "OnlineNQ", cpu, 2658 _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
2418 rsp->n_barrier_done); 2659 rsp->n_barrier_done);
2419 preempt_enable();
2420 } 2660 }
2421 } 2661 }
2422 2662 put_online_cpus();
2423 /*
2424 * Now that all online CPUs have rcu_barrier_callback() callbacks
2425 * posted, we can adopt all of the orphaned callbacks and place
2426 * an rcu_barrier_callback() callback after them. When that is done,
2427 * we are guaranteed to have an rcu_barrier_callback() callback
2428 * following every callback that could possibly have been
2429 * registered before _rcu_barrier() was called.
2430 */
2431 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2432 rcu_adopt_orphan_cbs(rsp);
2433 rsp->rcu_barrier_in_progress = NULL;
2434 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2435 atomic_inc(&rsp->barrier_cpu_count);
2436 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2437 rd.rsp = rsp;
2438 rsp->call(&rd.barrier_head, rcu_barrier_callback);
2439 2663
2440 /* 2664 /*
2441 * Now that we have an rcu_barrier_callback() callback on each 2665 * Now that we have an rcu_barrier_callback() callback on each
@@ -2456,8 +2680,6 @@ static void _rcu_barrier(struct rcu_state *rsp)
2456 2680
2457 /* Other rcu_barrier() invocations can now safely proceed. */ 2681 /* Other rcu_barrier() invocations can now safely proceed. */
2458 mutex_unlock(&rsp->barrier_mutex); 2682 mutex_unlock(&rsp->barrier_mutex);
2459
2460 destroy_rcu_head_on_stack(&rd.barrier_head);
2461} 2683}
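With the orphanage dance removed, _rcu_barrier() is a plain counting barrier: the count starts at one so it cannot reach zero before every CPU's barrier callback has been posted, each posted callback adds one, and the initial reference is dropped last. A userspace model of that count-starts-at-one idiom (invented names):

	#include <pthread.h>
	#include <stdatomic.h>

	static atomic_int barrier_count;
	static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  done_cond = PTHREAD_COND_INITIALIZER;
	static int done;

	/* Runs once per posted callback, when that callback is finally invoked. */
	static void barrier_callback(void)
	{
		if (atomic_fetch_sub(&barrier_count, 1) == 1) {  /* last one out */
			pthread_mutex_lock(&done_lock);
			done = 1;
			pthread_cond_signal(&done_cond);
			pthread_mutex_unlock(&done_lock);
		}
	}

	static void wait_for_all_callbacks(int ncpus_with_callbacks)
	{
		done = 0;
		atomic_store(&barrier_count, 1);        /* extra reference: no early zero */

		for (int cpu = 0; cpu < ncpus_with_callbacks; cpu++) {
			atomic_fetch_add(&barrier_count, 1);
			/* ...post a barrier_callback() behind that CPU's callback queue... */
		}

		barrier_callback();                     /* drop the initial reference */

		pthread_mutex_lock(&done_lock);
		while (!done)
			pthread_cond_wait(&done_cond, &done_lock);
		pthread_mutex_unlock(&done_lock);
	}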
2462 2684
2463/** 2685/**
@@ -2497,8 +2719,12 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2497 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2719 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2498 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 2720 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2499 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2721 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2722#ifdef CONFIG_RCU_USER_QS
2723 WARN_ON_ONCE(rdp->dynticks->in_user);
2724#endif
2500 rdp->cpu = cpu; 2725 rdp->cpu = cpu;
2501 rdp->rsp = rsp; 2726 rdp->rsp = rsp;
2727 rcu_boot_init_nocb_percpu_data(rdp);
2502 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2728 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2503} 2729}
2504 2730
@@ -2516,6 +2742,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2516 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2742 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2517 struct rcu_node *rnp = rcu_get_root(rsp); 2743 struct rcu_node *rnp = rcu_get_root(rsp);
2518 2744
2745 /* Exclude new grace periods. */
2746 mutex_lock(&rsp->onoff_mutex);
2747
2519 /* Set up local state, ensuring consistent view of global state. */ 2748 /* Set up local state, ensuring consistent view of global state. */
2520 raw_spin_lock_irqsave(&rnp->lock, flags); 2749 raw_spin_lock_irqsave(&rnp->lock, flags);
2521 rdp->beenonline = 1; /* We have now been online. */ 2750 rdp->beenonline = 1; /* We have now been online. */
@@ -2523,20 +2752,13 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2523 rdp->qlen_last_fqs_check = 0; 2752 rdp->qlen_last_fqs_check = 0;
2524 rdp->n_force_qs_snap = rsp->n_force_qs; 2753 rdp->n_force_qs_snap = rsp->n_force_qs;
2525 rdp->blimit = blimit; 2754 rdp->blimit = blimit;
2755 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
2526 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 2756 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2527 atomic_set(&rdp->dynticks->dynticks, 2757 atomic_set(&rdp->dynticks->dynticks,
2528 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2758 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2529 rcu_prepare_for_idle_init(cpu); 2759 rcu_prepare_for_idle_init(cpu);
2530 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2760 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2531 2761
2532 /*
2533 * A new grace period might start here. If so, we won't be part
2534 * of it, but that is OK, as we are currently in a quiescent state.
2535 */
2536
2537 /* Exclude any attempts to start a new GP on large systems. */
2538 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
2539
2540 /* Add CPU to rcu_node bitmasks. */ 2762 /* Add CPU to rcu_node bitmasks. */
2541 rnp = rdp->mynode; 2763 rnp = rdp->mynode;
2542 mask = rdp->grpmask; 2764 mask = rdp->grpmask;
@@ -2555,14 +2777,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2555 rdp->completed = rnp->completed; 2777 rdp->completed = rnp->completed;
2556 rdp->passed_quiesce = 0; 2778 rdp->passed_quiesce = 0;
2557 rdp->qs_pending = 0; 2779 rdp->qs_pending = 0;
2558 rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
2559 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); 2780 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
2560 } 2781 }
2561 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2782 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2562 rnp = rnp->parent; 2783 rnp = rnp->parent;
2563 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 2784 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
2785 local_irq_restore(flags);
2564 2786
2565 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2787 mutex_unlock(&rsp->onoff_mutex);
2566} 2788}
2567 2789
2568static void __cpuinit rcu_prepare_cpu(int cpu) 2790static void __cpuinit rcu_prepare_cpu(int cpu)
@@ -2584,6 +2806,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2584 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2806 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2585 struct rcu_node *rnp = rdp->mynode; 2807 struct rcu_node *rnp = rdp->mynode;
2586 struct rcu_state *rsp; 2808 struct rcu_state *rsp;
2809 int ret = NOTIFY_OK;
2587 2810
2588 trace_rcu_utilization("Start CPU hotplug"); 2811 trace_rcu_utilization("Start CPU hotplug");
2589 switch (action) { 2812 switch (action) {
@@ -2594,12 +2817,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2594 break; 2817 break;
2595 case CPU_ONLINE: 2818 case CPU_ONLINE:
2596 case CPU_DOWN_FAILED: 2819 case CPU_DOWN_FAILED:
2597 rcu_node_kthread_setaffinity(rnp, -1); 2820 rcu_boost_kthread_setaffinity(rnp, -1);
2598 rcu_cpu_kthread_setrt(cpu, 1);
2599 break; 2821 break;
2600 case CPU_DOWN_PREPARE: 2822 case CPU_DOWN_PREPARE:
2601 rcu_node_kthread_setaffinity(rnp, cpu); 2823 if (nocb_cpu_expendable(cpu))
2602 rcu_cpu_kthread_setrt(cpu, 0); 2824 rcu_boost_kthread_setaffinity(rnp, cpu);
2825 else
2826 ret = NOTIFY_BAD;
2603 break; 2827 break;
2604 case CPU_DYING: 2828 case CPU_DYING:
2605 case CPU_DYING_FROZEN: 2829 case CPU_DYING_FROZEN:
@@ -2623,8 +2847,31 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2623 break; 2847 break;
2624 } 2848 }
2625 trace_rcu_utilization("End CPU hotplug"); 2849 trace_rcu_utilization("End CPU hotplug");
2626 return NOTIFY_OK; 2850 return ret;
2851}
2852
2853/*
2854 * Spawn the kthread that handles this RCU flavor's grace periods.
2855 */
2856static int __init rcu_spawn_gp_kthread(void)
2857{
2858 unsigned long flags;
2859 struct rcu_node *rnp;
2860 struct rcu_state *rsp;
2861 struct task_struct *t;
2862
2863 for_each_rcu_flavor(rsp) {
2864 t = kthread_run(rcu_gp_kthread, rsp, rsp->name);
2865 BUG_ON(IS_ERR(t));
2866 rnp = rcu_get_root(rsp);
2867 raw_spin_lock_irqsave(&rnp->lock, flags);
2868 rsp->gp_kthread = t;
2869 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2870 rcu_spawn_nocb_kthreads(rsp);
2871 }
2872 return 0;
2627} 2873}
2874early_initcall(rcu_spawn_gp_kthread);
2628 2875
2629/* 2876/*
2630 * This function is invoked towards the end of the scheduler's initialization 2877 * This function is invoked towards the end of the scheduler's initialization
@@ -2661,7 +2908,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2661 int cprv; 2908 int cprv;
2662 int i; 2909 int i;
2663 2910
2664 cprv = NR_CPUS; 2911 cprv = nr_cpu_ids;
2665 for (i = rcu_num_lvls - 1; i >= 0; i--) { 2912 for (i = rcu_num_lvls - 1; i >= 0; i--) {
2666 ccur = rsp->levelcnt[i]; 2913 ccur = rsp->levelcnt[i];
2667 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 2914 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
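rcu_init_levelspread() now sizes the tree from nr_cpu_ids instead of NR_CPUS; each level's spread is simply a ceiling division of the CPU (or child) count of the level below. A quick standalone check of that arithmetic, with made-up level counts:

	#include <stdio.h>

	int main(void)
	{
		int levelcnt[] = { 1, 4, 64 };   /* root, middle, leaves: example values only */
		int nlevels = 3;
		int nr_cpu_ids = 4096;           /* pretend count of possible CPUs */
		int levelspread[3];
		int cprv = nr_cpu_ids;

		for (int i = nlevels - 1; i >= 0; i--) {
			int ccur = levelcnt[i];

			levelspread[i] = (cprv + ccur - 1) / ccur;   /* ceil(cprv / ccur) */
			cprv = ccur;
		}
		for (int i = 0; i < nlevels; i++)
			printf("level %d: spread %d\n", i, levelspread[i]);
		return 0;
	}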
@@ -2676,10 +2923,14 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2676static void __init rcu_init_one(struct rcu_state *rsp, 2923static void __init rcu_init_one(struct rcu_state *rsp,
2677 struct rcu_data __percpu *rda) 2924 struct rcu_data __percpu *rda)
2678{ 2925{
2679 static char *buf[] = { "rcu_node_level_0", 2926 static char *buf[] = { "rcu_node_0",
2680 "rcu_node_level_1", 2927 "rcu_node_1",
2681 "rcu_node_level_2", 2928 "rcu_node_2",
2682 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ 2929 "rcu_node_3" }; /* Match MAX_RCU_LVLS */
2930 static char *fqs[] = { "rcu_node_fqs_0",
2931 "rcu_node_fqs_1",
2932 "rcu_node_fqs_2",
2933 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
2683 int cpustride = 1; 2934 int cpustride = 1;
2684 int i; 2935 int i;
2685 int j; 2936 int j;
@@ -2704,7 +2955,11 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2704 raw_spin_lock_init(&rnp->lock); 2955 raw_spin_lock_init(&rnp->lock);
2705 lockdep_set_class_and_name(&rnp->lock, 2956 lockdep_set_class_and_name(&rnp->lock,
2706 &rcu_node_class[i], buf[i]); 2957 &rcu_node_class[i], buf[i]);
2707 rnp->gpnum = 0; 2958 raw_spin_lock_init(&rnp->fqslock);
2959 lockdep_set_class_and_name(&rnp->fqslock,
2960 &rcu_fqs_class[i], fqs[i]);
2961 rnp->gpnum = rsp->gpnum;
2962 rnp->completed = rsp->completed;
2708 rnp->qsmask = 0; 2963 rnp->qsmask = 0;
2709 rnp->qsmaskinit = 0; 2964 rnp->qsmaskinit = 0;
2710 rnp->grplo = j * cpustride; 2965 rnp->grplo = j * cpustride;
@@ -2727,6 +2982,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2727 } 2982 }
2728 2983
2729 rsp->rda = rda; 2984 rsp->rda = rda;
2985 init_waitqueue_head(&rsp->gp_wq);
2730 rnp = rsp->level[rcu_num_lvls - 1]; 2986 rnp = rsp->level[rcu_num_lvls - 1];
2731 for_each_possible_cpu(i) { 2987 for_each_possible_cpu(i) {
2732 while (i > rnp->grphi) 2988 while (i > rnp->grphi)
@@ -2750,7 +3006,8 @@ static void __init rcu_init_geometry(void)
2750 int rcu_capacity[MAX_RCU_LVLS + 1]; 3006 int rcu_capacity[MAX_RCU_LVLS + 1];
2751 3007
2752 /* If the compile-time values are accurate, just leave. */ 3008 /* If the compile-time values are accurate, just leave. */
2753 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) 3009 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
3010 nr_cpu_ids == NR_CPUS)
2754 return; 3011 return;
2755 3012
2756 /* 3013 /*
@@ -2806,6 +3063,7 @@ void __init rcu_init(void)
2806 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3063 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
2807 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3064 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
2808 __rcu_init_preempt(); 3065 __rcu_init_preempt();
3066 rcu_init_nocb();
2809 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3067 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
2810 3068
2811 /* 3069 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4d29169f2124..4b69291b093d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -102,6 +102,10 @@ struct rcu_dynticks {
102 /* idle-period nonlazy_posted snapshot. */ 102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105#ifdef CONFIG_RCU_USER_QS
106 bool ignore_user_qs; /* Treat userspace as extended QS or not */
107 bool in_user; /* Is the CPU in userland from RCU POV? */
108#endif
105}; 109};
106 110
107/* RCU's kthread states for tracing. */ 111/* RCU's kthread states for tracing. */
@@ -196,12 +200,7 @@ struct rcu_node {
196 /* Refused to boost: not sure why, though. */ 200 /* Refused to boost: not sure why, though. */
197 /* This can happen due to race conditions. */ 201 /* This can happen due to race conditions. */
198#endif /* #ifdef CONFIG_RCU_BOOST */ 202#endif /* #ifdef CONFIG_RCU_BOOST */
199 struct task_struct *node_kthread_task; 203 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
200 /* kthread that takes care of this rcu_node */
201 /* structure, for example, awakening the */
202 /* per-CPU kthreads as needed. */
203 unsigned int node_kthread_status;
204 /* State of node_kthread_task for tracing. */
205} ____cacheline_internodealigned_in_smp; 204} ____cacheline_internodealigned_in_smp;
206 205
207/* 206/*
@@ -245,8 +244,6 @@ struct rcu_data {
245 /* in order to detect GP end. */ 244 /* in order to detect GP end. */
246 unsigned long gpnum; /* Highest gp number that this CPU */ 245 unsigned long gpnum; /* Highest gp number that this CPU */
247 /* is aware of having started. */ 246 /* is aware of having started. */
248 unsigned long passed_quiesce_gpnum;
249 /* gpnum at time of quiescent state. */
250 bool passed_quiesce; /* User-mode/idle loop etc. */ 247 bool passed_quiesce; /* User-mode/idle loop etc. */
251 bool qs_pending; /* Core waits for quiesc state. */ 248 bool qs_pending; /* Core waits for quiesc state. */
252 bool beenonline; /* CPU online at least once. */ 249 bool beenonline; /* CPU online at least once. */
@@ -290,6 +287,7 @@ struct rcu_data {
290 long qlen_last_fqs_check; 287 long qlen_last_fqs_check;
291 /* qlen at last check for QS forcing */ 288 /* qlen at last check for QS forcing */
292 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 289 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
290 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
293 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ 291 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
294 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ 292 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
295 unsigned long n_force_qs_snap; 293 unsigned long n_force_qs_snap;
@@ -312,11 +310,25 @@ struct rcu_data {
312 unsigned long n_rp_cpu_needs_gp; 310 unsigned long n_rp_cpu_needs_gp;
313 unsigned long n_rp_gp_completed; 311 unsigned long n_rp_gp_completed;
314 unsigned long n_rp_gp_started; 312 unsigned long n_rp_gp_started;
315 unsigned long n_rp_need_fqs;
316 unsigned long n_rp_need_nothing; 313 unsigned long n_rp_need_nothing;
317 314
318 /* 6) _rcu_barrier() callback. */ 315 /* 6) _rcu_barrier() and OOM callbacks. */
319 struct rcu_head barrier_head; 316 struct rcu_head barrier_head;
317#ifdef CONFIG_RCU_FAST_NO_HZ
318 struct rcu_head oom_head;
319#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
320
321 /* 7) Callback offloading. */
322#ifdef CONFIG_RCU_NOCB_CPU
323 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
324 struct rcu_head **nocb_tail;
325 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
326 atomic_long_t nocb_q_count_lazy; /* (approximate). */
327 int nocb_p_count; /* # CBs being invoked by kthread */
328 int nocb_p_count_lazy; /* (approximate). */
329 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
330 struct task_struct *nocb_kthread;
331#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
320 332
321 int cpu; 333 int cpu;
322 struct rcu_state *rsp; 334 struct rcu_state *rsp;
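The new CONFIG_RCU_NOCB_CPU fields give an offloaded CPU a private callback list (nocb_head/nocb_tail), approximate length counters, and a dedicated kthread that sleeps on nocb_wq and drains the list; call_rcu() on such a CPU appends here instead of to the normal nxtlist. A userspace sketch of that producer/consumer handoff, with a mutex standing in for the kernel's lighter-weight synchronization (names invented):

	#include <pthread.h>
	#include <stddef.h>

	struct cb {
		struct cb *next;
		void (*func)(struct cb *);
	};

	struct nocb_queue {
		struct cb *head;
		struct cb **tail;
		long qcount;                    /* approximate, like nocb_q_count */
		pthread_mutex_t lock;
		pthread_cond_t wq;              /* stands in for nocb_wq */
	};

	static void nocb_init(struct nocb_queue *q)
	{
		q->head = NULL;
		q->tail = &q->head;
		q->qcount = 0;
		pthread_mutex_init(&q->lock, NULL);
		pthread_cond_init(&q->wq, NULL);
	}

	/* call_rcu() path on an offloaded CPU: enqueue and kick the kthread. */
	static void nocb_enqueue(struct nocb_queue *q, struct cb *new)
	{
		new->next = NULL;
		pthread_mutex_lock(&q->lock);
		*q->tail = new;
		q->tail = &new->next;
		q->qcount++;
		pthread_mutex_unlock(&q->lock);
		pthread_cond_signal(&q->wq);
	}

	/* Offload kthread: wait for work, detach the whole list, invoke it. */
	static void *nocb_kthread(void *arg)
	{
		struct nocb_queue *q = arg;

		for (;;) {
			struct cb *list;

			pthread_mutex_lock(&q->lock);
			while (!q->head)
				pthread_cond_wait(&q->wq, &q->lock);
			list = q->head;
			q->head = NULL;
			q->tail = &q->head;
			q->qcount = 0;
			pthread_mutex_unlock(&q->lock);

			/* In the kernel, a grace period is waited for here first. */
			while (list) {
				struct cb *next = list->next;

				list->func(list);
				list = next;
			}
		}
		return NULL;
	}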
@@ -370,26 +382,28 @@ struct rcu_state {
370 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 382 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
371 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 383 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
372 void (*func)(struct rcu_head *head)); 384 void (*func)(struct rcu_head *head));
385#ifdef CONFIG_RCU_NOCB_CPU
386 void (*call_remote)(struct rcu_head *head,
387 void (*func)(struct rcu_head *head));
388 /* call_rcu() flavor, but for */
389 /* placing on remote CPU. */
390#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
373 391
374 /* The following fields are guarded by the root rcu_node's lock. */ 392 /* The following fields are guarded by the root rcu_node's lock. */
375 393
376 u8 fqs_state ____cacheline_internodealigned_in_smp; 394 u8 fqs_state ____cacheline_internodealigned_in_smp;
377 /* Force QS state. */ 395 /* Force QS state. */
378 u8 fqs_active; /* force_quiescent_state() */
379 /* is running. */
380 u8 fqs_need_gp; /* A CPU was prevented from */
381 /* starting a new grace */
382 /* period because */
383 /* force_quiescent_state() */
384 /* was running. */
385 u8 boost; /* Subject to priority boost. */ 396 u8 boost; /* Subject to priority boost. */
386 unsigned long gpnum; /* Current gp number. */ 397 unsigned long gpnum; /* Current gp number. */
387 unsigned long completed; /* # of last completed gp. */ 398 unsigned long completed; /* # of last completed gp. */
399 struct task_struct *gp_kthread; /* Task for grace periods. */
400 wait_queue_head_t gp_wq; /* Where GP task waits. */
401 int gp_flags; /* Commands for GP task. */
388 402
389 /* End of fields guarded by root rcu_node's lock. */ 403 /* End of fields guarded by root rcu_node's lock. */
390 404
391 raw_spinlock_t onofflock; /* exclude on/offline and */ 405 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
392 /* starting new GP. */ 406 /* Protect following fields. */
393 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 407 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
394 /* need a grace period. */ 408 /* need a grace period. */
395 struct rcu_head **orphan_nxttail; /* Tail of above. */ 409 struct rcu_head **orphan_nxttail; /* Tail of above. */
@@ -398,16 +412,29 @@ struct rcu_state {
398 struct rcu_head **orphan_donetail; /* Tail of above. */ 412 struct rcu_head **orphan_donetail; /* Tail of above. */
399 long qlen_lazy; /* Number of lazy callbacks. */ 413 long qlen_lazy; /* Number of lazy callbacks. */
400 long qlen; /* Total number of callbacks. */ 414 long qlen; /* Total number of callbacks. */
401 struct task_struct *rcu_barrier_in_progress; 415 /* End of fields guarded by orphan_lock. */
402 /* Task doing rcu_barrier(), */ 416
403 /* or NULL if no barrier. */ 417 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
418
404 struct mutex barrier_mutex; /* Guards barrier fields. */ 419 struct mutex barrier_mutex; /* Guards barrier fields. */
405 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 420 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
406 struct completion barrier_completion; /* Wake at barrier end. */ 421 struct completion barrier_completion; /* Wake at barrier end. */
407 unsigned long n_barrier_done; /* ++ at start and end of */ 422 unsigned long n_barrier_done; /* ++ at start and end of */
408 /* _rcu_barrier(). */ 423 /* _rcu_barrier(). */
409 raw_spinlock_t fqslock; /* Only one task forcing */ 424 /* End of fields guarded by barrier_mutex. */
410 /* quiescent states. */ 425
426 atomic_long_t expedited_start; /* Starting ticket. */
427 atomic_long_t expedited_done; /* Done ticket. */
428 atomic_long_t expedited_wrap; /* # near-wrap incidents. */
429 atomic_long_t expedited_tryfail; /* # acquisition failures. */
430 atomic_long_t expedited_workdone1; /* # done by others #1. */
431 atomic_long_t expedited_workdone2; /* # done by others #2. */
432 atomic_long_t expedited_normal; /* # fallbacks to normal. */
433 atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
434 atomic_long_t expedited_done_tries; /* # tries to update _done. */
435 atomic_long_t expedited_done_lost; /* # times beaten to _done. */
436 atomic_long_t expedited_done_exit; /* # times exited _done loop. */
437
411 unsigned long jiffies_force_qs; /* Time at which to invoke */ 438 unsigned long jiffies_force_qs; /* Time at which to invoke */
412 /* force_quiescent_state(). */ 439 /* force_quiescent_state(). */
413 unsigned long n_force_qs; /* Number of calls to */ 440 unsigned long n_force_qs; /* Number of calls to */
@@ -426,7 +453,13 @@ struct rcu_state {
426 struct list_head flavors; /* List of RCU flavors. */ 453 struct list_head flavors; /* List of RCU flavors. */
427}; 454};
428 455
456/* Values for rcu_state structure's gp_flags field. */
457#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
458#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
459
429extern struct list_head rcu_struct_flavors; 460extern struct list_head rcu_struct_flavors;
461
462/* Sequence through rcu_state structures for each RCU flavor. */
430#define for_each_rcu_flavor(rsp) \ 463#define for_each_rcu_flavor(rsp) \
431 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 464 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
432 465
@@ -468,7 +501,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
468#ifdef CONFIG_HOTPLUG_CPU 501#ifdef CONFIG_HOTPLUG_CPU
469static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 502static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
470 unsigned long flags); 503 unsigned long flags);
471static void rcu_stop_cpu_kthread(int cpu);
472#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 504#endif /* #ifdef CONFIG_HOTPLUG_CPU */
473static void rcu_print_detail_task_stall(struct rcu_state *rsp); 505static void rcu_print_detail_task_stall(struct rcu_state *rsp);
474static int rcu_print_task_stall(struct rcu_node *rnp); 506static int rcu_print_task_stall(struct rcu_node *rnp);
@@ -491,15 +523,9 @@ static void invoke_rcu_callbacks_kthread(void);
491static bool rcu_is_callbacks_kthread(void); 523static bool rcu_is_callbacks_kthread(void);
492#ifdef CONFIG_RCU_BOOST 524#ifdef CONFIG_RCU_BOOST
493static void rcu_preempt_do_callbacks(void); 525static void rcu_preempt_do_callbacks(void);
494static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
495 cpumask_var_t cm);
496static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 526static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
497 struct rcu_node *rnp, 527 struct rcu_node *rnp);
498 int rnp_index);
499static void invoke_rcu_node_kthread(struct rcu_node *rnp);
500static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
501#endif /* #ifdef CONFIG_RCU_BOOST */ 528#endif /* #ifdef CONFIG_RCU_BOOST */
502static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
503static void __cpuinit rcu_prepare_kthreads(int cpu); 529static void __cpuinit rcu_prepare_kthreads(int cpu);
504static void rcu_prepare_for_idle_init(int cpu); 530static void rcu_prepare_for_idle_init(int cpu);
505static void rcu_cleanup_after_idle(int cpu); 531static void rcu_cleanup_after_idle(int cpu);
@@ -510,5 +536,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
510static void print_cpu_stall_info_end(void); 536static void print_cpu_stall_info_end(void);
511static void zero_cpu_stall_ticks(struct rcu_data *rdp); 537static void zero_cpu_stall_ticks(struct rcu_data *rdp);
512static void increment_cpu_stall_ticks(void); 538static void increment_cpu_stall_ticks(void);
539static bool is_nocb_cpu(int cpu);
540static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
541 bool lazy);
542static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
543 struct rcu_data *rdp);
544static bool nocb_cpu_expendable(int cpu);
545static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
546static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
547static void init_nocb_callback_list(struct rcu_data *rdp);
548static void __init rcu_init_nocb(void);
513 549
514#endif /* #ifndef RCU_TREE_NONCORE */ 550#endif /* #ifndef RCU_TREE_NONCORE */
551
552#ifdef CONFIG_RCU_TRACE
553#ifdef CONFIG_RCU_NOCB_CPU
554/* Sum up queue lengths for tracing. */
555static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
556{
557 *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
558 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
559}
560#else /* #ifdef CONFIG_RCU_NOCB_CPU */
561static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
562{
563 *ql = 0;
564 *qll = 0;
565}
566#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
567#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 7f3244c0df01..f6e5ec2932b4 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,9 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/gfp.h>
29#include <linux/oom.h>
30#include <linux/smpboot.h>
28 31
29#define RCU_KTHREAD_PRIO 1 32#define RCU_KTHREAD_PRIO 1
30 33
@@ -34,6 +37,14 @@
34#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 37#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
35#endif 38#endif
36 39
40#ifdef CONFIG_RCU_NOCB_CPU
41static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
42static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
43static bool rcu_nocb_poll; /* Offload kthread are to poll. */
44module_param(rcu_nocb_poll, bool, 0444);
45static char __initdata nocb_buf[NR_CPUS * 5];
46#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
47
37/* 48/*
38 * Check the RCU kernel configuration parameters and print informative 49 * Check the RCU kernel configuration parameters and print informative
39 * messages about anything out of the ordinary. If you like #ifdef, you 50 * messages about anything out of the ordinary. If you like #ifdef, you
@@ -74,6 +85,18 @@ static void __init rcu_bootup_announce_oddness(void)
74 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 85 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
75 if (nr_cpu_ids != NR_CPUS) 86 if (nr_cpu_ids != NR_CPUS)
76 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU
89 if (have_rcu_nocb_mask) {
90 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
91 cpumask_clear_cpu(0, rcu_nocb_mask);
92 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
93 }
94 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
95 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
96 if (rcu_nocb_poll)
97 pr_info("\tExperimental polled no-CBs CPUs.\n");
98 }
99#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
77} 100}
78 101
79#ifdef CONFIG_TREE_PREEMPT_RCU 102#ifdef CONFIG_TREE_PREEMPT_RCU
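
The cpulist_scnprintf() call above (and the matching cpulist_parse() in rcu_nocb_setup() further down in this patch) are the standard kernel helpers for converting between a cpumask and the human-readable "1-3,5" list form. A hedged, self-contained sketch of the round trip; the demo_cpulist() initcall and the literal list are invented for illustration:

#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/printk.h>
#include <linux/slab.h>

static int __init demo_cpulist(void)
{
	cpumask_var_t mask;
	char buf[64];

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	if (cpulist_parse("1-3", mask))		/* as rcu_nocb_setup() does */
		pr_warn("demo: bad cpu list\n");
	cpulist_scnprintf(buf, sizeof(buf), mask); /* as the announce code does */
	pr_info("demo mask: %s\n", buf);	/* "demo mask: 1-3" given >= 4 CPUs */
	free_cpumask_var(mask);
	return 0;
}
early_initcall(demo_cpulist);
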
@@ -118,7 +141,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
118 */ 141 */
119void rcu_force_quiescent_state(void) 142void rcu_force_quiescent_state(void)
120{ 143{
121 force_quiescent_state(&rcu_preempt_state, 0); 144 force_quiescent_state(&rcu_preempt_state);
122} 145}
123EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 146EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
124 147
@@ -136,8 +159,6 @@ static void rcu_preempt_qs(int cpu)
136{ 159{
137 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 160 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
138 161
139 rdp->passed_quiesce_gpnum = rdp->gpnum;
140 barrier();
141 if (rdp->passed_quiesce == 0) 162 if (rdp->passed_quiesce == 0)
142 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); 163 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
143 rdp->passed_quiesce = 1; 164 rdp->passed_quiesce = 1;
@@ -422,9 +443,11 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
422 unsigned long flags; 443 unsigned long flags;
423 struct task_struct *t; 444 struct task_struct *t;
424 445
425 if (!rcu_preempt_blocked_readers_cgp(rnp))
426 return;
427 raw_spin_lock_irqsave(&rnp->lock, flags); 446 raw_spin_lock_irqsave(&rnp->lock, flags);
447 if (!rcu_preempt_blocked_readers_cgp(rnp)) {
448 raw_spin_unlock_irqrestore(&rnp->lock, flags);
449 return;
450 }
428 t = list_entry(rnp->gp_tasks, 451 t = list_entry(rnp->gp_tasks,
429 struct task_struct, rcu_node_entry); 452 struct task_struct, rcu_node_entry);
430 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 453 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
@@ -584,17 +607,23 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
584 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 607 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
585 } 608 }
586 609
610 rnp->gp_tasks = NULL;
611 rnp->exp_tasks = NULL;
587#ifdef CONFIG_RCU_BOOST 612#ifdef CONFIG_RCU_BOOST
588 /* In case root is being boosted and leaf is not. */ 613 rnp->boost_tasks = NULL;
614 /*
615 * In case root is being boosted and leaf was not. Make sure
616 * that we boost the tasks blocking the current grace period
617 * in this case.
618 */
589 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 619 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
590 if (rnp_root->boost_tasks != NULL && 620 if (rnp_root->boost_tasks != NULL &&
591 rnp_root->boost_tasks != rnp_root->gp_tasks) 621 rnp_root->boost_tasks != rnp_root->gp_tasks &&
622 rnp_root->boost_tasks != rnp_root->exp_tasks)
592 rnp_root->boost_tasks = rnp_root->gp_tasks; 623 rnp_root->boost_tasks = rnp_root->gp_tasks;
593 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 624 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
594#endif /* #ifdef CONFIG_RCU_BOOST */ 625#endif /* #ifdef CONFIG_RCU_BOOST */
595 626
596 rnp->gp_tasks = NULL;
597 rnp->exp_tasks = NULL;
598 return retval; 627 return retval;
599} 628}
600 629
@@ -634,7 +663,7 @@ static void rcu_preempt_do_callbacks(void)
634 */ 663 */
635void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 664void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
636{ 665{
637 __call_rcu(head, func, &rcu_preempt_state, 0); 666 __call_rcu(head, func, &rcu_preempt_state, -1, 0);
638} 667}
639EXPORT_SYMBOL_GPL(call_rcu); 668EXPORT_SYMBOL_GPL(call_rcu);
640 669
@@ -648,7 +677,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
648void kfree_call_rcu(struct rcu_head *head, 677void kfree_call_rcu(struct rcu_head *head,
649 void (*func)(struct rcu_head *rcu)) 678 void (*func)(struct rcu_head *rcu))
650{ 679{
651 __call_rcu(head, func, &rcu_preempt_state, 1); 680 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
652} 681}
653EXPORT_SYMBOL_GPL(kfree_call_rcu); 682EXPORT_SYMBOL_GPL(kfree_call_rcu);
654 683
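
Both wrappers above now pass an extra -1 argument to __call_rcu(), added elsewhere in this series for the no-CBs queuing; from a caller's point of view call_rcu() itself is unchanged. Its canonical use is still deferred freeing of an object that embeds an rcu_head, roughly as in this sketch (struct demo and the demo_* functions are hypothetical):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo {
	int value;
	struct rcu_head rcu;		/* storage for the deferred free */
};

static void demo_free_rcu(struct rcu_head *head)
{
	struct demo *p = container_of(head, struct demo, rcu);

	kfree(p);			/* runs only after a grace period */
}

static void demo_retire(struct demo *p)
{
	/* Readers already inside rcu_read_lock() keep seeing a valid *p
	 * until their critical sections end; only then is it freed. */
	call_rcu(&p->rcu, demo_free_rcu);
}
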
@@ -662,6 +691,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu);
662 * concurrently with new RCU read-side critical sections that began while 691 * concurrently with new RCU read-side critical sections that began while
663 * synchronize_rcu() was waiting. RCU read-side critical sections are 692 * synchronize_rcu() was waiting. RCU read-side critical sections are
664 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 693 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
694 *
695 * See the description of synchronize_sched() for more detailed information
696 * on memory ordering guarantees.
665 */ 697 */
666void synchronize_rcu(void) 698void synchronize_rcu(void)
667{ 699{
@@ -671,12 +703,15 @@ void synchronize_rcu(void)
671 "Illegal synchronize_rcu() in RCU read-side critical section"); 703 "Illegal synchronize_rcu() in RCU read-side critical section");
672 if (!rcu_scheduler_active) 704 if (!rcu_scheduler_active)
673 return; 705 return;
674 wait_rcu_gp(call_rcu); 706 if (rcu_expedited)
707 synchronize_rcu_expedited();
708 else
709 wait_rcu_gp(call_rcu);
675} 710}
676EXPORT_SYMBOL_GPL(synchronize_rcu); 711EXPORT_SYMBOL_GPL(synchronize_rcu);
677 712
678static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); 713static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
679static long sync_rcu_preempt_exp_count; 714static unsigned long sync_rcu_preempt_exp_count;
680static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); 715static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
681 716
682/* 717/*
@@ -749,7 +784,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
749 * grace period for the specified rcu_node structure. If there are no such 784 * grace period for the specified rcu_node structure. If there are no such
750 * tasks, report it up the rcu_node hierarchy. 785 * tasks, report it up the rcu_node hierarchy.
751 * 786 *
752 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. 787 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
788 * CPU hotplug operations.
753 */ 789 */
754static void 790static void
755sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 791sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -791,7 +827,7 @@ void synchronize_rcu_expedited(void)
791 unsigned long flags; 827 unsigned long flags;
792 struct rcu_node *rnp; 828 struct rcu_node *rnp;
793 struct rcu_state *rsp = &rcu_preempt_state; 829 struct rcu_state *rsp = &rcu_preempt_state;
794 long snap; 830 unsigned long snap;
795 int trycount = 0; 831 int trycount = 0;
796 832
797 smp_mb(); /* Caller's modifications seen first by other CPUs. */ 833 smp_mb(); /* Caller's modifications seen first by other CPUs. */
@@ -799,33 +835,47 @@ void synchronize_rcu_expedited(void)
799 smp_mb(); /* Above access cannot bleed into critical section. */ 835 smp_mb(); /* Above access cannot bleed into critical section. */
800 836
801 /* 837 /*
838 * Block CPU-hotplug operations. This means that any CPU-hotplug
839 * operation that finds an rcu_node structure with tasks in the
840 * process of being boosted will know that all tasks blocking
841 * this expedited grace period will already be in the process of
842 * being boosted. This simplifies the process of moving tasks
843 * from leaf to root rcu_node structures.
844 */
845 get_online_cpus();
846
847 /*
802 * Acquire lock, falling back to synchronize_rcu() if too many 848 * Acquire lock, falling back to synchronize_rcu() if too many
803 * lock-acquisition failures. Of course, if someone does the 849 * lock-acquisition failures. Of course, if someone does the
804 * expedited grace period for us, just leave. 850 * expedited grace period for us, just leave.
805 */ 851 */
806 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { 852 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
853 if (ULONG_CMP_LT(snap,
854 ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
855 put_online_cpus();
856 goto mb_ret; /* Others did our work for us. */
857 }
807 if (trycount++ < 10) { 858 if (trycount++ < 10) {
808 udelay(trycount * num_online_cpus()); 859 udelay(trycount * num_online_cpus());
809 } else { 860 } else {
810 synchronize_rcu(); 861 put_online_cpus();
862 wait_rcu_gp(call_rcu);
811 return; 863 return;
812 } 864 }
813 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
814 goto mb_ret; /* Others did our work for us. */
815 } 865 }
816 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 866 if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
867 put_online_cpus();
817 goto unlock_mb_ret; /* Others did our work for us. */ 868 goto unlock_mb_ret; /* Others did our work for us. */
869 }
818 870
819 /* force all RCU readers onto ->blkd_tasks lists. */ 871 /* force all RCU readers onto ->blkd_tasks lists. */
820 synchronize_sched_expedited(); 872 synchronize_sched_expedited();
821 873
822 raw_spin_lock_irqsave(&rsp->onofflock, flags);
823
824 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 874 /* Initialize ->expmask for all non-leaf rcu_node structures. */
825 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 875 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
826 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 876 raw_spin_lock_irqsave(&rnp->lock, flags);
827 rnp->expmask = rnp->qsmaskinit; 877 rnp->expmask = rnp->qsmaskinit;
828 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 878 raw_spin_unlock_irqrestore(&rnp->lock, flags);
829 } 879 }
830 880
831 /* Snapshot current state of ->blkd_tasks lists. */ 881 /* Snapshot current state of ->blkd_tasks lists. */
@@ -834,7 +884,7 @@ void synchronize_rcu_expedited(void)
834 if (NUM_RCU_NODES > 1) 884 if (NUM_RCU_NODES > 1)
835 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 885 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
836 886
837 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 887 put_online_cpus();
838 888
839 /* Wait for snapshotted ->blkd_tasks lists to drain. */ 889 /* Wait for snapshotted ->blkd_tasks lists to drain. */
840 rnp = rcu_get_root(rsp); 890 rnp = rcu_get_root(rsp);
@@ -853,6 +903,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
853 903
854/** 904/**
855 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 905 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
906 *
907 * Note that this primitive does not necessarily wait for an RCU grace period
908 * to complete. For example, if there are no RCU callbacks queued anywhere
909 * in the system, then rcu_barrier() is within its rights to return
910 * immediately, without waiting for anything, much less an RCU grace period.
856 */ 911 */
857void rcu_barrier(void) 912void rcu_barrier(void)
858{ 913{
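
The new comment spells out that rcu_barrier() waits for callbacks, not for a grace period as such. Its classic use is module teardown: once no new callbacks can be posted, rcu_barrier() guarantees that the already-posted ones have run before the memory or module text they depend on goes away. A hedged sketch, with demo_cache and the exit function invented for illustration:

#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

static struct kmem_cache *demo_cache;	/* objects freed via call_rcu() */

static void __exit demo_exit(void)
{
	/* 1. Ensure no code path can post further call_rcu() callbacks. */
	/* 2. Wait for every callback already posted to be invoked.      */
	rcu_barrier();
	/* 3. Only now is it safe to destroy what those callbacks used.  */
	kmem_cache_destroy(demo_cache);
}
module_exit(demo_exit);
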
@@ -991,7 +1046,7 @@ static void rcu_preempt_check_callbacks(int cpu)
991void kfree_call_rcu(struct rcu_head *head, 1046void kfree_call_rcu(struct rcu_head *head,
992 void (*func)(struct rcu_head *rcu)) 1047 void (*func)(struct rcu_head *rcu))
993{ 1048{
994 __call_rcu(head, func, &rcu_sched_state, 1); 1049 __call_rcu(head, func, &rcu_sched_state, -1, 1);
995} 1050}
996EXPORT_SYMBOL_GPL(kfree_call_rcu); 1051EXPORT_SYMBOL_GPL(kfree_call_rcu);
997 1052
@@ -1069,6 +1124,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1069 1124
1070#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1125#endif /* #else #ifdef CONFIG_RCU_TRACE */
1071 1126
1127static void rcu_wake_cond(struct task_struct *t, int status)
1128{
1129 /*
1130 * If the thread is yielding, only wake it when this
1131 * is invoked from idle
1132 */
1133 if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
1134 wake_up_process(t);
1135}
1136
1072/* 1137/*
1073 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1138 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1074 * or ->boost_tasks, advancing the pointer to the next task in the 1139 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1141,17 +1206,6 @@ static int rcu_boost(struct rcu_node *rnp)
1141} 1206}
1142 1207
1143/* 1208/*
1144 * Timer handler to initiate waking up of boost kthreads that
1145 * have yielded the CPU due to excessive numbers of tasks to
1146 * boost. We wake up the per-rcu_node kthread, which in turn
1147 * will wake up the booster kthread.
1148 */
1149static void rcu_boost_kthread_timer(unsigned long arg)
1150{
1151 invoke_rcu_node_kthread((struct rcu_node *)arg);
1152}
1153
1154/*
1155 * Priority-boosting kthread. One per leaf rcu_node and one for the 1209 * Priority-boosting kthread. One per leaf rcu_node and one for the
1156 * root rcu_node. 1210 * root rcu_node.
1157 */ 1211 */
@@ -1174,8 +1228,9 @@ static int rcu_boost_kthread(void *arg)
1174 else 1228 else
1175 spincnt = 0; 1229 spincnt = 0;
1176 if (spincnt > 10) { 1230 if (spincnt > 10) {
1231 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
1177 trace_rcu_utilization("End boost kthread@rcu_yield"); 1232 trace_rcu_utilization("End boost kthread@rcu_yield");
1178 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); 1233 schedule_timeout_interruptible(2);
1179 trace_rcu_utilization("Start boost kthread@rcu_yield"); 1234 trace_rcu_utilization("Start boost kthread@rcu_yield");
1180 spincnt = 0; 1235 spincnt = 0;
1181 } 1236 }
@@ -1191,9 +1246,9 @@ static int rcu_boost_kthread(void *arg)
1191 * kthread to start boosting them. If there is an expedited grace 1246 * kthread to start boosting them. If there is an expedited grace
1192 * period in progress, it is always time to boost. 1247 * period in progress, it is always time to boost.
1193 * 1248 *
1194 * The caller must hold rnp->lock, which this function releases, 1249 * The caller must hold rnp->lock, which this function releases.
1195 * but irqs remain disabled. The ->boost_kthread_task is immortal, 1250 * The ->boost_kthread_task is immortal, so we don't need to worry
1196 * so we don't need to worry about it going away. 1251 * about it going away.
1197 */ 1252 */
1198static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1253static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1199{ 1254{
@@ -1213,8 +1268,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1213 rnp->boost_tasks = rnp->gp_tasks; 1268 rnp->boost_tasks = rnp->gp_tasks;
1214 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1269 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1215 t = rnp->boost_kthread_task; 1270 t = rnp->boost_kthread_task;
1216 if (t != NULL) 1271 if (t)
1217 wake_up_process(t); 1272 rcu_wake_cond(t, rnp->boost_kthread_status);
1218 } else { 1273 } else {
1219 rcu_initiate_boost_trace(rnp); 1274 rcu_initiate_boost_trace(rnp);
1220 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1275 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1231,8 +1286,10 @@ static void invoke_rcu_callbacks_kthread(void)
1231 local_irq_save(flags); 1286 local_irq_save(flags);
1232 __this_cpu_write(rcu_cpu_has_work, 1); 1287 __this_cpu_write(rcu_cpu_has_work, 1);
1233 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && 1288 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1234 current != __this_cpu_read(rcu_cpu_kthread_task)) 1289 current != __this_cpu_read(rcu_cpu_kthread_task)) {
1235 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); 1290 rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
1291 __this_cpu_read(rcu_cpu_kthread_status));
1292 }
1236 local_irq_restore(flags); 1293 local_irq_restore(flags);
1237} 1294}
1238 1295
@@ -1245,21 +1302,6 @@ static bool rcu_is_callbacks_kthread(void)
1245 return __get_cpu_var(rcu_cpu_kthread_task) == current; 1302 return __get_cpu_var(rcu_cpu_kthread_task) == current;
1246} 1303}
1247 1304
1248/*
1249 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1250 * held, so no one should be messing with the existence of the boost
1251 * kthread.
1252 */
1253static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1254 cpumask_var_t cm)
1255{
1256 struct task_struct *t;
1257
1258 t = rnp->boost_kthread_task;
1259 if (t != NULL)
1260 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1261}
1262
1263#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1305#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1264 1306
1265/* 1307/*
@@ -1276,15 +1318,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1276 * Returns zero if all is well, a negated errno otherwise. 1318 * Returns zero if all is well, a negated errno otherwise.
1277 */ 1319 */
1278static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1320static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1279 struct rcu_node *rnp, 1321 struct rcu_node *rnp)
1280 int rnp_index)
1281{ 1322{
1323 int rnp_index = rnp - &rsp->node[0];
1282 unsigned long flags; 1324 unsigned long flags;
1283 struct sched_param sp; 1325 struct sched_param sp;
1284 struct task_struct *t; 1326 struct task_struct *t;
1285 1327
1286 if (&rcu_preempt_state != rsp) 1328 if (&rcu_preempt_state != rsp)
1287 return 0; 1329 return 0;
1330
1331 if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
1332 return 0;
1333
1288 rsp->boost = 1; 1334 rsp->boost = 1;
1289 if (rnp->boost_kthread_task != NULL) 1335 if (rnp->boost_kthread_task != NULL)
1290 return 0; 1336 return 0;
@@ -1301,25 +1347,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1301 return 0; 1347 return 0;
1302} 1348}
1303 1349
1304#ifdef CONFIG_HOTPLUG_CPU
1305
1306/*
1307 * Stop the RCU's per-CPU kthread when its CPU goes offline,.
1308 */
1309static void rcu_stop_cpu_kthread(int cpu)
1310{
1311 struct task_struct *t;
1312
1313 /* Stop the CPU's kthread. */
1314 t = per_cpu(rcu_cpu_kthread_task, cpu);
1315 if (t != NULL) {
1316 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1317 kthread_stop(t);
1318 }
1319}
1320
1321#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1322
1323static void rcu_kthread_do_work(void) 1350static void rcu_kthread_do_work(void)
1324{ 1351{
1325 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1352 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1327,112 +1354,22 @@ static void rcu_kthread_do_work(void)
1327 rcu_preempt_do_callbacks(); 1354 rcu_preempt_do_callbacks();
1328} 1355}
1329 1356
1330/* 1357static void rcu_cpu_kthread_setup(unsigned int cpu)
1331 * Wake up the specified per-rcu_node-structure kthread.
1332 * Because the per-rcu_node kthreads are immortal, we don't need
1333 * to do anything to keep them alive.
1334 */
1335static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1336{
1337 struct task_struct *t;
1338
1339 t = rnp->node_kthread_task;
1340 if (t != NULL)
1341 wake_up_process(t);
1342}
1343
1344/*
1345 * Set the specified CPU's kthread to run RT or not, as specified by
1346 * the to_rt argument. The CPU-hotplug locks are held, so the task
1347 * is not going away.
1348 */
1349static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1350{ 1358{
1351 int policy;
1352 struct sched_param sp; 1359 struct sched_param sp;
1353 struct task_struct *t;
1354
1355 t = per_cpu(rcu_cpu_kthread_task, cpu);
1356 if (t == NULL)
1357 return;
1358 if (to_rt) {
1359 policy = SCHED_FIFO;
1360 sp.sched_priority = RCU_KTHREAD_PRIO;
1361 } else {
1362 policy = SCHED_NORMAL;
1363 sp.sched_priority = 0;
1364 }
1365 sched_setscheduler_nocheck(t, policy, &sp);
1366}
1367 1360
1368/* 1361 sp.sched_priority = RCU_KTHREAD_PRIO;
1369 * Timer handler to initiate the waking up of per-CPU kthreads that 1362 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1370 * have yielded the CPU due to excess numbers of RCU callbacks.
1371 * We wake up the per-rcu_node kthread, which in turn will wake up
1372 * the booster kthread.
1373 */
1374static void rcu_cpu_kthread_timer(unsigned long arg)
1375{
1376 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1377 struct rcu_node *rnp = rdp->mynode;
1378
1379 atomic_or(rdp->grpmask, &rnp->wakemask);
1380 invoke_rcu_node_kthread(rnp);
1381} 1363}
1382 1364
1383/* 1365static void rcu_cpu_kthread_park(unsigned int cpu)
1384 * Drop to non-real-time priority and yield, but only after posting a
1385 * timer that will cause us to regain our real-time priority if we
1386 * remain preempted. Either way, we restore our real-time priority
1387 * before returning.
1388 */
1389static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1390{ 1366{
1391 struct sched_param sp; 1367 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1392 struct timer_list yield_timer;
1393 int prio = current->rt_priority;
1394
1395 setup_timer_on_stack(&yield_timer, f, arg);
1396 mod_timer(&yield_timer, jiffies + 2);
1397 sp.sched_priority = 0;
1398 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1399 set_user_nice(current, 19);
1400 schedule();
1401 set_user_nice(current, 0);
1402 sp.sched_priority = prio;
1403 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1404 del_timer(&yield_timer);
1405} 1368}
1406 1369
1407/* 1370static int rcu_cpu_kthread_should_run(unsigned int cpu)
1408 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1409 * This can happen while the corresponding CPU is either coming online
1410 * or going offline. We cannot wait until the CPU is fully online
1411 * before starting the kthread, because the various notifier functions
1412 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1413 * the corresponding CPU is online.
1414 *
1415 * Return 1 if the kthread needs to stop, 0 otherwise.
1416 *
1417 * Caller must disable bh. This function can momentarily enable it.
1418 */
1419static int rcu_cpu_kthread_should_stop(int cpu)
1420{ 1371{
1421 while (cpu_is_offline(cpu) || 1372 return __get_cpu_var(rcu_cpu_has_work);
1422 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1423 smp_processor_id() != cpu) {
1424 if (kthread_should_stop())
1425 return 1;
1426 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1427 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1428 local_bh_enable();
1429 schedule_timeout_uninterruptible(1);
1430 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1431 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1432 local_bh_disable();
1433 }
1434 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1435 return 0;
1436} 1373}
1437 1374
1438/* 1375/*
@@ -1440,138 +1377,35 @@ static int rcu_cpu_kthread_should_stop(int cpu)
1440 * RCU softirq used in flavors and configurations of RCU that do not 1377 * RCU softirq used in flavors and configurations of RCU that do not
1441 * support RCU priority boosting. 1378 * support RCU priority boosting.
1442 */ 1379 */
1443static int rcu_cpu_kthread(void *arg) 1380static void rcu_cpu_kthread(unsigned int cpu)
1444{ 1381{
1445 int cpu = (int)(long)arg; 1382 unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
1446 unsigned long flags; 1383 char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
1447 int spincnt = 0; 1384 int spincnt;
1448 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1449 char work;
1450 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1451 1385
1452 trace_rcu_utilization("Start CPU kthread@init"); 1386 for (spincnt = 0; spincnt < 10; spincnt++) {
1453 for (;;) {
1454 *statusp = RCU_KTHREAD_WAITING;
1455 trace_rcu_utilization("End CPU kthread@rcu_wait");
1456 rcu_wait(*workp != 0 || kthread_should_stop());
1457 trace_rcu_utilization("Start CPU kthread@rcu_wait"); 1387 trace_rcu_utilization("Start CPU kthread@rcu_wait");
1458 local_bh_disable(); 1388 local_bh_disable();
1459 if (rcu_cpu_kthread_should_stop(cpu)) {
1460 local_bh_enable();
1461 break;
1462 }
1463 *statusp = RCU_KTHREAD_RUNNING; 1389 *statusp = RCU_KTHREAD_RUNNING;
1464 per_cpu(rcu_cpu_kthread_loops, cpu)++; 1390 this_cpu_inc(rcu_cpu_kthread_loops);
1465 local_irq_save(flags); 1391 local_irq_disable();
1466 work = *workp; 1392 work = *workp;
1467 *workp = 0; 1393 *workp = 0;
1468 local_irq_restore(flags); 1394 local_irq_enable();
1469 if (work) 1395 if (work)
1470 rcu_kthread_do_work(); 1396 rcu_kthread_do_work();
1471 local_bh_enable(); 1397 local_bh_enable();
1472 if (*workp != 0) 1398 if (*workp == 0) {
1473 spincnt++; 1399 trace_rcu_utilization("End CPU kthread@rcu_wait");
1474 else 1400 *statusp = RCU_KTHREAD_WAITING;
1475 spincnt = 0; 1401 return;
1476 if (spincnt > 10) {
1477 *statusp = RCU_KTHREAD_YIELDING;
1478 trace_rcu_utilization("End CPU kthread@rcu_yield");
1479 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1480 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1481 spincnt = 0;
1482 }
1483 }
1484 *statusp = RCU_KTHREAD_STOPPED;
1485 trace_rcu_utilization("End CPU kthread@term");
1486 return 0;
1487}
1488
1489/*
1490 * Spawn a per-CPU kthread, setting up affinity and priority.
1491 * Because the CPU hotplug lock is held, no other CPU will be attempting
1492 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1493 * attempting to access it during boot, but the locking in kthread_bind()
1494 * will enforce sufficient ordering.
1495 *
1496 * Please note that we cannot simply refuse to wake up the per-CPU
1497 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1498 * which can result in softlockup complaints if the task ends up being
1499 * idle for more than a couple of minutes.
1500 *
1501 * However, please note also that we cannot bind the per-CPU kthread to its
1502 * CPU until that CPU is fully online. We also cannot wait until the
1503 * CPU is fully online before we create its per-CPU kthread, as this would
1504 * deadlock the system when CPU notifiers tried waiting for grace
1505 * periods. So we bind the per-CPU kthread to its CPU only if the CPU
1506 * is online. If its CPU is not yet fully online, then the code in
1507 * rcu_cpu_kthread() will wait until it is fully online, and then do
1508 * the binding.
1509 */
1510static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1511{
1512 struct sched_param sp;
1513 struct task_struct *t;
1514
1515 if (!rcu_scheduler_fully_active ||
1516 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1517 return 0;
1518 t = kthread_create_on_node(rcu_cpu_kthread,
1519 (void *)(long)cpu,
1520 cpu_to_node(cpu),
1521 "rcuc/%d", cpu);
1522 if (IS_ERR(t))
1523 return PTR_ERR(t);
1524 if (cpu_online(cpu))
1525 kthread_bind(t, cpu);
1526 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1527 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1528 sp.sched_priority = RCU_KTHREAD_PRIO;
1529 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1530 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1531 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1532 return 0;
1533}
1534
1535/*
1536 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1537 * kthreads when needed. We ignore requests to wake up kthreads
1538 * for offline CPUs, which is OK because force_quiescent_state()
1539 * takes care of this case.
1540 */
1541static int rcu_node_kthread(void *arg)
1542{
1543 int cpu;
1544 unsigned long flags;
1545 unsigned long mask;
1546 struct rcu_node *rnp = (struct rcu_node *)arg;
1547 struct sched_param sp;
1548 struct task_struct *t;
1549
1550 for (;;) {
1551 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1552 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1553 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1554 raw_spin_lock_irqsave(&rnp->lock, flags);
1555 mask = atomic_xchg(&rnp->wakemask, 0);
1556 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1557 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1558 if ((mask & 0x1) == 0)
1559 continue;
1560 preempt_disable();
1561 t = per_cpu(rcu_cpu_kthread_task, cpu);
1562 if (!cpu_online(cpu) || t == NULL) {
1563 preempt_enable();
1564 continue;
1565 }
1566 per_cpu(rcu_cpu_has_work, cpu) = 1;
1567 sp.sched_priority = RCU_KTHREAD_PRIO;
1568 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1569 preempt_enable();
1570 } 1402 }
1571 } 1403 }
1572 /* NOTREACHED */ 1404 *statusp = RCU_KTHREAD_YIELDING;
1573 rnp->node_kthread_status = RCU_KTHREAD_STOPPED; 1405 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1574 return 0; 1406 schedule_timeout_interruptible(2);
1407 trace_rcu_utilization("End CPU kthread@rcu_yield");
1408 *statusp = RCU_KTHREAD_WAITING;
1575} 1409}
1576 1410
1577/* 1411/*
@@ -1583,17 +1417,17 @@ static int rcu_node_kthread(void *arg)
1583 * no outgoing CPU. If there are no CPUs left in the affinity set, 1417 * no outgoing CPU. If there are no CPUs left in the affinity set,
1584 * this function allows the kthread to execute on any CPU. 1418 * this function allows the kthread to execute on any CPU.
1585 */ 1419 */
1586static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1420static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1587{ 1421{
1422 struct task_struct *t = rnp->boost_kthread_task;
1423 unsigned long mask = rnp->qsmaskinit;
1588 cpumask_var_t cm; 1424 cpumask_var_t cm;
1589 int cpu; 1425 int cpu;
1590 unsigned long mask = rnp->qsmaskinit;
1591 1426
1592 if (rnp->node_kthread_task == NULL) 1427 if (!t)
1593 return; 1428 return;
1594 if (!alloc_cpumask_var(&cm, GFP_KERNEL)) 1429 if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
1595 return; 1430 return;
1596 cpumask_clear(cm);
1597 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1431 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1598 if ((mask & 0x1) && cpu != outgoingcpu) 1432 if ((mask & 0x1) && cpu != outgoingcpu)
1599 cpumask_set_cpu(cpu, cm); 1433 cpumask_set_cpu(cpu, cm);
@@ -1603,62 +1437,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1603 cpumask_clear_cpu(cpu, cm); 1437 cpumask_clear_cpu(cpu, cm);
1604 WARN_ON_ONCE(cpumask_weight(cm) == 0); 1438 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1605 } 1439 }
1606 set_cpus_allowed_ptr(rnp->node_kthread_task, cm); 1440 set_cpus_allowed_ptr(t, cm);
1607 rcu_boost_kthread_setaffinity(rnp, cm);
1608 free_cpumask_var(cm); 1441 free_cpumask_var(cm);
1609} 1442}
1610 1443
1611/* 1444static struct smp_hotplug_thread rcu_cpu_thread_spec = {
1612 * Spawn a per-rcu_node kthread, setting priority and affinity. 1445 .store = &rcu_cpu_kthread_task,
1613 * Called during boot before online/offline can happen, or, if 1446 .thread_should_run = rcu_cpu_kthread_should_run,
1614 * during runtime, with the main CPU-hotplug locks held. So only 1447 .thread_fn = rcu_cpu_kthread,
1615 * one of these can be executing at a time. 1448 .thread_comm = "rcuc/%u",
1616 */ 1449 .setup = rcu_cpu_kthread_setup,
1617static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, 1450 .park = rcu_cpu_kthread_park,
1618 struct rcu_node *rnp) 1451};
1619{
1620 unsigned long flags;
1621 int rnp_index = rnp - &rsp->node[0];
1622 struct sched_param sp;
1623 struct task_struct *t;
1624
1625 if (!rcu_scheduler_fully_active ||
1626 rnp->qsmaskinit == 0)
1627 return 0;
1628 if (rnp->node_kthread_task == NULL) {
1629 t = kthread_create(rcu_node_kthread, (void *)rnp,
1630 "rcun/%d", rnp_index);
1631 if (IS_ERR(t))
1632 return PTR_ERR(t);
1633 raw_spin_lock_irqsave(&rnp->lock, flags);
1634 rnp->node_kthread_task = t;
1635 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1636 sp.sched_priority = 99;
1637 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1638 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1639 }
1640 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1641}
1642 1452
1643/* 1453/*
1644 * Spawn all kthreads -- called as soon as the scheduler is running. 1454 * Spawn all kthreads -- called as soon as the scheduler is running.
1645 */ 1455 */
1646static int __init rcu_spawn_kthreads(void) 1456static int __init rcu_spawn_kthreads(void)
1647{ 1457{
1648 int cpu;
1649 struct rcu_node *rnp; 1458 struct rcu_node *rnp;
1459 int cpu;
1650 1460
1651 rcu_scheduler_fully_active = 1; 1461 rcu_scheduler_fully_active = 1;
1652 for_each_possible_cpu(cpu) { 1462 for_each_possible_cpu(cpu)
1653 per_cpu(rcu_cpu_has_work, cpu) = 0; 1463 per_cpu(rcu_cpu_has_work, cpu) = 0;
1654 if (cpu_online(cpu)) 1464 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1655 (void)rcu_spawn_one_cpu_kthread(cpu);
1656 }
1657 rnp = rcu_get_root(rcu_state); 1465 rnp = rcu_get_root(rcu_state);
1658 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1466 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1659 if (NUM_RCU_NODES > 1) { 1467 if (NUM_RCU_NODES > 1) {
1660 rcu_for_each_leaf_node(rcu_state, rnp) 1468 rcu_for_each_leaf_node(rcu_state, rnp)
1661 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1469 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1662 } 1470 }
1663 return 0; 1471 return 0;
1664} 1472}
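
The rcu_cpu_thread_spec above is an instance of the smpboot per-CPU kthread API (<linux/smpboot.h>): the core creates one thread per CPU and parks/unparks it around hotplug, which is why the hand-rolled spawn, stop, yield, and affinity code deleted in this patch is no longer needed. A minimal hypothetical user of the same API might look like this (all demo_* names invented):

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, demo_task);
static DEFINE_PER_CPU(int, demo_has_work);

static int demo_should_run(unsigned int cpu)
{
	return __this_cpu_read(demo_has_work);	/* run only when there is work */
}

static void demo_thread_fn(unsigned int cpu)
{
	__this_cpu_write(demo_has_work, 0);
	/* ...do this CPU's work, preemption enabled... */
}

static struct smp_hotplug_thread demo_threads = {
	.store			= &demo_task,
	.thread_should_run	= demo_should_run,
	.thread_fn		= demo_thread_fn,
	.thread_comm		= "demo/%u",
};

static int __init demo_init(void)
{
	return smpboot_register_percpu_thread(&demo_threads);
}
early_initcall(demo_init);
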
@@ -1670,11 +1478,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1670 struct rcu_node *rnp = rdp->mynode; 1478 struct rcu_node *rnp = rdp->mynode;
1671 1479
1672 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1480 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1673 if (rcu_scheduler_fully_active) { 1481 if (rcu_scheduler_fully_active)
1674 (void)rcu_spawn_one_cpu_kthread(cpu); 1482 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1675 if (rnp->node_kthread_task == NULL)
1676 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1677 }
1678} 1483}
1679 1484
1680#else /* #ifdef CONFIG_RCU_BOOST */ 1485#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1698,19 +1503,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1698{ 1503{
1699} 1504}
1700 1505
1701#ifdef CONFIG_HOTPLUG_CPU 1506static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1702
1703static void rcu_stop_cpu_kthread(int cpu)
1704{
1705}
1706
1707#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1708
1709static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1710{
1711}
1712
1713static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1714{ 1507{
1715} 1508}
1716 1509
@@ -1997,6 +1790,26 @@ static void rcu_prepare_for_idle(int cpu)
1997 if (!tne) 1790 if (!tne)
1998 return; 1791 return;
1999 1792
1793 /* Adaptive-tick mode, where usermode execution is idle to RCU. */
1794 if (!is_idle_task(current)) {
1795 rdtp->dyntick_holdoff = jiffies - 1;
1796 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1797 trace_rcu_prep_idle("User dyntick with callbacks");
1798 rdtp->idle_gp_timer_expires =
1799 round_up(jiffies + RCU_IDLE_GP_DELAY,
1800 RCU_IDLE_GP_DELAY);
1801 } else if (rcu_cpu_has_callbacks(cpu)) {
1802 rdtp->idle_gp_timer_expires =
1803 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1804 trace_rcu_prep_idle("User dyntick with lazy callbacks");
1805 } else {
1806 return;
1807 }
1808 tp = &rdtp->idle_gp_timer;
1809 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1810 return;
1811 }
1812
2000 /* 1813 /*
2001 * If this is an idle re-entry, for example, due to use of 1814 * If this is an idle re-entry, for example, due to use of
2002 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 1815 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
@@ -2075,16 +1888,16 @@ static void rcu_prepare_for_idle(int cpu)
2075#ifdef CONFIG_TREE_PREEMPT_RCU 1888#ifdef CONFIG_TREE_PREEMPT_RCU
2076 if (per_cpu(rcu_preempt_data, cpu).nxtlist) { 1889 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2077 rcu_preempt_qs(cpu); 1890 rcu_preempt_qs(cpu);
2078 force_quiescent_state(&rcu_preempt_state, 0); 1891 force_quiescent_state(&rcu_preempt_state);
2079 } 1892 }
2080#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1893#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2081 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 1894 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2082 rcu_sched_qs(cpu); 1895 rcu_sched_qs(cpu);
2083 force_quiescent_state(&rcu_sched_state, 0); 1896 force_quiescent_state(&rcu_sched_state);
2084 } 1897 }
2085 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 1898 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2086 rcu_bh_qs(cpu); 1899 rcu_bh_qs(cpu);
2087 force_quiescent_state(&rcu_bh_state, 0); 1900 force_quiescent_state(&rcu_bh_state);
2088 } 1901 }
2089 1902
2090 /* 1903 /*
@@ -2112,6 +1925,88 @@ static void rcu_idle_count_callbacks_posted(void)
2112 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); 1925 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
2113} 1926}
2114 1927
1928/*
1929 * Data for flushing lazy RCU callbacks at OOM time.
1930 */
1931static atomic_t oom_callback_count;
1932static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
1933
1934/*
1935 * RCU OOM callback -- decrement the outstanding count and deliver the
1936 * wake-up if we are the last one.
1937 */
1938static void rcu_oom_callback(struct rcu_head *rhp)
1939{
1940 if (atomic_dec_and_test(&oom_callback_count))
1941 wake_up(&oom_callback_wq);
1942}
1943
1944/*
1945 * Post an rcu_oom_notify callback on the current CPU if it has at
1946 * least one lazy callback. This will unnecessarily post callbacks
1947 * to CPUs that already have a non-lazy callback at the end of their
1948 * callback list, but this is an infrequent operation, so accept some
1949 * extra overhead to keep things simple.
1950 */
1951static void rcu_oom_notify_cpu(void *unused)
1952{
1953 struct rcu_state *rsp;
1954 struct rcu_data *rdp;
1955
1956 for_each_rcu_flavor(rsp) {
1957 rdp = __this_cpu_ptr(rsp->rda);
1958 if (rdp->qlen_lazy != 0) {
1959 atomic_inc(&oom_callback_count);
1960 rsp->call(&rdp->oom_head, rcu_oom_callback);
1961 }
1962 }
1963}
1964
1965/*
1966 * If low on memory, ensure that each CPU has a non-lazy callback.
1967 * This will wake up CPUs that have only lazy callbacks, in turn
1968 * ensuring that they free up the corresponding memory in a timely manner.
1969 * Because an uncertain amount of memory will be freed in some uncertain
1970 * timeframe, we do not claim to have freed anything.
1971 */
1972static int rcu_oom_notify(struct notifier_block *self,
1973 unsigned long notused, void *nfreed)
1974{
1975 int cpu;
1976
1977 /* Wait for callbacks from earlier instance to complete. */
1978 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1979
1980 /*
1981 * Prevent premature wakeup: ensure that all increments happen
1982 * before there is a chance of the counter reaching zero.
1983 */
1984 atomic_set(&oom_callback_count, 1);
1985
1986 get_online_cpus();
1987 for_each_online_cpu(cpu) {
1988 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1989 cond_resched();
1990 }
1991 put_online_cpus();
1992
1993 /* Unconditionally decrement: no need to wake ourselves up. */
1994 atomic_dec(&oom_callback_count);
1995
1996 return NOTIFY_OK;
1997}
1998
1999static struct notifier_block rcu_oom_nb = {
2000 .notifier_call = rcu_oom_notify
2001};
2002
2003static int __init rcu_register_oom_notifier(void)
2004{
2005 register_oom_notifier(&rcu_oom_nb);
2006 return 0;
2007}
2008early_initcall(rcu_register_oom_notifier);
2009
2115#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2010#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2116 2011
2117#ifdef CONFIG_RCU_CPU_STALL_INFO 2012#ifdef CONFIG_RCU_CPU_STALL_INFO
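
rcu_oom_notify() above primes oom_callback_count to 1 before posting any callbacks and drops that extra reference only after all of them have been posted, which is what prevents an early atomic_dec_and_test() from declaring completion while callbacks are still being queued (the notifier then performs its wait_event() at the start of the next invocation, but the counting discipline is the same). The idiom in isolation, with hypothetical demo_* names:

#include <linux/atomic.h>
#include <linux/wait.h>

static atomic_t demo_count;
static DECLARE_WAIT_QUEUE_HEAD(demo_wq);

static void demo_done(void)			/* invoked once per posted work item */
{
	if (atomic_dec_and_test(&demo_count))
		wake_up(&demo_wq);
}

static void demo_post_all_and_wait(int nr)
{
	int i;

	atomic_set(&demo_count, 1);		/* poster holds its own reference */
	for (i = 0; i < nr; i++) {
		atomic_inc(&demo_count);
		/* ...hand off work item i, which ends by calling demo_done()... */
	}
	atomic_dec(&demo_count);		/* drop the poster's reference */
	wait_event(demo_wq, atomic_read(&demo_count) == 0);
}
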
@@ -2122,11 +2017,15 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2122{ 2017{
2123 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 2018 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2124 struct timer_list *tltp = &rdtp->idle_gp_timer; 2019 struct timer_list *tltp = &rdtp->idle_gp_timer;
2020 char c;
2125 2021
2126 sprintf(cp, "drain=%d %c timer=%lu", 2022 c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
2127 rdtp->dyntick_drain, 2023 if (timer_pending(tltp))
2128 rdtp->dyntick_holdoff == jiffies ? 'H' : '.', 2024 sprintf(cp, "drain=%d %c timer=%lu",
2129 timer_pending(tltp) ? tltp->expires - jiffies : -1); 2025 rdtp->dyntick_drain, c, tltp->expires - jiffies);
2026 else
2027 sprintf(cp, "drain=%d %c timer not pending",
2028 rdtp->dyntick_drain, c);
2130} 2029}
2131 2030
2132#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 2031#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2194,11 +2093,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2194/* Increment ->ticks_this_gp for all flavors of RCU. */ 2093/* Increment ->ticks_this_gp for all flavors of RCU. */
2195static void increment_cpu_stall_ticks(void) 2094static void increment_cpu_stall_ticks(void)
2196{ 2095{
2197 __get_cpu_var(rcu_sched_data).ticks_this_gp++; 2096 struct rcu_state *rsp;
2198 __get_cpu_var(rcu_bh_data).ticks_this_gp++; 2097
2199#ifdef CONFIG_TREE_PREEMPT_RCU 2098 for_each_rcu_flavor(rsp)
2200 __get_cpu_var(rcu_preempt_data).ticks_this_gp++; 2099 __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
2201#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2202} 2100}
2203 2101
2204#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 2102#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -2227,3 +2125,373 @@ static void increment_cpu_stall_ticks(void)
2227} 2125}
2228 2126
2229#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ 2127#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
2128
2129#ifdef CONFIG_RCU_NOCB_CPU
2130
2131/*
2132 * Offload callback processing from the boot-time-specified set of CPUs
2133 * specified by rcu_nocb_mask. For each CPU in the set, there is a
2134 * kthread created that pulls the callbacks from the corresponding CPU,
2135 * waits for a grace period to elapse, and invokes the callbacks.
2136 * The no-CBs CPUs do a wake_up() on their kthread when they insert
2137 * a callback into any empty list, unless the rcu_nocb_poll boot parameter
2138 * has been specified, in which case each kthread actively polls its
2139 * CPU. (Which isn't so great for energy efficiency, but which does
2140 * reduce RCU's overhead on that CPU.)
2141 *
2142 * This is intended to be used in conjunction with Frederic Weisbecker's
2143 * adaptive-idle work, which would seriously reduce OS jitter on CPUs
2144 * running CPU-bound user-mode computations.
2145 *
2146 * Offloading of callback processing could also in theory be used as
2147 * an energy-efficiency measure because CPUs with no RCU callbacks
2148 * queued are more aggressive about entering dyntick-idle mode.
2149 */
2150
2151
2152/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
2153static int __init rcu_nocb_setup(char *str)
2154{
2155 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
2156 have_rcu_nocb_mask = true;
2157 cpulist_parse(str, rcu_nocb_mask);
2158 return 1;
2159}
2160__setup("rcu_nocbs=", rcu_nocb_setup);
2161
2163/* Is the specified CPU a no-CBs CPU? */
2163static bool is_nocb_cpu(int cpu)
2164{
2165 if (have_rcu_nocb_mask)
2166 return cpumask_test_cpu(cpu, rcu_nocb_mask);
2167 return false;
2168}
2169
2170/*
2171 * Enqueue the specified string of rcu_head structures onto the specified
2172 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2173 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
2174 * counts are supplied by rhcount and rhcount_lazy.
2175 *
2176 * If warranted, also wake up the kthread servicing this CPU's queues.
2177 */
2178static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2179 struct rcu_head *rhp,
2180 struct rcu_head **rhtp,
2181 int rhcount, int rhcount_lazy)
2182{
2183 int len;
2184 struct rcu_head **old_rhpp;
2185 struct task_struct *t;
2186
2187 /* Enqueue the callback on the nocb list and update counts. */
2188 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2189 ACCESS_ONCE(*old_rhpp) = rhp;
2190 atomic_long_add(rhcount, &rdp->nocb_q_count);
2191 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2192
2193 /* If we are not being polled and there is a kthread, awaken it ... */
2194 t = ACCESS_ONCE(rdp->nocb_kthread);
2195 if (rcu_nocb_poll | !t)
2196 return;
2197 len = atomic_long_read(&rdp->nocb_q_count);
2198 if (old_rhpp == &rdp->nocb_head) {
2199 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2200 rdp->qlen_last_fqs_check = 0;
2201 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2202 wake_up_process(t); /* ... or if many callbacks queued. */
2203 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2204 }
2205 return;
2206}
2207
2208/*
2209 * This is a helper for __call_rcu(), which invokes this when the normal
2210 * callback queue is inoperable. If this is not a no-CBs CPU, this
2211 * function returns failure back to __call_rcu(), which can complain
2212 * appropriately.
2213 *
2214 * Otherwise, this function queues the callback where the corresponding
2215 * "rcuo" kthread can find it.
2216 */
2217static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2218 bool lazy)
2219{
2220
2221 if (!is_nocb_cpu(rdp->cpu))
2222 return 0;
2223 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2224 return 1;
2225}
2226
2227/*
2228 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
2229 * not a no-CBs CPU.
2230 */
2231static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2232 struct rcu_data *rdp)
2233{
2234 long ql = rsp->qlen;
2235 long qll = rsp->qlen_lazy;
2236
2237 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2238 if (!is_nocb_cpu(smp_processor_id()))
2239 return 0;
2240 rsp->qlen = 0;
2241 rsp->qlen_lazy = 0;
2242
2243 /* First, enqueue the donelist, if any. This preserves CB ordering. */
2244 if (rsp->orphan_donelist != NULL) {
2245 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2246 rsp->orphan_donetail, ql, qll);
2247 ql = qll = 0;
2248 rsp->orphan_donelist = NULL;
2249 rsp->orphan_donetail = &rsp->orphan_donelist;
2250 }
2251 if (rsp->orphan_nxtlist != NULL) {
2252 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2253 rsp->orphan_nxttail, ql, qll);
2254 ql = qll = 0;
2255 rsp->orphan_nxtlist = NULL;
2256 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2257 }
2258 return 1;
2259}
2260
2261/*
2262 * There must be at least one non-no-CBs CPU in operation at any given
2263 * time, because no-CBs CPUs are not capable of initiating grace periods
2264 * independently. This function therefore returns false if the specified
2265 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2266 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2267 * but you have to have a base case!)
2268 */
2269static bool nocb_cpu_expendable(int cpu)
2270{
2271 cpumask_var_t non_nocb_cpus;
2272 int ret;
2273
2274 /*
2275 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
2276 * then offlining this CPU is harmless. Let it happen.
2277 */
2278 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
2279 return 1;
2280
2281 /* If no memory, play it safe and keep the CPU around. */
2282 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
2283 return 0;
2284 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
2285 cpumask_clear_cpu(cpu, non_nocb_cpus);
2286 ret = !cpumask_empty(non_nocb_cpus);
2287 free_cpumask_var(non_nocb_cpus);
2288 return ret;
2289}
2290
2291/*
2292 * Helper structure for remote registry of RCU callbacks.
2293 * This is needed for when a no-CBs CPU needs to start a grace period.
2294 * If it just invokes call_rcu(), the resulting callback will be queued,
2295 * which can result in deadlock.
2296 */
2297struct rcu_head_remote {
2298 struct rcu_head *rhp;
2299 call_rcu_func_t *crf;
2300 void (*func)(struct rcu_head *rhp);
2301};
2302
2303/*
2304 * Register a callback as specified by the rcu_head_remote struct.
2305 * This function is intended to be invoked via smp_call_function_single().
2306 */
2307static void call_rcu_local(void *arg)
2308{
2309 struct rcu_head_remote *rhrp =
2310 container_of(arg, struct rcu_head_remote, rhp);
2311
2312 rhrp->crf(rhrp->rhp, rhrp->func);
2313}
2314
2315/*
2316 * Set up an rcu_head_remote structure and then invoke call_rcu_local()
2317 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2318 * smp_call_function_single().
2319 */
2320static void invoke_crf_remote(struct rcu_head *rhp,
2321 void (*func)(struct rcu_head *rhp),
2322 call_rcu_func_t crf)
2323{
2324 struct rcu_head_remote rhr;
2325
2326 rhr.rhp = rhp;
2327 rhr.crf = crf;
2328 rhr.func = func;
2329 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2330}
2331
2332/*
2333 * Helper functions to be passed to wait_rcu_gp(), each of which
2334 * invokes invoke_crf_remote() to register a callback appropriately.
2335 */
2336static void __maybe_unused
2337call_rcu_preempt_remote(struct rcu_head *rhp,
2338 void (*func)(struct rcu_head *rhp))
2339{
2340 invoke_crf_remote(rhp, func, call_rcu);
2341}
2342static void call_rcu_bh_remote(struct rcu_head *rhp,
2343 void (*func)(struct rcu_head *rhp))
2344{
2345 invoke_crf_remote(rhp, func, call_rcu_bh);
2346}
2347static void call_rcu_sched_remote(struct rcu_head *rhp,
2348 void (*func)(struct rcu_head *rhp))
2349{
2350 invoke_crf_remote(rhp, func, call_rcu_sched);
2351}
2352
2353/*
2354 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2355 * callbacks queued by the corresponding no-CBs CPU.
2356 */
2357static int rcu_nocb_kthread(void *arg)
2358{
2359 int c, cl;
2360 struct rcu_head *list;
2361 struct rcu_head *next;
2362 struct rcu_head **tail;
2363 struct rcu_data *rdp = arg;
2364
2365 /* Each pass through this loop invokes one batch of callbacks */
2366 for (;;) {
2367 /* If not polling, wait for next batch of callbacks. */
2368 if (!rcu_nocb_poll)
2369 wait_event(rdp->nocb_wq, rdp->nocb_head);
2370 list = ACCESS_ONCE(rdp->nocb_head);
2371 if (!list) {
2372 schedule_timeout_interruptible(1);
2373 continue;
2374 }
2375
2376 /*
2377 * Extract queued callbacks, update counts, and wait
2378 * for a grace period to elapse.
2379 */
2380 ACCESS_ONCE(rdp->nocb_head) = NULL;
2381 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2382 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2383 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2384 ACCESS_ONCE(rdp->nocb_p_count) += c;
2385 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2386 wait_rcu_gp(rdp->rsp->call_remote);
2387
2388 /* Each pass through the following loop invokes a callback. */
2389 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2390 c = cl = 0;
2391 while (list) {
2392 next = list->next;
2393 /* Wait for enqueuing to complete, if needed. */
2394 while (next == NULL && &list->next != tail) {
2395 schedule_timeout_interruptible(1);
2396 next = list->next;
2397 }
2398 debug_rcu_head_unqueue(list);
2399 local_bh_disable();
2400 if (__rcu_reclaim(rdp->rsp->name, list))
2401 cl++;
2402 c++;
2403 local_bh_enable();
2404 list = next;
2405 }
2406 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2407 ACCESS_ONCE(rdp->nocb_p_count) -= c;
2408 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
2409 rdp->n_nocbs_invoked += c;
2410 }
2411 return 0;
2412}
2413
2414/* Initialize per-rcu_data variables for no-CBs CPUs. */
2415static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2416{
2417 rdp->nocb_tail = &rdp->nocb_head;
2418 init_waitqueue_head(&rdp->nocb_wq);
2419}
2420
2421/* Create a kthread for each RCU flavor for each no-CBs CPU. */
2422static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2423{
2424 int cpu;
2425 struct rcu_data *rdp;
2426 struct task_struct *t;
2427
2428 if (rcu_nocb_mask == NULL)
2429 return;
2430 for_each_cpu(cpu, rcu_nocb_mask) {
2431 rdp = per_cpu_ptr(rsp->rda, cpu);
2432 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
2433 BUG_ON(IS_ERR(t));
2434 ACCESS_ONCE(rdp->nocb_kthread) = t;
2435 }
2436}
2437
2438/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2439static void init_nocb_callback_list(struct rcu_data *rdp)
2440{
2441 if (rcu_nocb_mask == NULL ||
2442 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2443 return;
2444 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2445}
2446
2447/* Initialize the ->call_remote fields in the rcu_state structures. */
2448static void __init rcu_init_nocb(void)
2449{
2450#ifdef CONFIG_PREEMPT_RCU
2451 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2452#endif /* #ifdef CONFIG_PREEMPT_RCU */
2453 rcu_bh_state.call_remote = call_rcu_bh_remote;
2454 rcu_sched_state.call_remote = call_rcu_sched_remote;
2455}
2456
2457#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2458
2459static bool is_nocb_cpu(int cpu)
2460{
2461 return false;
2462}
2463
2464static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2465 bool lazy)
2466{
2467 return 0;
2468}
2469
2470static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2471 struct rcu_data *rdp)
2472{
2473 return 0;
2474}
2475
2476static bool nocb_cpu_expendable(int cpu)
2477{
2478 return 1;
2479}
2480
2481static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2482{
2483}
2484
2485static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2486{
2487}
2488
2489static void init_nocb_callback_list(struct rcu_data *rdp)
2490{
2491}
2492
2493static void __init rcu_init_nocb(void)
2494{
2495}
2496
2497#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
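
The producer side of the no-CBs machinery above (__call_rcu_nocb_enqueue()) appends with a single xchg() on the tail pointer, and the rcuo kthread drains the whole list with a matching xchg(); no lock is ever taken on the callback list. A stripped-down sketch of that multi-producer/single-consumer pattern, with hypothetical node and queue names; as in rcu_nocb_kthread(), the consumer must tolerate a producer that has swapped the tail but not yet linked its node (hence that function's "wait for enqueuing to complete" loop):

#include <linux/atomic.h>
#include <linux/compiler.h>

struct qnode {
	struct qnode *next;
};

struct mpsc_queue {
	struct qnode *head;		/* drained only by the one consumer */
	struct qnode **tail;		/* &last_node->next, or &head when empty */
};

static void mpsc_init(struct mpsc_queue *q)
{
	q->head = NULL;
	q->tail = &q->head;
}

/* Producers: lockless append from any context. */
static void mpsc_enqueue(struct mpsc_queue *q, struct qnode *n)
{
	struct qnode **old_tail;

	n->next = NULL;
	old_tail = xchg(&q->tail, &n->next);	/* atomically claim the tail */
	ACCESS_ONCE(*old_tail) = n;		/* then publish the link */
}

/* Single consumer: detach everything queued so far, as rcu_nocb_kthread() does. */
static struct qnode *mpsc_drain(struct mpsc_queue *q)
{
	struct qnode *list = ACCESS_ONCE(q->head);

	ACCESS_ONCE(q->head) = NULL;
	(void)xchg(&q->tail, &q->head);		/* queue is now logically empty */
	return list;	/* a trailing ->next may still be in the process of being set */
}
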
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index abffb486e94e..0d095dcaa670 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,29 +46,58 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49static int show_rcubarrier(struct seq_file *m, void *unused) 49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op)
50{ 53{
51 struct rcu_state *rsp; 54 int ret = seq_open(file, op);
55 if (!ret) {
56 struct seq_file *m = (struct seq_file *)file->private_data;
57 m->private = inode->i_private;
58 }
59 return ret;
60}
61
62static void *r_start(struct seq_file *m, loff_t *pos)
63{
64 struct rcu_state *rsp = (struct rcu_state *)m->private;
65 *pos = cpumask_next(*pos - 1, cpu_possible_mask);
66 if ((*pos) < nr_cpu_ids)
67 return per_cpu_ptr(rsp->rda, *pos);
68 return NULL;
69}
52 70
53 for_each_rcu_flavor(rsp) 71static void *r_next(struct seq_file *m, void *v, loff_t *pos)
54 seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", 72{
55 rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', 73 (*pos)++;
56 atomic_read(&rsp->barrier_cpu_count), 74 return r_start(m, pos);
57 rsp->n_barrier_done); 75}
76
77static void r_stop(struct seq_file *m, void *v)
78{
79}
80
81static int show_rcubarrier(struct seq_file *m, void *v)
82{
83 struct rcu_state *rsp = (struct rcu_state *)m->private;
84 seq_printf(m, "bcc: %d nbd: %lu\n",
85 atomic_read(&rsp->barrier_cpu_count),
86 rsp->n_barrier_done);
58 return 0; 87 return 0;
59} 88}
60 89
61static int rcubarrier_open(struct inode *inode, struct file *file) 90static int rcubarrier_open(struct inode *inode, struct file *file)
62{ 91{
63 return single_open(file, show_rcubarrier, NULL); 92 return single_open(file, show_rcubarrier, inode->i_private);
64} 93}
65 94
66static const struct file_operations rcubarrier_fops = { 95static const struct file_operations rcubarrier_fops = {
67 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
68 .open = rcubarrier_open, 97 .open = rcubarrier_open,
69 .read = seq_read, 98 .read = seq_read,
70 .llseek = seq_lseek, 99 .llseek = no_llseek,
71 .release = single_release, 100 .release = seq_release,
72}; 101};
73 102
74#ifdef CONFIG_RCU_BOOST 103#ifdef CONFIG_RCU_BOOST
@@ -84,22 +113,26 @@ static char convert_kthread_status(unsigned int kthread_status)
84 113
85static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 114static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
86{ 115{
116 long ql, qll;
117
87 if (!rdp->beenonline) 118 if (!rdp->beenonline)
88 return; 119 return;
89 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
90 rdp->cpu, 121 rdp->cpu,
91 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
92 rdp->completed, rdp->gpnum, 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
93 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 124 rdp->passed_quiesce, rdp->qs_pending);
94 rdp->qs_pending);
95 seq_printf(m, " dt=%d/%llx/%d df=%lu", 125 seq_printf(m, " dt=%d/%llx/%d df=%lu",
96 atomic_read(&rdp->dynticks->dynticks), 126 atomic_read(&rdp->dynticks->dynticks),
97 rdp->dynticks->dynticks_nesting, 127 rdp->dynticks->dynticks_nesting,
98 rdp->dynticks->dynticks_nmi_nesting, 128 rdp->dynticks->dynticks_nmi_nesting,
99 rdp->dynticks_fqs); 129 rdp->dynticks_fqs);
100 seq_printf(m, " of=%lu", rdp->offline_fqs); 130 seq_printf(m, " of=%lu", rdp->offline_fqs);
131 rcu_nocb_q_lengths(rdp, &ql, &qll);
132 qll += rdp->qlen_lazy;
133 ql += rdp->qlen;
101 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", 134 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
102 rdp->qlen_lazy, rdp->qlen, 135 qll, ql,
103 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 136 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
104 rdp->nxttail[RCU_NEXT_TAIL]], 137 rdp->nxttail[RCU_NEXT_TAIL]],
105 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 138 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -108,110 +141,74 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
108 rdp->nxttail[RCU_WAIT_TAIL]], 141 rdp->nxttail[RCU_WAIT_TAIL]],
109 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); 142 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
110#ifdef CONFIG_RCU_BOOST 143#ifdef CONFIG_RCU_BOOST
111 seq_printf(m, " kt=%d/%c/%d ktl=%x", 144 seq_printf(m, " kt=%d/%c ktl=%x",
112 per_cpu(rcu_cpu_has_work, rdp->cpu), 145 per_cpu(rcu_cpu_has_work, rdp->cpu),
113 convert_kthread_status(per_cpu(rcu_cpu_kthread_status, 146 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
114 rdp->cpu)), 147 rdp->cpu)),
115 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
116 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); 148 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
117#endif /* #ifdef CONFIG_RCU_BOOST */ 149#endif /* #ifdef CONFIG_RCU_BOOST */
118 seq_printf(m, " b=%ld", rdp->blimit); 150 seq_printf(m, " b=%ld", rdp->blimit);
119 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 151 seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
120 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 152 rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
153 rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
121} 154}
122 155
123static int show_rcudata(struct seq_file *m, void *unused) 156static int show_rcudata(struct seq_file *m, void *v)
124{ 157{
125 int cpu; 158 print_one_rcu_data(m, (struct rcu_data *)v);
126 struct rcu_state *rsp;
127
128 for_each_rcu_flavor(rsp) {
129 seq_printf(m, "%s:\n", rsp->name);
130 for_each_possible_cpu(cpu)
131 print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
132 }
133 return 0; 159 return 0;
134} 160}
135 161
162static const struct seq_operations rcudate_op = {
163 .start = r_start,
164 .next = r_next,
165 .stop = r_stop,
166 .show = show_rcudata,
167};
168
136static int rcudata_open(struct inode *inode, struct file *file) 169static int rcudata_open(struct inode *inode, struct file *file)
137{ 170{
138 return single_open(file, show_rcudata, NULL); 171 return r_open(inode, file, &rcudate_op);
139} 172}
140 173
141static const struct file_operations rcudata_fops = { 174static const struct file_operations rcudata_fops = {
142 .owner = THIS_MODULE, 175 .owner = THIS_MODULE,
143 .open = rcudata_open, 176 .open = rcudata_open,
144 .read = seq_read, 177 .read = seq_read,
145 .llseek = seq_lseek, 178 .llseek = no_llseek,
146 .release = single_release, 179 .release = seq_release,
147}; 180};
148 181
149static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) 182static int show_rcuexp(struct seq_file *m, void *v)
150{ 183{
151 if (!rdp->beenonline) 184 struct rcu_state *rsp = (struct rcu_state *)m->private;
152 return; 185
153 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", 186 seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
154 rdp->cpu, 187 atomic_long_read(&rsp->expedited_start),
155 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 188 atomic_long_read(&rsp->expedited_done),
156 rdp->completed, rdp->gpnum, 189 atomic_long_read(&rsp->expedited_wrap),
157 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 190 atomic_long_read(&rsp->expedited_tryfail),
158 rdp->qs_pending); 191 atomic_long_read(&rsp->expedited_workdone1),
159 seq_printf(m, ",%d,%llx,%d,%lu", 192 atomic_long_read(&rsp->expedited_workdone2),
160 atomic_read(&rdp->dynticks->dynticks), 193 atomic_long_read(&rsp->expedited_normal),
161 rdp->dynticks->dynticks_nesting, 194 atomic_long_read(&rsp->expedited_stoppedcpus),
162 rdp->dynticks->dynticks_nmi_nesting, 195 atomic_long_read(&rsp->expedited_done_tries),
163 rdp->dynticks_fqs); 196 atomic_long_read(&rsp->expedited_done_lost),
164 seq_printf(m, ",%lu", rdp->offline_fqs); 197 atomic_long_read(&rsp->expedited_done_exit));
165 seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,
166 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
167 rdp->nxttail[RCU_NEXT_TAIL]],
168 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
169 rdp->nxttail[RCU_NEXT_READY_TAIL]],
170 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
171 rdp->nxttail[RCU_WAIT_TAIL]],
172 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
173#ifdef CONFIG_RCU_BOOST
174 seq_printf(m, ",%d,\"%c\"",
175 per_cpu(rcu_cpu_has_work, rdp->cpu),
176 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
177 rdp->cpu)));
178#endif /* #ifdef CONFIG_RCU_BOOST */
179 seq_printf(m, ",%ld", rdp->blimit);
180 seq_printf(m, ",%lu,%lu,%lu\n",
181 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
182}
183
184static int show_rcudata_csv(struct seq_file *m, void *unused)
185{
186 int cpu;
187 struct rcu_state *rsp;
188
189 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
190 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
191 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
192#ifdef CONFIG_RCU_BOOST
193 seq_puts(m, "\"kt\",\"ktl\"");
194#endif /* #ifdef CONFIG_RCU_BOOST */
195 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
196 for_each_rcu_flavor(rsp) {
197 seq_printf(m, "\"%s:\"\n", rsp->name);
198 for_each_possible_cpu(cpu)
199 print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu));
200 }
201 return 0; 198 return 0;
202} 199}
203 200
204static int rcudata_csv_open(struct inode *inode, struct file *file) 201static int rcuexp_open(struct inode *inode, struct file *file)
205{ 202{
206 return single_open(file, show_rcudata_csv, NULL); 203 return single_open(file, show_rcuexp, inode->i_private);
207} 204}
208 205
209static const struct file_operations rcudata_csv_fops = { 206static const struct file_operations rcuexp_fops = {
210 .owner = THIS_MODULE, 207 .owner = THIS_MODULE,
211 .open = rcudata_csv_open, 208 .open = rcuexp_open,
212 .read = seq_read, 209 .read = seq_read,
213 .llseek = seq_lseek, 210 .llseek = no_llseek,
214 .release = single_release, 211 .release = seq_release,
215}; 212};
216 213
217#ifdef CONFIG_RCU_BOOST 214#ifdef CONFIG_RCU_BOOST
@@ -257,27 +254,11 @@ static const struct file_operations rcu_node_boost_fops = {
257 .owner = THIS_MODULE, 254 .owner = THIS_MODULE,
258 .open = rcu_node_boost_open, 255 .open = rcu_node_boost_open,
259 .read = seq_read, 256 .read = seq_read,
260 .llseek = seq_lseek, 257 .llseek = no_llseek,
261 .release = single_release, 258 .release = single_release,
262}; 259};
263 260
264/* 261#endif /* #ifdef CONFIG_RCU_BOOST */
265 * Create the rcuboost debugfs entry. Standard error return.
266 */
267static int rcu_boost_trace_create_file(struct dentry *rcudir)
268{
269 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
270 &rcu_node_boost_fops);
271}
272
273#else /* #ifdef CONFIG_RCU_BOOST */
274
275static int rcu_boost_trace_create_file(struct dentry *rcudir)
276{
277 return 0; /* There cannot be an error if we didn't create it! */
278}
279
280#endif /* #else #ifdef CONFIG_RCU_BOOST */
281 262
282static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283{ 264{
@@ -286,8 +267,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
286 struct rcu_node *rnp; 267 struct rcu_node *rnp;
287 268
288 gpnum = rsp->gpnum; 269 gpnum = rsp->gpnum;
289 seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", 270 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
290 rsp->name, rsp->completed, gpnum, rsp->fqs_state, 271 ulong2long(rsp->completed), ulong2long(gpnum),
272 rsp->fqs_state,
291 (long)(rsp->jiffies_force_qs - jiffies), 273 (long)(rsp->jiffies_force_qs - jiffies),
292 (int)(jiffies & 0xffff)); 274 (int)(jiffies & 0xffff));
293 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 275 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
@@ -309,26 +291,24 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
309 seq_puts(m, "\n"); 291 seq_puts(m, "\n");
310} 292}
311 293
312static int show_rcuhier(struct seq_file *m, void *unused) 294static int show_rcuhier(struct seq_file *m, void *v)
313{ 295{
314 struct rcu_state *rsp; 296 struct rcu_state *rsp = (struct rcu_state *)m->private;
315 297 print_one_rcu_state(m, rsp);
316 for_each_rcu_flavor(rsp)
317 print_one_rcu_state(m, rsp);
318 return 0; 298 return 0;
319} 299}
320 300
321static int rcuhier_open(struct inode *inode, struct file *file) 301static int rcuhier_open(struct inode *inode, struct file *file)
322{ 302{
323 return single_open(file, show_rcuhier, NULL); 303 return single_open(file, show_rcuhier, inode->i_private);
324} 304}
325 305
326static const struct file_operations rcuhier_fops = { 306static const struct file_operations rcuhier_fops = {
327 .owner = THIS_MODULE, 307 .owner = THIS_MODULE,
328 .open = rcuhier_open, 308 .open = rcuhier_open,
329 .read = seq_read, 309 .read = seq_read,
330 .llseek = seq_lseek, 310 .llseek = no_llseek,
331 .release = single_release, 311 .release = seq_release,
332}; 312};
333 313
334static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 314static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -341,42 +321,42 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
341 struct rcu_node *rnp = &rsp->node[0]; 321 struct rcu_node *rnp = &rsp->node[0];
342 322
343 raw_spin_lock_irqsave(&rnp->lock, flags); 323 raw_spin_lock_irqsave(&rnp->lock, flags);
344 completed = rsp->completed; 324 completed = ACCESS_ONCE(rsp->completed);
345 gpnum = rsp->gpnum; 325 gpnum = ACCESS_ONCE(rsp->gpnum);
346 if (rsp->completed == rsp->gpnum) 326 if (completed == gpnum)
347 gpage = 0; 327 gpage = 0;
348 else 328 else
349 gpage = jiffies - rsp->gp_start; 329 gpage = jiffies - rsp->gp_start;
350 gpmax = rsp->gp_max; 330 gpmax = rsp->gp_max;
351 raw_spin_unlock_irqrestore(&rnp->lock, flags); 331 raw_spin_unlock_irqrestore(&rnp->lock, flags);
352 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", 332 seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
353 rsp->name, completed, gpnum, gpage, gpmax); 333 ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
354} 334}
355 335
356static int show_rcugp(struct seq_file *m, void *unused) 336static int show_rcugp(struct seq_file *m, void *v)
357{ 337{
358 struct rcu_state *rsp; 338 struct rcu_state *rsp = (struct rcu_state *)m->private;
359 339 show_one_rcugp(m, rsp);
360 for_each_rcu_flavor(rsp)
361 show_one_rcugp(m, rsp);
362 return 0; 340 return 0;
363} 341}
364 342
365static int rcugp_open(struct inode *inode, struct file *file) 343static int rcugp_open(struct inode *inode, struct file *file)
366{ 344{
367 return single_open(file, show_rcugp, NULL); 345 return single_open(file, show_rcugp, inode->i_private);
368} 346}
369 347
370static const struct file_operations rcugp_fops = { 348static const struct file_operations rcugp_fops = {
371 .owner = THIS_MODULE, 349 .owner = THIS_MODULE,
372 .open = rcugp_open, 350 .open = rcugp_open,
373 .read = seq_read, 351 .read = seq_read,
374 .llseek = seq_lseek, 352 .llseek = no_llseek,
375 .release = single_release, 353 .release = seq_release,
376}; 354};
377 355
378static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 356static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
379{ 357{
358 if (!rdp->beenonline)
359 return;
380 seq_printf(m, "%3d%cnp=%ld ", 360 seq_printf(m, "%3d%cnp=%ld ",
381 rdp->cpu, 361 rdp->cpu,
382 cpu_is_offline(rdp->cpu) ? '!' : ' ', 362 cpu_is_offline(rdp->cpu) ? '!' : ' ',
@@ -386,41 +366,36 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
386 rdp->n_rp_report_qs, 366 rdp->n_rp_report_qs,
387 rdp->n_rp_cb_ready, 367 rdp->n_rp_cb_ready,
388 rdp->n_rp_cpu_needs_gp); 368 rdp->n_rp_cpu_needs_gp);
389 seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", 369 seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n",
390 rdp->n_rp_gp_completed, 370 rdp->n_rp_gp_completed,
391 rdp->n_rp_gp_started, 371 rdp->n_rp_gp_started,
392 rdp->n_rp_need_fqs,
393 rdp->n_rp_need_nothing); 372 rdp->n_rp_need_nothing);
394} 373}
395 374
396static int show_rcu_pending(struct seq_file *m, void *unused) 375static int show_rcu_pending(struct seq_file *m, void *v)
397{ 376{
398 int cpu; 377 print_one_rcu_pending(m, (struct rcu_data *)v);
399 struct rcu_data *rdp;
400 struct rcu_state *rsp;
401
402 for_each_rcu_flavor(rsp) {
403 seq_printf(m, "%s:\n", rsp->name);
404 for_each_possible_cpu(cpu) {
405 rdp = per_cpu_ptr(rsp->rda, cpu);
406 if (rdp->beenonline)
407 print_one_rcu_pending(m, rdp);
408 }
409 }
410 return 0; 378 return 0;
411} 379}
412 380
381static const struct seq_operations rcu_pending_op = {
382 .start = r_start,
383 .next = r_next,
384 .stop = r_stop,
385 .show = show_rcu_pending,
386};
387
413static int rcu_pending_open(struct inode *inode, struct file *file) 388static int rcu_pending_open(struct inode *inode, struct file *file)
414{ 389{
415 return single_open(file, show_rcu_pending, NULL); 390 return r_open(inode, file, &rcu_pending_op);
416} 391}
417 392
418static const struct file_operations rcu_pending_fops = { 393static const struct file_operations rcu_pending_fops = {
419 .owner = THIS_MODULE, 394 .owner = THIS_MODULE,
420 .open = rcu_pending_open, 395 .open = rcu_pending_open,
421 .read = seq_read, 396 .read = seq_read,
422 .llseek = seq_lseek, 397 .llseek = no_llseek,
423 .release = single_release, 398 .release = seq_release,
424}; 399};
425 400
426static int show_rcutorture(struct seq_file *m, void *unused) 401static int show_rcutorture(struct seq_file *m, void *unused)
@@ -450,43 +425,58 @@ static struct dentry *rcudir;
450 425
451static int __init rcutree_trace_init(void) 426static int __init rcutree_trace_init(void)
452{ 427{
428 struct rcu_state *rsp;
453 struct dentry *retval; 429 struct dentry *retval;
430 struct dentry *rspdir;
454 431
455 rcudir = debugfs_create_dir("rcu", NULL); 432 rcudir = debugfs_create_dir("rcu", NULL);
456 if (!rcudir) 433 if (!rcudir)
457 goto free_out; 434 goto free_out;
458 435
459 retval = debugfs_create_file("rcubarrier", 0444, rcudir, 436 for_each_rcu_flavor(rsp) {
460 NULL, &rcubarrier_fops); 437 rspdir = debugfs_create_dir(rsp->name, rcudir);
461 if (!retval) 438 if (!rspdir)
462 goto free_out; 439 goto free_out;
463 440
464 retval = debugfs_create_file("rcudata", 0444, rcudir, 441 retval = debugfs_create_file("rcudata", 0444,
465 NULL, &rcudata_fops); 442 rspdir, rsp, &rcudata_fops);
466 if (!retval) 443 if (!retval)
467 goto free_out; 444 goto free_out;
468 445
469 retval = debugfs_create_file("rcudata.csv", 0444, rcudir, 446 retval = debugfs_create_file("rcuexp", 0444,
470 NULL, &rcudata_csv_fops); 447 rspdir, rsp, &rcuexp_fops);
471 if (!retval) 448 if (!retval)
472 goto free_out; 449 goto free_out;
473 450
474 if (rcu_boost_trace_create_file(rcudir)) 451 retval = debugfs_create_file("rcu_pending", 0444,
475 goto free_out; 452 rspdir, rsp, &rcu_pending_fops);
453 if (!retval)
454 goto free_out;
455
456 retval = debugfs_create_file("rcubarrier", 0444,
457 rspdir, rsp, &rcubarrier_fops);
458 if (!retval)
459 goto free_out;
476 460
477 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 461#ifdef CONFIG_RCU_BOOST
478 if (!retval) 462 if (rsp == &rcu_preempt_state) {
479 goto free_out; 463 retval = debugfs_create_file("rcuboost", 0444,
464 rspdir, NULL, &rcu_node_boost_fops);
465 if (!retval)
466 goto free_out;
467 }
468#endif
480 469
481 retval = debugfs_create_file("rcuhier", 0444, rcudir, 470 retval = debugfs_create_file("rcugp", 0444,
482 NULL, &rcuhier_fops); 471 rspdir, rsp, &rcugp_fops);
483 if (!retval) 472 if (!retval)
484 goto free_out; 473 goto free_out;
485 474
486 retval = debugfs_create_file("rcu_pending", 0444, rcudir, 475 retval = debugfs_create_file("rcuhier", 0444,
487 NULL, &rcu_pending_fops); 476 rspdir, rsp, &rcuhier_fops);
488 if (!retval) 477 if (!retval)
489 goto free_out; 478 goto free_out;
479 }
490 480
491 retval = debugfs_create_file("rcutorture", 0444, rcudir, 481 retval = debugfs_create_file("rcutorture", 0444, rcudir,
492 NULL, &rcutorture_fops); 482 NULL, &rcutorture_fops);
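
The rcudata and rcu_pending files above move from single_open() dumps that looped over every RCU flavor to one debugfs file per rcu_state, driven by a seq_file iterator whose position is a CPU number and whose per-file state arrives via inode->i_private. Below is a minimal sketch of that same pattern for a hypothetical per-CPU counter; the demo_* names are invented, while seq_open(), cpumask_next(), per_cpu_ptr() and the fops wiring are used the same way the patch uses them.

#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/seq_file.h>

static DEFINE_PER_CPU(unsigned long, demo_stat);

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	*pos = cpumask_next(*pos - 1, cpu_possible_mask);
	if (*pos < nr_cpu_ids)
		return per_cpu_ptr(&demo_stat, *pos);
	return NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return demo_start(m, pos);
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s: %lu\n",
		   (const char *)m->private, *(unsigned long *)v);
	return 0;
}

static const struct seq_operations demo_op = {
	.start = demo_start,
	.next  = demo_next,
	.stop  = demo_stop,
	.show  = demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
	int ret = seq_open(file, &demo_op);

	if (!ret) {
		struct seq_file *m = file->private_data;

		m->private = inode->i_private;	/* per-file state, like rsp */
	}
	return ret;
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = no_llseek,
	.release = seq_release,
};

static int __init demo_init(void)
{
	/* The data argument shows up later as inode->i_private. */
	debugfs_create_file("demo_percpu", 0444, NULL,
			    (void *)"demo_flavor", &demo_fops);
	return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");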
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ad581aa2369a..ff55247e7049 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
86 return __res_counter_charge(counter, val, limit_fail_at, true); 86 return __res_counter_charge(counter, val, limit_fail_at, true);
87} 87}
88 88
89void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 89u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
90{ 90{
91 if (WARN_ON(counter->usage < val)) 91 if (WARN_ON(counter->usage < val))
92 val = counter->usage; 92 val = counter->usage;
93 93
94 counter->usage -= val; 94 counter->usage -= val;
95 return counter->usage;
95} 96}
96 97
97void res_counter_uncharge_until(struct res_counter *counter, 98u64 res_counter_uncharge_until(struct res_counter *counter,
98 struct res_counter *top, 99 struct res_counter *top,
99 unsigned long val) 100 unsigned long val)
100{ 101{
101 unsigned long flags; 102 unsigned long flags;
102 struct res_counter *c; 103 struct res_counter *c;
104 u64 ret = 0;
103 105
104 local_irq_save(flags); 106 local_irq_save(flags);
105 for (c = counter; c != top; c = c->parent) { 107 for (c = counter; c != top; c = c->parent) {
108 u64 r;
106 spin_lock(&c->lock); 109 spin_lock(&c->lock);
107 res_counter_uncharge_locked(c, val); 110 r = res_counter_uncharge_locked(c, val);
111 if (c == counter)
112 ret = r;
108 spin_unlock(&c->lock); 113 spin_unlock(&c->lock);
109 } 114 }
110 local_irq_restore(flags); 115 local_irq_restore(flags);
116 return ret;
111} 117}
112 118
113void res_counter_uncharge(struct res_counter *counter, unsigned long val) 119u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
114{ 120{
115 res_counter_uncharge_until(counter, NULL, val); 121 return res_counter_uncharge_until(counter, NULL, val);
116} 122}
117 123
118static inline unsigned long long * 124static inline unsigned long long *
@@ -192,25 +198,3 @@ int res_counter_memparse_write_strategy(const char *buf,
192 *res = PAGE_ALIGN(*res); 198 *res = PAGE_ALIGN(*res);
193 return 0; 199 return 0;
194} 200}
195
196int res_counter_write(struct res_counter *counter, int member,
197 const char *buf, write_strategy_fn write_strategy)
198{
199 char *end;
200 unsigned long flags;
201 unsigned long long tmp, *val;
202
203 if (write_strategy) {
204 if (write_strategy(buf, &tmp))
205 return -EINVAL;
206 } else {
207 tmp = simple_strtoull(buf, &end, 10);
208 if (*end != '\0')
209 return -EINVAL;
210 }
211 spin_lock_irqsave(&counter->lock, flags);
212 val = res_counter_member(counter, member);
213 *val = tmp;
214 spin_unlock_irqrestore(&counter->lock, flags);
215 return 0;
216}
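
res_counter_uncharge_locked(), res_counter_uncharge_until() and res_counter_uncharge() now report the counter's remaining usage instead of returning void, so a caller can tell whether an uncharge drained the counter without re-reading it under the lock. A small self-contained userspace model of the new semantics (plain C, no locking, all names invented here):

#include <stdio.h>

struct counter {
	unsigned long usage;
	struct counter *parent;
};

/* Walk from c up to (but not including) top, uncharging val at each level,
 * and return the new usage of the counter the walk started from. */
static unsigned long uncharge_until(struct counter *c, struct counter *top,
				    unsigned long val)
{
	unsigned long ret = 0;
	struct counter *p;

	for (p = c; p != top; p = p->parent) {
		unsigned long v = val;

		if (p->usage < v)	/* mirrors the WARN_ON() clamp above */
			v = p->usage;
		p->usage -= v;
		if (p == c)		/* only the starting counter is reported */
			ret = p->usage;
	}
	return ret;
}

int main(void)
{
	struct counter root  = { .usage = 300, .parent = NULL };
	struct counter child = { .usage = 100, .parent = &root };

	/* The caller can now tell in one call that the child drained to zero. */
	printf("child after uncharge: %lu\n", uncharge_until(&child, NULL, 100));
	printf("root  after uncharge: %lu\n", root.usage);	/* 200 */
	return 0;
}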
diff --git a/kernel/resource.c b/kernel/resource.c
index 34d45886ee84..73f35d4b30b9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root,
763 struct resource *parent = root; 763 struct resource *parent = root;
764 struct resource *conflict; 764 struct resource *conflict;
765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); 765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
766 struct resource *next_res = NULL;
766 767
767 if (!res) 768 if (!res)
768 return; 769 return;
@@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root,
772 res->end = end; 773 res->end = end;
773 res->flags = IORESOURCE_BUSY; 774 res->flags = IORESOURCE_BUSY;
774 775
775 conflict = __request_resource(parent, res); 776 while (1) {
776 if (!conflict)
777 return;
778 777
779 /* failed, split and try again */ 778 conflict = __request_resource(parent, res);
780 kfree(res); 779 if (!conflict) {
780 if (!next_res)
781 break;
782 res = next_res;
783 next_res = NULL;
784 continue;
785 }
781 786
782 /* conflict covered whole area */ 787 /* conflict covered whole area */
783 if (conflict->start <= start && conflict->end >= end) 788 if (conflict->start <= res->start &&
784 return; 789 conflict->end >= res->end) {
790 kfree(res);
791 WARN_ON(next_res);
792 break;
793 }
794
795 /* failed, split and try again */
796 if (conflict->start > res->start) {
797 end = res->end;
798 res->end = conflict->start - 1;
799 if (conflict->end < end) {
800 next_res = kzalloc(sizeof(*next_res),
801 GFP_ATOMIC);
802 if (!next_res) {
803 kfree(res);
804 break;
805 }
806 next_res->name = name;
807 next_res->start = conflict->end + 1;
808 next_res->end = end;
809 next_res->flags = IORESOURCE_BUSY;
810 }
811 } else {
812 res->start = conflict->end + 1;
813 }
814 }
785 815
786 if (conflict->start > start)
787 __reserve_region_with_split(root, start, conflict->start-1, name);
788 if (conflict->end < end)
789 __reserve_region_with_split(root, conflict->end+1, end, name);
790} 816}
791 817
792void __init reserve_region_with_split(struct resource *root, 818void __init reserve_region_with_split(struct resource *root,
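
__reserve_region_with_split() used to recurse once for the region left of a conflict and once for the region right of it; with many conflicting ranges the recursion can get deep, so it is rewritten above as a loop that retries the left-hand piece immediately and parks at most one right-hand remainder in next_res. The following is a userspace model of that control flow only, with a single hard-coded busy range standing in for __request_resource(); every name here is invented.

#include <stdio.h>
#include <stdlib.h>

struct range { long start, end; };

/* Stand-in for __request_resource(): one pre-existing busy region. */
static const struct range busy = { 40, 60 };

static int request_range(const struct range *r, struct range *conflict)
{
	if (r->end < busy.start || r->start > busy.end)
		return 0;		/* no conflict: request succeeds */
	*conflict = busy;
	return 1;
}

int main(void)
{
	struct range *res = malloc(sizeof(*res));
	struct range *next_res = NULL;
	struct range conflict;

	if (!res)
		return 1;
	res->start = 0;
	res->end = 100;

	while (1) {
		if (!request_range(res, &conflict)) {
			printf("reserved [%ld, %ld]\n", res->start, res->end);
			free(res);	/* the kernel keeps it in the tree; demo only */
			if (!next_res)
				break;
			res = next_res;	/* retry the parked right-hand piece */
			next_res = NULL;
			continue;
		}
		if (conflict.start <= res->start && conflict.end >= res->end) {
			free(res);	/* whole request already covered */
			break;		/* (the kernel WARN_ONs if next_res is set) */
		}
		if (conflict.start > res->start) {
			long end = res->end;

			res->end = conflict.start - 1;	/* keep the left piece now */
			if (conflict.end < end) {	/* park the right piece */
				next_res = malloc(sizeof(*next_res));
				if (!next_res) {
					free(res);
					break;
				}
				next_res->start = conflict.end + 1;
				next_res->end = end;
			}
		} else {
			res->start = conflict.end + 1;	/* only a right piece left */
		}
	}
	return 0;
}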
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 173ea52f3af0..f06d249e103b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o 15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 649c9f876cb1..257002c13bb0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,6 +72,7 @@
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h> 74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
75 76
76#include <asm/switch_to.h> 77#include <asm/switch_to.h>
77#include <asm/tlb.h> 78#include <asm/tlb.h>
@@ -192,23 +193,10 @@ static void sched_feat_disable(int i) { };
192static void sched_feat_enable(int i) { }; 193static void sched_feat_enable(int i) { };
193#endif /* HAVE_JUMP_LABEL */ 194#endif /* HAVE_JUMP_LABEL */
194 195
195static ssize_t 196static int sched_feat_set(char *cmp)
196sched_feat_write(struct file *filp, const char __user *ubuf,
197 size_t cnt, loff_t *ppos)
198{ 197{
199 char buf[64];
200 char *cmp;
201 int neg = 0;
202 int i; 198 int i;
203 199 int neg = 0;
204 if (cnt > 63)
205 cnt = 63;
206
207 if (copy_from_user(&buf, ubuf, cnt))
208 return -EFAULT;
209
210 buf[cnt] = 0;
211 cmp = strstrip(buf);
212 200
213 if (strncmp(cmp, "NO_", 3) == 0) { 201 if (strncmp(cmp, "NO_", 3) == 0) {
214 neg = 1; 202 neg = 1;
@@ -228,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
228 } 216 }
229 } 217 }
230 218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
231 if (i == __SCHED_FEAT_NR) 240 if (i == __SCHED_FEAT_NR)
232 return -EINVAL; 241 return -EINVAL;
233 242
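
The parsing loop of the sched_features write handler is factored out into sched_feat_set(), which takes a plain string, so the same toggle logic can be reused from inside the kernel; set_numabalancing_state() further down calls it with "NUMA"/"NO_NUMA". A userspace sketch of the extracted helper's behaviour, using an invented two-entry feature table:

#include <stdio.h>
#include <string.h>

static const char * const feat_names[] = { "GENTLE_FAIR_SLEEPERS", "NUMA" };
static unsigned int feat_mask = 0x1;

#define NR_FEATS (sizeof(feat_names) / sizeof(feat_names[0]))

/* Returns the matched index, or NR_FEATS if nothing matched. */
static unsigned int feat_set(const char *cmp)
{
	unsigned int i, neg = 0;

	if (!strncmp(cmp, "NO_", 3)) {
		neg = 1;
		cmp += 3;
	}
	for (i = 0; i < NR_FEATS; i++) {
		if (!strcmp(cmp, feat_names[i])) {
			if (neg)
				feat_mask &= ~(1u << i);
			else
				feat_mask |= 1u << i;
			break;
		}
	}
	return i;
}

int main(void)
{
	feat_set("NUMA");	/* what a write of "NUMA" ends up doing */
	feat_set("NO_NUMA");	/* what set_numabalancing_state(false) reuses */
	printf("mask=%#x unknown-rejected=%d\n",
	       feat_mask, feat_set("BOGUS") == NR_FEATS);
	return 0;
}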
@@ -505,7 +514,7 @@ static inline void init_hrtick(void)
505#ifdef CONFIG_SMP 514#ifdef CONFIG_SMP
506 515
507#ifndef tsk_is_polling 516#ifndef tsk_is_polling
508#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 517#define tsk_is_polling(t) 0
509#endif 518#endif
510 519
511void resched_task(struct task_struct *p) 520void resched_task(struct task_struct *p)
@@ -740,126 +749,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
740 dequeue_task(rq, p, flags); 749 dequeue_task(rq, p, flags);
741} 750}
742 751
743#ifdef CONFIG_IRQ_TIME_ACCOUNTING
744
745/*
746 * There are no locks covering percpu hardirq/softirq time.
747 * They are only modified in account_system_vtime, on corresponding CPU
748 * with interrupts disabled. So, writes are safe.
749 * They are read and saved off onto struct rq in update_rq_clock().
750 * This may result in other CPU reading this CPU's irq time and can
751 * race with irq/account_system_vtime on this CPU. We would either get old
752 * or new value with a side effect of accounting a slice of irq time to wrong
753 * task when irq is in progress while we read rq->clock. That is a worthy
754 * compromise in place of having locks on each irq in account_system_time.
755 */
756static DEFINE_PER_CPU(u64, cpu_hardirq_time);
757static DEFINE_PER_CPU(u64, cpu_softirq_time);
758
759static DEFINE_PER_CPU(u64, irq_start_time);
760static int sched_clock_irqtime;
761
762void enable_sched_clock_irqtime(void)
763{
764 sched_clock_irqtime = 1;
765}
766
767void disable_sched_clock_irqtime(void)
768{
769 sched_clock_irqtime = 0;
770}
771
772#ifndef CONFIG_64BIT
773static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
774
775static inline void irq_time_write_begin(void)
776{
777 __this_cpu_inc(irq_time_seq.sequence);
778 smp_wmb();
779}
780
781static inline void irq_time_write_end(void)
782{
783 smp_wmb();
784 __this_cpu_inc(irq_time_seq.sequence);
785}
786
787static inline u64 irq_time_read(int cpu)
788{
789 u64 irq_time;
790 unsigned seq;
791
792 do {
793 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
794 irq_time = per_cpu(cpu_softirq_time, cpu) +
795 per_cpu(cpu_hardirq_time, cpu);
796 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
797
798 return irq_time;
799}
800#else /* CONFIG_64BIT */
801static inline void irq_time_write_begin(void)
802{
803}
804
805static inline void irq_time_write_end(void)
806{
807}
808
809static inline u64 irq_time_read(int cpu)
810{
811 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
812}
813#endif /* CONFIG_64BIT */
814
815/*
816 * Called before incrementing preempt_count on {soft,}irq_enter
817 * and before decrementing preempt_count on {soft,}irq_exit.
818 */
819void account_system_vtime(struct task_struct *curr)
820{
821 unsigned long flags;
822 s64 delta;
823 int cpu;
824
825 if (!sched_clock_irqtime)
826 return;
827
828 local_irq_save(flags);
829
830 cpu = smp_processor_id();
831 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
832 __this_cpu_add(irq_start_time, delta);
833
834 irq_time_write_begin();
835 /*
836 * We do not account for softirq time from ksoftirqd here.
837 * We want to continue accounting softirq time to ksoftirqd thread
838 * in that case, so as not to confuse scheduler with a special task
839 * that do not consume any time, but still wants to run.
840 */
841 if (hardirq_count())
842 __this_cpu_add(cpu_hardirq_time, delta);
843 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
844 __this_cpu_add(cpu_softirq_time, delta);
845
846 irq_time_write_end();
847 local_irq_restore(flags);
848}
849EXPORT_SYMBOL_GPL(account_system_vtime);
850
851#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
852
853#ifdef CONFIG_PARAVIRT
854static inline u64 steal_ticks(u64 steal)
855{
856 if (unlikely(steal > NSEC_PER_SEC))
857 return div_u64(steal, TICK_NSEC);
858
859 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
860}
861#endif
862
863static void update_rq_clock_task(struct rq *rq, s64 delta) 752static void update_rq_clock_task(struct rq *rq, s64 delta)
864{ 753{
865/* 754/*
@@ -920,43 +809,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
920#endif 809#endif
921} 810}
922 811
923#ifdef CONFIG_IRQ_TIME_ACCOUNTING
924static int irqtime_account_hi_update(void)
925{
926 u64 *cpustat = kcpustat_this_cpu->cpustat;
927 unsigned long flags;
928 u64 latest_ns;
929 int ret = 0;
930
931 local_irq_save(flags);
932 latest_ns = this_cpu_read(cpu_hardirq_time);
933 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
934 ret = 1;
935 local_irq_restore(flags);
936 return ret;
937}
938
939static int irqtime_account_si_update(void)
940{
941 u64 *cpustat = kcpustat_this_cpu->cpustat;
942 unsigned long flags;
943 u64 latest_ns;
944 int ret = 0;
945
946 local_irq_save(flags);
947 latest_ns = this_cpu_read(cpu_softirq_time);
948 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
949 ret = 1;
950 local_irq_restore(flags);
951 return ret;
952}
953
954#else /* CONFIG_IRQ_TIME_ACCOUNTING */
955
956#define sched_clock_irqtime (0)
957
958#endif
959
960void sched_set_stop_task(int cpu, struct task_struct *stop) 812void sched_set_stop_task(int cpu, struct task_struct *stop)
961{ 813{
962 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 814 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1079,6 +931,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1079 rq->skip_clock_update = 1; 931 rq->skip_clock_update = 1;
1080} 932}
1081 933
934static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
935
936void register_task_migration_notifier(struct notifier_block *n)
937{
938 atomic_notifier_chain_register(&task_migration_notifier, n);
939}
940
1082#ifdef CONFIG_SMP 941#ifdef CONFIG_SMP
1083void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 942void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1084{ 943{
@@ -1109,8 +968,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1109 trace_sched_migrate_task(p, new_cpu); 968 trace_sched_migrate_task(p, new_cpu);
1110 969
1111 if (task_cpu(p) != new_cpu) { 970 if (task_cpu(p) != new_cpu) {
971 struct task_migration_notifier tmn;
972
973 if (p->sched_class->migrate_task_rq)
974 p->sched_class->migrate_task_rq(p, new_cpu);
1112 p->se.nr_migrations++; 975 p->se.nr_migrations++;
1113 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 976 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
977
978 tmn.task = p;
979 tmn.from_cpu = task_cpu(p);
980 tmn.to_cpu = new_cpu;
981
982 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
1114 } 983 }
1115 984
1116 __set_task_cpu(p, new_cpu); 985 __set_task_cpu(p, new_cpu);
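
set_task_cpu() now fires an atomic notifier chain whenever a task actually changes CPU, and register_task_migration_notifier() is exported for out-of-file consumers. Purely as an illustration, here is what a consumer might look like; the demo_* names are invented, and it assumes struct task_migration_notifier (task, from_cpu, to_cpu) is declared in a shared header, which this hunk does not show.

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/sched.h>

static int demo_migration_cb(struct notifier_block *nb,
			     unsigned long action, void *data)
{
	struct task_migration_notifier *tmn = data;

	pr_info("pid %d migrating from cpu %d to cpu %d\n",
		task_pid_nr(tmn->task), tmn->from_cpu, tmn->to_cpu);
	return NOTIFY_OK;
}

static struct notifier_block demo_migration_nb = {
	.notifier_call = demo_migration_cb,
};

static int __init demo_migration_init(void)
{
	register_task_migration_notifier(&demo_migration_nb);
	return 0;
}
module_init(demo_migration_init);
MODULE_LICENSE("GPL");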
@@ -1518,25 +1387,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
1518 smp_send_reschedule(cpu); 1387 smp_send_reschedule(cpu);
1519} 1388}
1520 1389
1521#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1522static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1523{
1524 struct rq *rq;
1525 int ret = 0;
1526
1527 rq = __task_rq_lock(p);
1528 if (p->on_cpu) {
1529 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1530 ttwu_do_wakeup(rq, p, wake_flags);
1531 ret = 1;
1532 }
1533 __task_rq_unlock(rq);
1534
1535 return ret;
1536
1537}
1538#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1539
1540bool cpus_share_cache(int this_cpu, int that_cpu) 1390bool cpus_share_cache(int this_cpu, int that_cpu)
1541{ 1391{
1542 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1392 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
@@ -1597,21 +1447,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1597 * If the owning (remote) cpu is still in the middle of schedule() with 1447 * If the owning (remote) cpu is still in the middle of schedule() with
1598 * this task as prev, wait until its done referencing the task. 1448 * this task as prev, wait until its done referencing the task.
1599 */ 1449 */
1600 while (p->on_cpu) { 1450 while (p->on_cpu)
1601#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1602 /*
1603 * In case the architecture enables interrupts in
1604 * context_switch(), we cannot busy wait, since that
1605 * would lead to deadlocks when an interrupt hits and
1606 * tries to wake up @prev. So bail and do a complete
1607 * remote wakeup.
1608 */
1609 if (ttwu_activate_remote(p, wake_flags))
1610 goto stat;
1611#else
1612 cpu_relax(); 1451 cpu_relax();
1613#endif
1614 }
1615 /* 1452 /*
1616 * Pairs with the smp_wmb() in finish_lock_switch(). 1453 * Pairs with the smp_wmb() in finish_lock_switch().
1617 */ 1454 */
@@ -1713,6 +1550,15 @@ static void __sched_fork(struct task_struct *p)
1713 p->se.vruntime = 0; 1550 p->se.vruntime = 0;
1714 INIT_LIST_HEAD(&p->se.group_node); 1551 INIT_LIST_HEAD(&p->se.group_node);
1715 1552
1553/*
1554 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
1555 * removed when useful for applications beyond shares distribution (e.g.
1556 * load-balance).
1557 */
1558#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1559 p->se.avg.runnable_avg_period = 0;
1560 p->se.avg.runnable_avg_sum = 0;
1561#endif
1716#ifdef CONFIG_SCHEDSTATS 1562#ifdef CONFIG_SCHEDSTATS
1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1563 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1718#endif 1564#endif
@@ -1722,8 +1568,41 @@ static void __sched_fork(struct task_struct *p)
1722#ifdef CONFIG_PREEMPT_NOTIFIERS 1568#ifdef CONFIG_PREEMPT_NOTIFIERS
1723 INIT_HLIST_HEAD(&p->preempt_notifiers); 1569 INIT_HLIST_HEAD(&p->preempt_notifiers);
1724#endif 1570#endif
1571
1572#ifdef CONFIG_NUMA_BALANCING
1573 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1574 p->mm->numa_next_scan = jiffies;
1575 p->mm->numa_next_reset = jiffies;
1576 p->mm->numa_scan_seq = 0;
1577 }
1578
1579 p->node_stamp = 0ULL;
1580 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1581 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1582 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1583 p->numa_work.next = &p->numa_work;
1584#endif /* CONFIG_NUMA_BALANCING */
1725} 1585}
1726 1586
1587#ifdef CONFIG_NUMA_BALANCING
1588#ifdef CONFIG_SCHED_DEBUG
1589void set_numabalancing_state(bool enabled)
1590{
1591 if (enabled)
1592 sched_feat_set("NUMA");
1593 else
1594 sched_feat_set("NO_NUMA");
1595}
1596#else
1597__read_mostly bool numabalancing_enabled;
1598
1599void set_numabalancing_state(bool enabled)
1600{
1601 numabalancing_enabled = enabled;
1602}
1603#endif /* CONFIG_SCHED_DEBUG */
1604#endif /* CONFIG_NUMA_BALANCING */
1605
1727/* 1606/*
1728 * fork()/clone()-time setup: 1607 * fork()/clone()-time setup:
1729 */ 1608 */
@@ -1953,14 +1832,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1953 * Manfred Spraul <manfred@colorfullife.com> 1832 * Manfred Spraul <manfred@colorfullife.com>
1954 */ 1833 */
1955 prev_state = prev->state; 1834 prev_state = prev->state;
1835 vtime_task_switch(prev);
1956 finish_arch_switch(prev); 1836 finish_arch_switch(prev);
1957#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1958 local_irq_disable();
1959#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1960 perf_event_task_sched_in(prev, current); 1837 perf_event_task_sched_in(prev, current);
1961#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1962 local_irq_enable();
1963#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1964 finish_lock_switch(rq, prev); 1838 finish_lock_switch(rq, prev);
1965 finish_arch_post_lock_switch(); 1839 finish_arch_post_lock_switch();
1966 1840
@@ -2080,6 +1954,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2080 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1954 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2081#endif 1955#endif
2082 1956
1957 context_tracking_task_switch(prev, next);
2083 /* Here we just switch the register state and the stack. */ 1958 /* Here we just switch the register state and the stack. */
2084 switch_to(prev, next, prev); 1959 switch_to(prev, next, prev);
2085 1960
@@ -2809,404 +2684,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2809 return ns; 2684 return ns;
2810} 2685}
2811 2686
2812#ifdef CONFIG_CGROUP_CPUACCT
2813struct cgroup_subsys cpuacct_subsys;
2814struct cpuacct root_cpuacct;
2815#endif
2816
2817static inline void task_group_account_field(struct task_struct *p, int index,
2818 u64 tmp)
2819{
2820#ifdef CONFIG_CGROUP_CPUACCT
2821 struct kernel_cpustat *kcpustat;
2822 struct cpuacct *ca;
2823#endif
2824 /*
2825 * Since all updates are sure to touch the root cgroup, we
2826 * get ourselves ahead and touch it first. If the root cgroup
2827 * is the only cgroup, then nothing else should be necessary.
2828 *
2829 */
2830 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2831
2832#ifdef CONFIG_CGROUP_CPUACCT
2833 if (unlikely(!cpuacct_subsys.active))
2834 return;
2835
2836 rcu_read_lock();
2837 ca = task_ca(p);
2838 while (ca && (ca != &root_cpuacct)) {
2839 kcpustat = this_cpu_ptr(ca->cpustat);
2840 kcpustat->cpustat[index] += tmp;
2841 ca = parent_ca(ca);
2842 }
2843 rcu_read_unlock();
2844#endif
2845}
2846
2847
2848/*
2849 * Account user cpu time to a process.
2850 * @p: the process that the cpu time gets accounted to
2851 * @cputime: the cpu time spent in user space since the last update
2852 * @cputime_scaled: cputime scaled by cpu frequency
2853 */
2854void account_user_time(struct task_struct *p, cputime_t cputime,
2855 cputime_t cputime_scaled)
2856{
2857 int index;
2858
2859 /* Add user time to process. */
2860 p->utime += cputime;
2861 p->utimescaled += cputime_scaled;
2862 account_group_user_time(p, cputime);
2863
2864 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2865
2866 /* Add user time to cpustat. */
2867 task_group_account_field(p, index, (__force u64) cputime);
2868
2869 /* Account for user time used */
2870 acct_update_integrals(p);
2871}
2872
2873/*
2874 * Account guest cpu time to a process.
2875 * @p: the process that the cpu time gets accounted to
2876 * @cputime: the cpu time spent in virtual machine since the last update
2877 * @cputime_scaled: cputime scaled by cpu frequency
2878 */
2879static void account_guest_time(struct task_struct *p, cputime_t cputime,
2880 cputime_t cputime_scaled)
2881{
2882 u64 *cpustat = kcpustat_this_cpu->cpustat;
2883
2884 /* Add guest time to process. */
2885 p->utime += cputime;
2886 p->utimescaled += cputime_scaled;
2887 account_group_user_time(p, cputime);
2888 p->gtime += cputime;
2889
2890 /* Add guest time to cpustat. */
2891 if (TASK_NICE(p) > 0) {
2892 cpustat[CPUTIME_NICE] += (__force u64) cputime;
2893 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
2894 } else {
2895 cpustat[CPUTIME_USER] += (__force u64) cputime;
2896 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
2897 }
2898}
2899
2900/*
2901 * Account system cpu time to a process and desired cpustat field
2902 * @p: the process that the cpu time gets accounted to
2903 * @cputime: the cpu time spent in kernel space since the last update
2904 * @cputime_scaled: cputime scaled by cpu frequency
2905 * @target_cputime64: pointer to cpustat field that has to be updated
2906 */
2907static inline
2908void __account_system_time(struct task_struct *p, cputime_t cputime,
2909 cputime_t cputime_scaled, int index)
2910{
2911 /* Add system time to process. */
2912 p->stime += cputime;
2913 p->stimescaled += cputime_scaled;
2914 account_group_system_time(p, cputime);
2915
2916 /* Add system time to cpustat. */
2917 task_group_account_field(p, index, (__force u64) cputime);
2918
2919 /* Account for system time used */
2920 acct_update_integrals(p);
2921}
2922
2923/*
2924 * Account system cpu time to a process.
2925 * @p: the process that the cpu time gets accounted to
2926 * @hardirq_offset: the offset to subtract from hardirq_count()
2927 * @cputime: the cpu time spent in kernel space since the last update
2928 * @cputime_scaled: cputime scaled by cpu frequency
2929 */
2930void account_system_time(struct task_struct *p, int hardirq_offset,
2931 cputime_t cputime, cputime_t cputime_scaled)
2932{
2933 int index;
2934
2935 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
2936 account_guest_time(p, cputime, cputime_scaled);
2937 return;
2938 }
2939
2940 if (hardirq_count() - hardirq_offset)
2941 index = CPUTIME_IRQ;
2942 else if (in_serving_softirq())
2943 index = CPUTIME_SOFTIRQ;
2944 else
2945 index = CPUTIME_SYSTEM;
2946
2947 __account_system_time(p, cputime, cputime_scaled, index);
2948}
2949
2950/*
2951 * Account for involuntary wait time.
2952 * @cputime: the cpu time spent in involuntary wait
2953 */
2954void account_steal_time(cputime_t cputime)
2955{
2956 u64 *cpustat = kcpustat_this_cpu->cpustat;
2957
2958 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
2959}
2960
2961/*
2962 * Account for idle time.
2963 * @cputime: the cpu time spent in idle wait
2964 */
2965void account_idle_time(cputime_t cputime)
2966{
2967 u64 *cpustat = kcpustat_this_cpu->cpustat;
2968 struct rq *rq = this_rq();
2969
2970 if (atomic_read(&rq->nr_iowait) > 0)
2971 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
2972 else
2973 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
2974}
2975
2976static __always_inline bool steal_account_process_tick(void)
2977{
2978#ifdef CONFIG_PARAVIRT
2979 if (static_key_false(&paravirt_steal_enabled)) {
2980 u64 steal, st = 0;
2981
2982 steal = paravirt_steal_clock(smp_processor_id());
2983 steal -= this_rq()->prev_steal_time;
2984
2985 st = steal_ticks(steal);
2986 this_rq()->prev_steal_time += st * TICK_NSEC;
2987
2988 account_steal_time(st);
2989 return st;
2990 }
2991#endif
2992 return false;
2993}
2994
2995#ifndef CONFIG_VIRT_CPU_ACCOUNTING
2996
2997#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2998/*
2999 * Account a tick to a process and cpustat
3000 * @p: the process that the cpu time gets accounted to
3001 * @user_tick: is the tick from userspace
3002 * @rq: the pointer to rq
3003 *
3004 * Tick demultiplexing follows the order
3005 * - pending hardirq update
3006 * - pending softirq update
3007 * - user_time
3008 * - idle_time
3009 * - system time
3010 * - check for guest_time
3011 * - else account as system_time
3012 *
3013 * Check for hardirq is done both for system and user time as there is
3014 * no timer going off while we are on hardirq and hence we may never get an
3015 * opportunity to update it solely in system time.
3016 * p->stime and friends are only updated on system time and not on irq
3017 * softirq as those do not count in task exec_runtime any more.
3018 */
3019static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3020 struct rq *rq)
3021{
3022 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3023 u64 *cpustat = kcpustat_this_cpu->cpustat;
3024
3025 if (steal_account_process_tick())
3026 return;
3027
3028 if (irqtime_account_hi_update()) {
3029 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
3030 } else if (irqtime_account_si_update()) {
3031 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
3032 } else if (this_cpu_ksoftirqd() == p) {
3033 /*
3034 * ksoftirqd time do not get accounted in cpu_softirq_time.
3035 * So, we have to handle it separately here.
3036 * Also, p->stime needs to be updated for ksoftirqd.
3037 */
3038 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3039 CPUTIME_SOFTIRQ);
3040 } else if (user_tick) {
3041 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3042 } else if (p == rq->idle) {
3043 account_idle_time(cputime_one_jiffy);
3044 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3045 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3046 } else {
3047 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3048 CPUTIME_SYSTEM);
3049 }
3050}
3051
3052static void irqtime_account_idle_ticks(int ticks)
3053{
3054 int i;
3055 struct rq *rq = this_rq();
3056
3057 for (i = 0; i < ticks; i++)
3058 irqtime_account_process_tick(current, 0, rq);
3059}
3060#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3061static void irqtime_account_idle_ticks(int ticks) {}
3062static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3063 struct rq *rq) {}
3064#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3065
3066/*
3067 * Account a single tick of cpu time.
3068 * @p: the process that the cpu time gets accounted to
3069 * @user_tick: indicates if the tick is a user or a system tick
3070 */
3071void account_process_tick(struct task_struct *p, int user_tick)
3072{
3073 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3074 struct rq *rq = this_rq();
3075
3076 if (sched_clock_irqtime) {
3077 irqtime_account_process_tick(p, user_tick, rq);
3078 return;
3079 }
3080
3081 if (steal_account_process_tick())
3082 return;
3083
3084 if (user_tick)
3085 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3086 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3087 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3088 one_jiffy_scaled);
3089 else
3090 account_idle_time(cputime_one_jiffy);
3091}
3092
3093/*
3094 * Account multiple ticks of steal time.
3095 * @p: the process from which the cpu time has been stolen
3096 * @ticks: number of stolen ticks
3097 */
3098void account_steal_ticks(unsigned long ticks)
3099{
3100 account_steal_time(jiffies_to_cputime(ticks));
3101}
3102
3103/*
3104 * Account multiple ticks of idle time.
3105 * @ticks: number of stolen ticks
3106 */
3107void account_idle_ticks(unsigned long ticks)
3108{
3109
3110 if (sched_clock_irqtime) {
3111 irqtime_account_idle_ticks(ticks);
3112 return;
3113 }
3114
3115 account_idle_time(jiffies_to_cputime(ticks));
3116}
3117
3118#endif
3119
3120/*
3121 * Use precise platform statistics if available:
3122 */
3123#ifdef CONFIG_VIRT_CPU_ACCOUNTING
3124void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3125{
3126 *ut = p->utime;
3127 *st = p->stime;
3128}
3129
3130void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3131{
3132 struct task_cputime cputime;
3133
3134 thread_group_cputime(p, &cputime);
3135
3136 *ut = cputime.utime;
3137 *st = cputime.stime;
3138}
3139#else
3140
3141#ifndef nsecs_to_cputime
3142# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3143#endif
3144
3145static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
3146{
3147 u64 temp = (__force u64) rtime;
3148
3149 temp *= (__force u64) utime;
3150
3151 if (sizeof(cputime_t) == 4)
3152 temp = div_u64(temp, (__force u32) total);
3153 else
3154 temp = div64_u64(temp, (__force u64) total);
3155
3156 return (__force cputime_t) temp;
3157}
3158
3159void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3160{
3161 cputime_t rtime, utime = p->utime, total = utime + p->stime;
3162
3163 /*
3164 * Use CFS's precise accounting:
3165 */
3166 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3167
3168 if (total)
3169 utime = scale_utime(utime, rtime, total);
3170 else
3171 utime = rtime;
3172
3173 /*
3174 * Compare with previous values, to keep monotonicity:
3175 */
3176 p->prev_utime = max(p->prev_utime, utime);
3177 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
3178
3179 *ut = p->prev_utime;
3180 *st = p->prev_stime;
3181}
3182
3183/*
3184 * Must be called with siglock held.
3185 */
3186void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3187{
3188 struct signal_struct *sig = p->signal;
3189 struct task_cputime cputime;
3190 cputime_t rtime, utime, total;
3191
3192 thread_group_cputime(p, &cputime);
3193
3194 total = cputime.utime + cputime.stime;
3195 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3196
3197 if (total)
3198 utime = scale_utime(cputime.utime, rtime, total);
3199 else
3200 utime = rtime;
3201
3202 sig->prev_utime = max(sig->prev_utime, utime);
3203 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
3204
3205 *ut = sig->prev_utime;
3206 *st = sig->prev_stime;
3207}
3208#endif
3209
3210/* 2687/*
3211 * This function gets called by the timer code, with HZ frequency. 2688 * This function gets called by the timer code, with HZ frequency.
3212 * We call it with interrupts disabled. 2689 * We call it with interrupts disabled.
@@ -3367,6 +2844,40 @@ pick_next_task(struct rq *rq)
3367 2844
3368/* 2845/*
3369 * __schedule() is the main scheduler function. 2846 * __schedule() is the main scheduler function.
2847 *
2848 * The main means of driving the scheduler and thus entering this function are:
2849 *
2850 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
2851 *
2852 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
2853 * paths. For example, see arch/x86/entry_64.S.
2854 *
2855 * To drive preemption between tasks, the scheduler sets the flag in timer
2856 * interrupt handler scheduler_tick().
2857 *
2858 * 3. Wakeups don't really cause entry into schedule(). They add a
2859 * task to the run-queue and that's it.
2860 *
2861 * Now, if the new task added to the run-queue preempts the current
2862 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
2863 * called on the nearest possible occasion:
2864 *
2865 * - If the kernel is preemptible (CONFIG_PREEMPT=y):
2866 *
2867 * - in syscall or exception context, at the next outmost
2868 * preempt_enable(). (this might be as soon as the wake_up()'s
2869 * spin_unlock()!)
2870 *
2871 * - in IRQ context, return from interrupt-handler to
2872 * preemptible context
2873 *
2874 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
2875 * then at the next:
2876 *
2877 * - cond_resched() call
2878 * - explicit schedule() call
2879 * - return from syscall or exception to user-space
2880 * - return from interrupt-handler to user-space
3370 */ 2881 */
3371static void __sched __schedule(void) 2882static void __sched __schedule(void)
3372{ 2883{
@@ -3468,6 +2979,21 @@ asmlinkage void __sched schedule(void)
3468} 2979}
3469EXPORT_SYMBOL(schedule); 2980EXPORT_SYMBOL(schedule);
3470 2981
2982#ifdef CONFIG_CONTEXT_TRACKING
2983asmlinkage void __sched schedule_user(void)
2984{
2985 /*
2986 * If we come here after a random call to set_need_resched(),
2987 * or we have been woken up remotely but the IPI has not yet arrived,
2988 * we haven't yet exited the RCU idle mode. Do it here manually until
2989 * we find a better solution.
2990 */
2991 user_exit();
2992 schedule();
2993 user_enter();
2994}
2995#endif
2996
3471/** 2997/**
3472 * schedule_preempt_disabled - called with preemption disabled 2998 * schedule_preempt_disabled - called with preemption disabled
3473 * 2999 *
@@ -3569,6 +3095,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
3569 /* Catch callers which need to be fixed */ 3095 /* Catch callers which need to be fixed */
3570 BUG_ON(ti->preempt_count || !irqs_disabled()); 3096 BUG_ON(ti->preempt_count || !irqs_disabled());
3571 3097
3098 user_exit();
3572 do { 3099 do {
3573 add_preempt_count(PREEMPT_ACTIVE); 3100 add_preempt_count(PREEMPT_ACTIVE);
3574 local_irq_enable(); 3101 local_irq_enable();
@@ -4570,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4570 goto out_free_cpus_allowed; 4097 goto out_free_cpus_allowed;
4571 } 4098 }
4572 retval = -EPERM; 4099 retval = -EPERM;
4573 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4100 if (!check_same_owner(p)) {
4574 goto out_unlock; 4101 rcu_read_lock();
4102 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4103 rcu_read_unlock();
4104 goto out_unlock;
4105 }
4106 rcu_read_unlock();
4107 }
4575 4108
4576 retval = security_task_setscheduler(p); 4109 retval = security_task_setscheduler(p);
4577 if (retval) 4110 if (retval)
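
The CAP_SYS_NICE check above is now done against the target's credentials under rcu_read_lock(), since __task_cred() may only be dereferenced inside an RCU read-side critical section. A minimal sketch of that access pattern; demo_can_renice() is an invented name and it assumes the caller already holds a reference on p.

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static bool demo_can_renice(struct task_struct *p)
{
	bool allowed;

	rcu_read_lock();	/* __task_cred() is only valid under RCU */
	allowed = ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE);
	rcu_read_unlock();

	return allowed;
}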
@@ -4868,13 +4401,6 @@ again:
4868 */ 4401 */
4869 if (preempt && rq != p_rq) 4402 if (preempt && rq != p_rq)
4870 resched_task(p_rq->curr); 4403 resched_task(p_rq->curr);
4871 } else {
4872 /*
4873 * We might have set it in task_yield_fair(), but are
4874 * not going to schedule(), so don't want to skip
4875 * the next update.
4876 */
4877 rq->skip_clock_update = 0;
4878 } 4404 }
4879 4405
4880out: 4406out:
@@ -5022,6 +4548,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5022void sched_show_task(struct task_struct *p) 4548void sched_show_task(struct task_struct *p)
5023{ 4549{
5024 unsigned long free = 0; 4550 unsigned long free = 0;
4551 int ppid;
5025 unsigned state; 4552 unsigned state;
5026 4553
5027 state = p->state ? __ffs(p->state) + 1 : 0; 4554 state = p->state ? __ffs(p->state) + 1 : 0;
@@ -5041,8 +4568,11 @@ void sched_show_task(struct task_struct *p)
5041#ifdef CONFIG_DEBUG_STACK_USAGE 4568#ifdef CONFIG_DEBUG_STACK_USAGE
5042 free = stack_not_used(p); 4569 free = stack_not_used(p);
5043#endif 4570#endif
4571 rcu_read_lock();
4572 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4573 rcu_read_unlock();
5044 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4574 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5045 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), 4575 task_pid_nr(p), ppid,
5046 (unsigned long)task_thread_info(p)->flags); 4576 (unsigned long)task_thread_info(p)->flags);
5047 4577
5048 show_stack(p, NULL); 4578 show_stack(p, NULL);
@@ -5416,16 +4946,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
5416 *tablep = NULL; 4946 *tablep = NULL;
5417} 4947}
5418 4948
4949static int min_load_idx = 0;
4950static int max_load_idx = CPU_LOAD_IDX_MAX;
4951
5419static void 4952static void
5420set_table_entry(struct ctl_table *entry, 4953set_table_entry(struct ctl_table *entry,
5421 const char *procname, void *data, int maxlen, 4954 const char *procname, void *data, int maxlen,
5422 umode_t mode, proc_handler *proc_handler) 4955 umode_t mode, proc_handler *proc_handler,
4956 bool load_idx)
5423{ 4957{
5424 entry->procname = procname; 4958 entry->procname = procname;
5425 entry->data = data; 4959 entry->data = data;
5426 entry->maxlen = maxlen; 4960 entry->maxlen = maxlen;
5427 entry->mode = mode; 4961 entry->mode = mode;
5428 entry->proc_handler = proc_handler; 4962 entry->proc_handler = proc_handler;
4963
4964 if (load_idx) {
4965 entry->extra1 = &min_load_idx;
4966 entry->extra2 = &max_load_idx;
4967 }
5429} 4968}
5430 4969
5431static struct ctl_table * 4970static struct ctl_table *
@@ -5437,30 +4976,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5437 return NULL; 4976 return NULL;
5438 4977
5439 set_table_entry(&table[0], "min_interval", &sd->min_interval, 4978 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5440 sizeof(long), 0644, proc_doulongvec_minmax); 4979 sizeof(long), 0644, proc_doulongvec_minmax, false);
5441 set_table_entry(&table[1], "max_interval", &sd->max_interval, 4980 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5442 sizeof(long), 0644, proc_doulongvec_minmax); 4981 sizeof(long), 0644, proc_doulongvec_minmax, false);
5443 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 4982 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5444 sizeof(int), 0644, proc_dointvec_minmax); 4983 sizeof(int), 0644, proc_dointvec_minmax, true);
5445 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 4984 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5446 sizeof(int), 0644, proc_dointvec_minmax); 4985 sizeof(int), 0644, proc_dointvec_minmax, true);
5447 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 4986 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5448 sizeof(int), 0644, proc_dointvec_minmax); 4987 sizeof(int), 0644, proc_dointvec_minmax, true);
5449 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 4988 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5450 sizeof(int), 0644, proc_dointvec_minmax); 4989 sizeof(int), 0644, proc_dointvec_minmax, true);
5451 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 4990 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5452 sizeof(int), 0644, proc_dointvec_minmax); 4991 sizeof(int), 0644, proc_dointvec_minmax, true);
5453 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 4992 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5454 sizeof(int), 0644, proc_dointvec_minmax); 4993 sizeof(int), 0644, proc_dointvec_minmax, false);
5455 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 4994 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5456 sizeof(int), 0644, proc_dointvec_minmax); 4995 sizeof(int), 0644, proc_dointvec_minmax, false);
5457 set_table_entry(&table[9], "cache_nice_tries", 4996 set_table_entry(&table[9], "cache_nice_tries",
5458 &sd->cache_nice_tries, 4997 &sd->cache_nice_tries,
5459 sizeof(int), 0644, proc_dointvec_minmax); 4998 sizeof(int), 0644, proc_dointvec_minmax, false);
5460 set_table_entry(&table[10], "flags", &sd->flags, 4999 set_table_entry(&table[10], "flags", &sd->flags,
5461 sizeof(int), 0644, proc_dointvec_minmax); 5000 sizeof(int), 0644, proc_dointvec_minmax, false);
5462 set_table_entry(&table[11], "name", sd->name, 5001 set_table_entry(&table[11], "name", sd->name,
5463 CORENAME_MAX_SIZE, 0444, proc_dostring); 5002 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
5464 /* &table[12] is terminator */ 5003 /* &table[12] is terminator */
5465 5004
5466 return table; 5005 return table;
@@ -5604,7 +5143,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5604 migrate_tasks(cpu); 5143 migrate_tasks(cpu);
5605 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5144 BUG_ON(rq->nr_running != 1); /* the migration thread */
5606 raw_spin_unlock_irqrestore(&rq->lock, flags); 5145 raw_spin_unlock_irqrestore(&rq->lock, flags);
5146 break;
5607 5147
5148 case CPU_DEAD:
5608 calc_load_migrate(rq); 5149 calc_load_migrate(rq);
5609 break; 5150 break;
5610#endif 5151#endif
@@ -6537,7 +6078,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6537 | 0*SD_BALANCE_FORK 6078 | 0*SD_BALANCE_FORK
6538 | 0*SD_BALANCE_WAKE 6079 | 0*SD_BALANCE_WAKE
6539 | 0*SD_WAKE_AFFINE 6080 | 0*SD_WAKE_AFFINE
6540 | 0*SD_PREFER_LOCAL
6541 | 0*SD_SHARE_CPUPOWER 6081 | 0*SD_SHARE_CPUPOWER
6542 | 0*SD_SHARE_PKG_RESOURCES 6082 | 0*SD_SHARE_PKG_RESOURCES
6543 | 1*SD_SERIALIZE 6083 | 1*SD_SERIALIZE
@@ -6660,6 +6200,17 @@ static void sched_init_numa(void)
6660 * numbers. 6200 * numbers.
6661 */ 6201 */
6662 6202
6203 /*
6204 * Here, we should temporarily reset sched_domains_numa_levels to 0.
6205 * If it fails to allocate memory for array sched_domains_numa_masks[][],
 6206	 * the array will contain less than 'level' members. This could be
 6207	 * dangerous when we use it to iterate over the array sched_domains_numa_masks[][]
6208 * in other functions.
6209 *
6210 * We reset it to 'level' at the end of this function.
6211 */
6212 sched_domains_numa_levels = 0;
6213
6663 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6214 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6664 if (!sched_domains_numa_masks) 6215 if (!sched_domains_numa_masks)
6665 return; 6216 return;
@@ -6714,11 +6265,68 @@ static void sched_init_numa(void)
6714 } 6265 }
6715 6266
6716 sched_domain_topology = tl; 6267 sched_domain_topology = tl;
6268
6269 sched_domains_numa_levels = level;
6270}
6271
6272static void sched_domains_numa_masks_set(int cpu)
6273{
6274 int i, j;
6275 int node = cpu_to_node(cpu);
6276
6277 for (i = 0; i < sched_domains_numa_levels; i++) {
6278 for (j = 0; j < nr_node_ids; j++) {
6279 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6280 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6281 }
6282 }
6283}
6284
6285static void sched_domains_numa_masks_clear(int cpu)
6286{
6287 int i, j;
6288 for (i = 0; i < sched_domains_numa_levels; i++) {
6289 for (j = 0; j < nr_node_ids; j++)
6290 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6291 }
6292}
6293
6294/*
6295 * Update sched_domains_numa_masks[level][node] array when new cpus
6296 * are onlined.
6297 */
6298static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6299 unsigned long action,
6300 void *hcpu)
6301{
6302 int cpu = (long)hcpu;
6303
6304 switch (action & ~CPU_TASKS_FROZEN) {
6305 case CPU_ONLINE:
6306 sched_domains_numa_masks_set(cpu);
6307 break;
6308
6309 case CPU_DEAD:
6310 sched_domains_numa_masks_clear(cpu);
6311 break;
6312
6313 default:
6314 return NOTIFY_DONE;
6315 }
6316
6317 return NOTIFY_OK;
6717} 6318}
6718#else 6319#else
6719static inline void sched_init_numa(void) 6320static inline void sched_init_numa(void)
6720{ 6321{
6721} 6322}
6323
6324static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6325 unsigned long action,
6326 void *hcpu)
6327{
6328 return 0;
6329}
6722#endif /* CONFIG_NUMA */ 6330#endif /* CONFIG_NUMA */
6723 6331
6724static int __sdt_alloc(const struct cpumask *cpu_map) 6332static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -7167,6 +6775,7 @@ void __init sched_init_smp(void)
7167 mutex_unlock(&sched_domains_mutex); 6775 mutex_unlock(&sched_domains_mutex);
7168 put_online_cpus(); 6776 put_online_cpus();
7169 6777
6778 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
7170 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6779 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7171 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6780 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7172 6781
@@ -7937,7 +7546,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7937 struct task_group, css); 7546 struct task_group, css);
7938} 7547}
7939 7548
7940static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) 7549static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7941{ 7550{
7942 struct task_group *tg, *parent; 7551 struct task_group *tg, *parent;
7943 7552
@@ -7954,7 +7563,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7954 return &tg->css; 7563 return &tg->css;
7955} 7564}
7956 7565
7957static void cpu_cgroup_destroy(struct cgroup *cgrp) 7566static void cpu_cgroup_css_free(struct cgroup *cgrp)
7958{ 7567{
7959 struct task_group *tg = cgroup_tg(cgrp); 7568 struct task_group *tg = cgroup_tg(cgrp);
7960 7569
@@ -8314,8 +7923,8 @@ static struct cftype cpu_files[] = {
8314 7923
8315struct cgroup_subsys cpu_cgroup_subsys = { 7924struct cgroup_subsys cpu_cgroup_subsys = {
8316 .name = "cpu", 7925 .name = "cpu",
8317 .create = cpu_cgroup_create, 7926 .css_alloc = cpu_cgroup_css_alloc,
8318 .destroy = cpu_cgroup_destroy, 7927 .css_free = cpu_cgroup_css_free,
8319 .can_attach = cpu_cgroup_can_attach, 7928 .can_attach = cpu_cgroup_can_attach,
8320 .attach = cpu_cgroup_attach, 7929 .attach = cpu_cgroup_attach,
8321 .exit = cpu_cgroup_exit, 7930 .exit = cpu_cgroup_exit,
@@ -8335,8 +7944,10 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8335 * (balbir@in.ibm.com). 7944 * (balbir@in.ibm.com).
8336 */ 7945 */
8337 7946
7947struct cpuacct root_cpuacct;
7948
8338/* create a new cpu accounting group */ 7949/* create a new cpu accounting group */
8339static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 7950static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
8340{ 7951{
8341 struct cpuacct *ca; 7952 struct cpuacct *ca;
8342 7953
@@ -8366,7 +7977,7 @@ out:
8366} 7977}
8367 7978
8368/* destroy an existing cpu accounting group */ 7979/* destroy an existing cpu accounting group */
8369static void cpuacct_destroy(struct cgroup *cgrp) 7980static void cpuacct_css_free(struct cgroup *cgrp)
8370{ 7981{
8371 struct cpuacct *ca = cgroup_ca(cgrp); 7982 struct cpuacct *ca = cgroup_ca(cgrp);
8372 7983
@@ -8537,9 +8148,15 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8537 8148
8538struct cgroup_subsys cpuacct_subsys = { 8149struct cgroup_subsys cpuacct_subsys = {
8539 .name = "cpuacct", 8150 .name = "cpuacct",
8540 .create = cpuacct_create, 8151 .css_alloc = cpuacct_css_alloc,
8541 .destroy = cpuacct_destroy, 8152 .css_free = cpuacct_css_free,
8542 .subsys_id = cpuacct_subsys_id, 8153 .subsys_id = cpuacct_subsys_id,
8543 .base_cftypes = files, 8154 .base_cftypes = files,
8544}; 8155};
8545#endif /* CONFIG_CGROUP_CPUACCT */ 8156#endif /* CONFIG_CGROUP_CPUACCT */
8157
8158void dump_cpu_task(int cpu)
8159{
8160 pr_info("Task dump for CPU %d:\n", cpu);
8161 sched_show_task(cpu_curr(cpu));
8162}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
new file mode 100644
index 000000000000..293b202fcf79
--- /dev/null
+++ b/kernel/sched/cputime.c
@@ -0,0 +1,589 @@
1#include <linux/export.h>
2#include <linux/sched.h>
3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include "sched.h"
7
8
9#ifdef CONFIG_IRQ_TIME_ACCOUNTING
10
11/*
12 * There are no locks covering percpu hardirq/softirq time.
 13 * They are only modified in vtime_account, on the corresponding CPU
 14 * with interrupts disabled. So, writes are safe.
 15 * They are read and saved off onto struct rq in update_rq_clock().
 16 * This may result in another CPU reading this CPU's irq time and can
 17 * race with irq/vtime_account on this CPU. We would either get the old
 18 * or the new value, with a side effect of accounting a slice of irq time
 19 * to the wrong task when an irq is in progress while we read rq->clock.
 20 * That is a worthy compromise in place of having locks on each irq in account_system_time.
21 */
22DEFINE_PER_CPU(u64, cpu_hardirq_time);
23DEFINE_PER_CPU(u64, cpu_softirq_time);
24
25static DEFINE_PER_CPU(u64, irq_start_time);
26static int sched_clock_irqtime;
27
28void enable_sched_clock_irqtime(void)
29{
30 sched_clock_irqtime = 1;
31}
32
33void disable_sched_clock_irqtime(void)
34{
35 sched_clock_irqtime = 0;
36}
37
38#ifndef CONFIG_64BIT
39DEFINE_PER_CPU(seqcount_t, irq_time_seq);
40#endif /* CONFIG_64BIT */
41
42/*
43 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit.
45 */
46void irqtime_account_irq(struct task_struct *curr)
47{
48 unsigned long flags;
49 s64 delta;
50 int cpu;
51
52 if (!sched_clock_irqtime)
53 return;
54
55 local_irq_save(flags);
56
57 cpu = smp_processor_id();
58 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
59 __this_cpu_add(irq_start_time, delta);
60
61 irq_time_write_begin();
62 /*
63 * We do not account for softirq time from ksoftirqd here.
 64 * We want to continue accounting softirq time to the ksoftirqd thread
 65 * in that case, so as not to confuse the scheduler with a special task
 66 * that does not consume any time, but still wants to run.
67 */
68 if (hardirq_count())
69 __this_cpu_add(cpu_hardirq_time, delta);
70 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
71 __this_cpu_add(cpu_softirq_time, delta);
72
73 irq_time_write_end();
74 local_irq_restore(flags);
75}
76EXPORT_SYMBOL_GPL(irqtime_account_irq);
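irq_time_write_begin()/irq_time_write_end() are defined elsewhere in this series (kernel/sched/sched.h); on 32-bit they are expected to wrap the per-cpu irq_time_seq declared above so that a remote reader retries instead of observing a torn 64-bit counter. A hedged, generic sketch of that writer/reader pairing, with illustrative names only:

	#include <linux/types.h>
	#include <linux/percpu.h>
	#include <linux/seqlock.h>

	static DEFINE_PER_CPU(seqcount_t, example_seq);
	static DEFINE_PER_CPU(u64, example_time);

	/* Writer side: runs on the local CPU with interrupts disabled. */
	static void example_add(u64 delta)
	{
		write_seqcount_begin(this_cpu_ptr(&example_seq));
		__this_cpu_add(example_time, delta);
		write_seqcount_end(this_cpu_ptr(&example_seq));
	}

	/* Reader side: may run on another CPU; loops if it raced with a writer. */
	static u64 example_read(int cpu)
	{
		unsigned int seq;
		u64 val;

		do {
			seq = read_seqcount_begin(&per_cpu(example_seq, cpu));
			val = per_cpu(example_time, cpu);
		} while (read_seqcount_retry(&per_cpu(example_seq, cpu), seq));

		return val;
	}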
77
78static int irqtime_account_hi_update(void)
79{
80 u64 *cpustat = kcpustat_this_cpu->cpustat;
81 unsigned long flags;
82 u64 latest_ns;
83 int ret = 0;
84
85 local_irq_save(flags);
86 latest_ns = this_cpu_read(cpu_hardirq_time);
87 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
88 ret = 1;
89 local_irq_restore(flags);
90 return ret;
91}
92
93static int irqtime_account_si_update(void)
94{
95 u64 *cpustat = kcpustat_this_cpu->cpustat;
96 unsigned long flags;
97 u64 latest_ns;
98 int ret = 0;
99
100 local_irq_save(flags);
101 latest_ns = this_cpu_read(cpu_softirq_time);
102 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
103 ret = 1;
104 local_irq_restore(flags);
105 return ret;
106}
107
108#else /* CONFIG_IRQ_TIME_ACCOUNTING */
109
110#define sched_clock_irqtime (0)
111
112#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
113
114static inline void task_group_account_field(struct task_struct *p, int index,
115 u64 tmp)
116{
117#ifdef CONFIG_CGROUP_CPUACCT
118 struct kernel_cpustat *kcpustat;
119 struct cpuacct *ca;
120#endif
121 /*
122 * Since all updates are sure to touch the root cgroup, we
123 * get ourselves ahead and touch it first. If the root cgroup
124 * is the only cgroup, then nothing else should be necessary.
125 *
126 */
127 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
128
129#ifdef CONFIG_CGROUP_CPUACCT
130 if (unlikely(!cpuacct_subsys.active))
131 return;
132
133 rcu_read_lock();
134 ca = task_ca(p);
135 while (ca && (ca != &root_cpuacct)) {
136 kcpustat = this_cpu_ptr(ca->cpustat);
137 kcpustat->cpustat[index] += tmp;
138 ca = parent_ca(ca);
139 }
140 rcu_read_unlock();
141#endif
142}
143
144/*
145 * Account user cpu time to a process.
146 * @p: the process that the cpu time gets accounted to
147 * @cputime: the cpu time spent in user space since the last update
148 * @cputime_scaled: cputime scaled by cpu frequency
149 */
150void account_user_time(struct task_struct *p, cputime_t cputime,
151 cputime_t cputime_scaled)
152{
153 int index;
154
155 /* Add user time to process. */
156 p->utime += cputime;
157 p->utimescaled += cputime_scaled;
158 account_group_user_time(p, cputime);
159
160 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
161
162 /* Add user time to cpustat. */
163 task_group_account_field(p, index, (__force u64) cputime);
164
165 /* Account for user time used */
166 acct_update_integrals(p);
167}
168
169/*
170 * Account guest cpu time to a process.
171 * @p: the process that the cpu time gets accounted to
172 * @cputime: the cpu time spent in virtual machine since the last update
173 * @cputime_scaled: cputime scaled by cpu frequency
174 */
175static void account_guest_time(struct task_struct *p, cputime_t cputime,
176 cputime_t cputime_scaled)
177{
178 u64 *cpustat = kcpustat_this_cpu->cpustat;
179
180 /* Add guest time to process. */
181 p->utime += cputime;
182 p->utimescaled += cputime_scaled;
183 account_group_user_time(p, cputime);
184 p->gtime += cputime;
185
186 /* Add guest time to cpustat. */
187 if (TASK_NICE(p) > 0) {
188 cpustat[CPUTIME_NICE] += (__force u64) cputime;
189 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
190 } else {
191 cpustat[CPUTIME_USER] += (__force u64) cputime;
192 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
193 }
194}
195
196/*
197 * Account system cpu time to a process and desired cpustat field
198 * @p: the process that the cpu time gets accounted to
199 * @cputime: the cpu time spent in kernel space since the last update
200 * @cputime_scaled: cputime scaled by cpu frequency
 201 * @index: index of the cpustat field that has to be updated
202 */
203static inline
204void __account_system_time(struct task_struct *p, cputime_t cputime,
205 cputime_t cputime_scaled, int index)
206{
207 /* Add system time to process. */
208 p->stime += cputime;
209 p->stimescaled += cputime_scaled;
210 account_group_system_time(p, cputime);
211
212 /* Add system time to cpustat. */
213 task_group_account_field(p, index, (__force u64) cputime);
214
215 /* Account for system time used */
216 acct_update_integrals(p);
217}
218
219/*
220 * Account system cpu time to a process.
221 * @p: the process that the cpu time gets accounted to
222 * @hardirq_offset: the offset to subtract from hardirq_count()
223 * @cputime: the cpu time spent in kernel space since the last update
224 * @cputime_scaled: cputime scaled by cpu frequency
225 */
226void account_system_time(struct task_struct *p, int hardirq_offset,
227 cputime_t cputime, cputime_t cputime_scaled)
228{
229 int index;
230
231 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
232 account_guest_time(p, cputime, cputime_scaled);
233 return;
234 }
235
236 if (hardirq_count() - hardirq_offset)
237 index = CPUTIME_IRQ;
238 else if (in_serving_softirq())
239 index = CPUTIME_SOFTIRQ;
240 else
241 index = CPUTIME_SYSTEM;
242
243 __account_system_time(p, cputime, cputime_scaled, index);
244}
245
246/*
247 * Account for involuntary wait time.
248 * @cputime: the cpu time spent in involuntary wait
249 */
250void account_steal_time(cputime_t cputime)
251{
252 u64 *cpustat = kcpustat_this_cpu->cpustat;
253
254 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
255}
256
257/*
258 * Account for idle time.
259 * @cputime: the cpu time spent in idle wait
260 */
261void account_idle_time(cputime_t cputime)
262{
263 u64 *cpustat = kcpustat_this_cpu->cpustat;
264 struct rq *rq = this_rq();
265
266 if (atomic_read(&rq->nr_iowait) > 0)
267 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
268 else
269 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
270}
271
272static __always_inline bool steal_account_process_tick(void)
273{
274#ifdef CONFIG_PARAVIRT
275 if (static_key_false(&paravirt_steal_enabled)) {
276 u64 steal, st = 0;
277
278 steal = paravirt_steal_clock(smp_processor_id());
279 steal -= this_rq()->prev_steal_time;
280
281 st = steal_ticks(steal);
282 this_rq()->prev_steal_time += st * TICK_NSEC;
283
284 account_steal_time(st);
285 return st;
286 }
287#endif
288 return false;
289}
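One hedged note on the carry above: steal_ticks() (a helper defined elsewhere) accounts only whole ticks, and prev_steal_time advances by exactly st * TICK_NSEC, so any fractional remainder stays in the delta and is picked up on a later tick rather than being lost. A sketch of the same arithmetic, assuming HZ == 1000 (TICK_NSEC of 1,000,000 ns):

	#include <linux/types.h>

	/* Illustrative only: whole-tick steal accounting with fractional carry. */
	static unsigned long example_account_steal(u64 steal_clock_ns, u64 *prev_ns)
	{
		const u64 tick_ns = 1000000ULL;		/* assumes HZ == 1000 */
		u64 delta = steal_clock_ns - *prev_ns;	/* e.g. 3,500,000 ns stolen */
		u64 ticks = delta / tick_ns;		/* -> 3 whole ticks */

		*prev_ns += ticks * tick_ns;		/* 500,000 ns carried forward */
		return (unsigned long)ticks;
	}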
290
291/*
292 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
293 * tasks (sum on group iteration) belonging to @tsk's group.
294 */
295void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
296{
297 struct signal_struct *sig = tsk->signal;
298 struct task_struct *t;
299
300 times->utime = sig->utime;
301 times->stime = sig->stime;
302 times->sum_exec_runtime = sig->sum_sched_runtime;
303
304 rcu_read_lock();
305 /* make sure we can trust tsk->thread_group list */
306 if (!likely(pid_alive(tsk)))
307 goto out;
308
309 t = tsk;
310 do {
311 times->utime += t->utime;
312 times->stime += t->stime;
313 times->sum_exec_runtime += task_sched_runtime(t);
314 } while_each_thread(tsk, t);
315out:
316 rcu_read_unlock();
317}
318
319#ifndef CONFIG_VIRT_CPU_ACCOUNTING
320
321#ifdef CONFIG_IRQ_TIME_ACCOUNTING
322/*
323 * Account a tick to a process and cpustat
324 * @p: the process that the cpu time gets accounted to
325 * @user_tick: is the tick from userspace
326 * @rq: the pointer to rq
327 *
328 * Tick demultiplexing follows the order
329 * - pending hardirq update
330 * - pending softirq update
331 * - user_time
332 * - idle_time
333 * - system time
334 * - check for guest_time
335 * - else account as system_time
336 *
 337 * The check for hardirq is done both for system and user time, as there
 338 * is no timer going off while we are on hardirq and hence we may never
 339 * get an opportunity to update it solely in system time.
 340 * p->stime and friends are only updated on system time and not on irq
 341 * or softirq time, as those do not count in task exec_runtime any more.
342 */
343static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
344 struct rq *rq)
345{
346 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
347 u64 *cpustat = kcpustat_this_cpu->cpustat;
348
349 if (steal_account_process_tick())
350 return;
351
352 if (irqtime_account_hi_update()) {
353 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
354 } else if (irqtime_account_si_update()) {
355 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
356 } else if (this_cpu_ksoftirqd() == p) {
357 /*
358 * ksoftirqd time do not get accounted in cpu_softirq_time.
359 * So, we have to handle it separately here.
360 * Also, p->stime needs to be updated for ksoftirqd.
361 */
362 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
363 CPUTIME_SOFTIRQ);
364 } else if (user_tick) {
365 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
366 } else if (p == rq->idle) {
367 account_idle_time(cputime_one_jiffy);
368 } else if (p->flags & PF_VCPU) { /* System time or guest time */
369 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
370 } else {
371 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
372 CPUTIME_SYSTEM);
373 }
374}
375
376static void irqtime_account_idle_ticks(int ticks)
377{
378 int i;
379 struct rq *rq = this_rq();
380
381 for (i = 0; i < ticks; i++)
382 irqtime_account_process_tick(current, 0, rq);
383}
384#else /* CONFIG_IRQ_TIME_ACCOUNTING */
385static void irqtime_account_idle_ticks(int ticks) {}
386static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
387 struct rq *rq) {}
388#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
389
390/*
391 * Account a single tick of cpu time.
392 * @p: the process that the cpu time gets accounted to
393 * @user_tick: indicates if the tick is a user or a system tick
394 */
395void account_process_tick(struct task_struct *p, int user_tick)
396{
397 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
398 struct rq *rq = this_rq();
399
400 if (sched_clock_irqtime) {
401 irqtime_account_process_tick(p, user_tick, rq);
402 return;
403 }
404
405 if (steal_account_process_tick())
406 return;
407
408 if (user_tick)
409 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
410 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
411 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
412 one_jiffy_scaled);
413 else
414 account_idle_time(cputime_one_jiffy);
415}
416
417/*
418 * Account multiple ticks of steal time.
 419 * @ticks: number of ticks of cpu time that were stolen from this cpu
 420 * (e.g. while the hypervisor ran other guests)
421 */
422void account_steal_ticks(unsigned long ticks)
423{
424 account_steal_time(jiffies_to_cputime(ticks));
425}
426
427/*
428 * Account multiple ticks of idle time.
 429 * @ticks: number of idle ticks
430 */
431void account_idle_ticks(unsigned long ticks)
432{
433
434 if (sched_clock_irqtime) {
435 irqtime_account_idle_ticks(ticks);
436 return;
437 }
438
439 account_idle_time(jiffies_to_cputime(ticks));
440}
441
442#endif
443
444/*
445 * Use precise platform statistics if available:
446 */
447#ifdef CONFIG_VIRT_CPU_ACCOUNTING
448void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
449{
450 *ut = p->utime;
451 *st = p->stime;
452}
453
454void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
455{
456 struct task_cputime cputime;
457
458 thread_group_cputime(p, &cputime);
459
460 *ut = cputime.utime;
461 *st = cputime.stime;
462}
463
464void vtime_account_system_irqsafe(struct task_struct *tsk)
465{
466 unsigned long flags;
467
468 local_irq_save(flags);
469 vtime_account_system(tsk);
470 local_irq_restore(flags);
471}
472EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
473
474#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
475void vtime_task_switch(struct task_struct *prev)
476{
477 if (is_idle_task(prev))
478 vtime_account_idle(prev);
479 else
480 vtime_account_system(prev);
481
482 vtime_account_user(prev);
483 arch_vtime_task_switch(prev);
484}
485#endif
486
487/*
488 * Archs that account the whole time spent in the idle task
489 * (outside irq) as idle time can rely on this and just implement
490 * vtime_account_system() and vtime_account_idle(). Archs that
491 * have other meaning of the idle time (s390 only includes the
492 * time spent by the CPU when it's in low power mode) must override
493 * vtime_account().
494 */
495#ifndef __ARCH_HAS_VTIME_ACCOUNT
496void vtime_account(struct task_struct *tsk)
497{
498 if (in_interrupt() || !is_idle_task(tsk))
499 vtime_account_system(tsk);
500 else
501 vtime_account_idle(tsk);
502}
503EXPORT_SYMBOL_GPL(vtime_account);
504#endif /* __ARCH_HAS_VTIME_ACCOUNT */
505
506#else
507
508#ifndef nsecs_to_cputime
509# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
510#endif
511
512static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
513{
514 u64 temp = (__force u64) rtime;
515
516 temp *= (__force u64) utime;
517
518 if (sizeof(cputime_t) == 4)
519 temp = div_u64(temp, (__force u32) total);
520 else
521 temp = div64_u64(temp, (__force u64) total);
522
523 return (__force cputime_t) temp;
524}
525
526/*
 527 * Adjust the imprecise tick based cputime against the scheduler's
 528 * precise runtime accounting.
529 */
530static void cputime_adjust(struct task_cputime *curr,
531 struct cputime *prev,
532 cputime_t *ut, cputime_t *st)
533{
534 cputime_t rtime, utime, total;
535
536 utime = curr->utime;
537 total = utime + curr->stime;
538
539 /*
 540 * Tick based cputime accounting depends on whether a task's random
 541 * scheduling timeslices happen to be interrupted by the timer or not.
 542 * Depending on these circumstances, the number of these interrupts may
 543 * over- or under-estimate the real user and system cputime, matching
 544 * them only with variable precision.
545 *
546 * Fix this by scaling these tick based values against the total
547 * runtime accounted by the CFS scheduler.
548 */
549 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
550
551 if (total)
552 utime = scale_utime(utime, rtime, total);
553 else
554 utime = rtime;
555
556 /*
557 * If the tick based count grows faster than the scheduler one,
558 * the result of the scaling may go backward.
559 * Let's enforce monotonicity.
560 */
561 prev->utime = max(prev->utime, utime);
562 prev->stime = max(prev->stime, rtime - prev->utime);
563
564 *ut = prev->utime;
565 *st = prev->stime;
566}
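A hedged worked example of the adjustment above, with made-up tick counts: if the tick counters read utime = 300 and stime = 100 (total = 400) while the scheduler's sum_exec_runtime converts to rtime = 200, scale_utime() gives 200 * 300 / 400 = 150 for utime and stime becomes 200 - 150 = 50, so the 3:1 user/system split is preserved while the total follows the more precise runtime; the max() against prev then keeps both values monotonic across calls. The same arithmetic with bare u64s, ignoring the 32/64-bit split the real code performs:

	#include <linux/math64.h>

	/* Illustrative only: scale_utime() without the 32/64-bit special casing. */
	static u64 example_scale(u64 utime, u64 rtime, u64 total)
	{
		return total ? div64_u64(rtime * utime, total) : rtime;
	}

	/* example_scale(300, 200, 400) == 150; system time is then 200 - 150 = 50. */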
567
568void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
569{
570 struct task_cputime cputime = {
571 .utime = p->utime,
572 .stime = p->stime,
573 .sum_exec_runtime = p->se.sum_exec_runtime,
574 };
575
576 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
577}
578
579/*
580 * Must be called with siglock held.
581 */
582void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
583{
584 struct task_cputime cputime;
585
586 thread_group_cputime(p, &cputime);
587 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
588}
589#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea9..2cd3c1b4e582 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) 61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
62{ 62{
63 struct sched_entity *se = tg->se[cpu]; 63 struct sched_entity *se = tg->se[cpu];
64 if (!se)
65 return;
66 64
67#define P(F) \ 65#define P(F) \
68 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 66 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
69#define PN(F) \ 67#define PN(F) \
70 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 68 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
71 69
70 if (!se) {
71 struct sched_avg *avg = &cpu_rq(cpu)->avg;
72 P(avg->runnable_avg_sum);
73 P(avg->runnable_avg_period);
74 return;
75 }
76
77
72 PN(se->exec_start); 78 PN(se->exec_start);
73 PN(se->vruntime); 79 PN(se->vruntime);
74 PN(se->sum_exec_runtime); 80 PN(se->sum_exec_runtime);
@@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
85 P(se->statistics.wait_count); 91 P(se->statistics.wait_count);
86#endif 92#endif
87 P(se->load.weight); 93 P(se->load.weight);
94#ifdef CONFIG_SMP
95 P(se->avg.runnable_avg_sum);
96 P(se->avg.runnable_avg_period);
97 P(se->avg.load_avg_contrib);
98 P(se->avg.decay_count);
99#endif
88#undef PN 100#undef PN
89#undef P 101#undef P
90} 102}
@@ -206,14 +218,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 218 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 219#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 220#ifdef CONFIG_SMP
209 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", 221 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
210 SPLIT_NS(cfs_rq->load_avg)); 222 cfs_rq->runnable_load_avg);
211 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", 223 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
212 SPLIT_NS(cfs_rq->load_period)); 224 cfs_rq->blocked_load_avg);
213 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", 225 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
214 cfs_rq->load_contribution); 226 atomic64_read(&cfs_rq->tg->load_avg));
215 SEQ_printf(m, " .%-30s: %d\n", "load_tg", 227 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
216 atomic_read(&cfs_rq->tg->load_weight)); 228 cfs_rq->tg_load_contrib);
229 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
230 cfs_rq->tg_runnable_contrib);
231 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
232 atomic_read(&cfs_rq->tg->runnable_avg));
217#endif 233#endif
218 234
219 print_cfs_group_stats(m, cpu, cfs_rq->tg); 235 print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 96e2b18b6283..5eea8707234a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/profile.h> 27#include <linux/profile.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/mempolicy.h>
30#include <linux/migrate.h>
31#include <linux/task_work.h>
29 32
30#include <trace/events/sched.h> 33#include <trace/events/sched.h>
31 34
@@ -259,6 +262,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
259 return grp->my_q; 262 return grp->my_q;
260} 263}
261 264
265static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
266 int force_update);
267
262static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 268static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
263{ 269{
264 if (!cfs_rq->on_list) { 270 if (!cfs_rq->on_list) {
@@ -278,6 +284,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
278 } 284 }
279 285
280 cfs_rq->on_list = 1; 286 cfs_rq->on_list = 1;
287 /* We should have no load, but we need to update last_decay. */
288 update_cfs_rq_blocked_load(cfs_rq, 0);
281 } 289 }
282} 290}
283 291
@@ -597,7 +605,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se)
597/* 605/*
598 * The idea is to set a period in which each task runs once. 606 * The idea is to set a period in which each task runs once.
599 * 607 *
600 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch 608 * When there are too many tasks (sched_nr_latency) we have to stretch
601 * this period because otherwise the slices get too small. 609 * this period because otherwise the slices get too small.
602 * 610 *
603 * p = (nr <= nl) ? l : l*nr/nl 611 * p = (nr <= nl) ? l : l*nr/nl
@@ -653,9 +661,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
653 return calc_delta_fair(sched_slice(cfs_rq, se), se); 661 return calc_delta_fair(sched_slice(cfs_rq, se), se);
654} 662}
655 663
656static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
657static void update_cfs_shares(struct cfs_rq *cfs_rq);
658
659/* 664/*
660 * Update the current task's runtime statistics. Skip current tasks that 665 * Update the current task's runtime statistics. Skip current tasks that
661 * are not in our scheduling class. 666 * are not in our scheduling class.
@@ -675,10 +680,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
675 680
676 curr->vruntime += delta_exec_weighted; 681 curr->vruntime += delta_exec_weighted;
677 update_min_vruntime(cfs_rq); 682 update_min_vruntime(cfs_rq);
678
679#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
680 cfs_rq->load_unacc_exec_time += delta_exec;
681#endif
682} 683}
683 684
684static void update_curr(struct cfs_rq *cfs_rq) 685static void update_curr(struct cfs_rq *cfs_rq)
@@ -776,6 +777,230 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
776 * Scheduling class queueing methods: 777 * Scheduling class queueing methods:
777 */ 778 */
778 779
780#ifdef CONFIG_NUMA_BALANCING
781/*
782 * numa task sample period in ms
783 */
784unsigned int sysctl_numa_balancing_scan_period_min = 100;
785unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
786unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
787
788/* Portion of address space to scan in MB */
789unsigned int sysctl_numa_balancing_scan_size = 256;
790
791/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792unsigned int sysctl_numa_balancing_scan_delay = 1000;
793
794static void task_numa_placement(struct task_struct *p)
795{
796 int seq;
797
798 if (!p->mm) /* for example, ksmd faulting in a user's mm */
799 return;
800 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
801 if (p->numa_scan_seq == seq)
802 return;
803 p->numa_scan_seq = seq;
804
805 /* FIXME: Scheduling placement policy hints go here */
806}
807
808/*
809 * Got a PROT_NONE fault for a page on @node.
810 */
811void task_numa_fault(int node, int pages, bool migrated)
812{
813 struct task_struct *p = current;
814
815 if (!sched_feat_numa(NUMA))
816 return;
817
818 /* FIXME: Allocate task-specific structure for placement policy here */
819
820 /*
821 * If pages are properly placed (did not migrate) then scan slower.
822 * This is reset periodically in case of phase changes
823 */
824 if (!migrated)
825 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
826 p->numa_scan_period + jiffies_to_msecs(10));
827
828 task_numa_placement(p);
829}
830
831static void reset_ptenuma_scan(struct task_struct *p)
832{
833 ACCESS_ONCE(p->mm->numa_scan_seq)++;
834 p->mm->numa_scan_offset = 0;
835}
836
837/*
838 * The expensive part of numa migration is done from task_work context.
839 * Triggered from task_tick_numa().
840 */
841void task_numa_work(struct callback_head *work)
842{
843 unsigned long migrate, next_scan, now = jiffies;
844 struct task_struct *p = current;
845 struct mm_struct *mm = p->mm;
846 struct vm_area_struct *vma;
847 unsigned long start, end;
848 long pages;
849
850 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
851
852 work->next = work; /* protect against double add */
853 /*
854 * Who cares about NUMA placement when they're dying.
855 *
856 * NOTE: make sure not to dereference p->mm before this check,
857 * exit_task_work() happens _after_ exit_mm() so we could be called
858 * without p->mm even though we still had it when we enqueued this
859 * work.
860 */
861 if (p->flags & PF_EXITING)
862 return;
863
864 /*
865 * We do not care about task placement until a task runs on a node
866 * other than the first one used by the address space. This is
867 * largely because migrations are driven by what CPU the task
868 * is running on. If it's never scheduled on another node, it'll
869 * not migrate so why bother trapping the fault.
870 */
871 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
872 mm->first_nid = numa_node_id();
873 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
874 /* Are we running on a new node yet? */
875 if (numa_node_id() == mm->first_nid &&
876 !sched_feat_numa(NUMA_FORCE))
877 return;
878
879 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
880 }
881
882 /*
883 * Reset the scan period if enough time has gone by. Objective is that
884 * scanning will be reduced if pages are properly placed. As tasks
885 * can enter different phases this needs to be re-examined. Lacking
886 * proper tracking of reference behaviour, this blunt hammer is used.
887 */
888 migrate = mm->numa_next_reset;
889 if (time_after(now, migrate)) {
890 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
891 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
892 xchg(&mm->numa_next_reset, next_scan);
893 }
894
895 /*
896 * Enforce maximal scan/migration frequency..
897 */
898 migrate = mm->numa_next_scan;
899 if (time_before(now, migrate))
900 return;
901
902 if (p->numa_scan_period == 0)
903 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
904
905 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
906 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
907 return;
908
909 /*
910 * Do not set pte_numa if the current running node is rate-limited.
911 * This loses statistics on the fault but if we are unwilling to
912 * migrate to this node, it is less likely we can do useful work
913 */
914 if (migrate_ratelimited(numa_node_id()))
915 return;
916
917 start = mm->numa_scan_offset;
918 pages = sysctl_numa_balancing_scan_size;
919 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
920 if (!pages)
921 return;
922
923 down_read(&mm->mmap_sem);
924 vma = find_vma(mm, start);
925 if (!vma) {
926 reset_ptenuma_scan(p);
927 start = 0;
928 vma = mm->mmap;
929 }
930 for (; vma; vma = vma->vm_next) {
931 if (!vma_migratable(vma))
932 continue;
933
934 /* Skip small VMAs. They are not likely to be of relevance */
935 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
936 continue;
937
938 do {
939 start = max(start, vma->vm_start);
940 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
941 end = min(end, vma->vm_end);
942 pages -= change_prot_numa(vma, start, end);
943
944 start = end;
945 if (pages <= 0)
946 goto out;
947 } while (end != vma->vm_end);
948 }
949
950out:
951 /*
952 * It is possible to reach the end of the VMA list but the last few VMAs are
 953 * not guaranteed to be vma_migratable. If they are not, we would find the
954 * !migratable VMA on the next scan but not reset the scanner to the start
955 * so check it now.
956 */
957 if (vma)
958 mm->numa_scan_offset = start;
959 else
960 reset_ptenuma_scan(p);
961 up_read(&mm->mmap_sem);
962}
963
964/*
965 * Drive the periodic memory faults..
966 */
967void task_tick_numa(struct rq *rq, struct task_struct *curr)
968{
969 struct callback_head *work = &curr->numa_work;
970 u64 period, now;
971
972 /*
973 * We don't care about NUMA placement if we don't have memory.
974 */
975 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
976 return;
977
978 /*
979 * Using runtime rather than walltime has the dual advantage that
980 * we (mostly) drive the selection from busy threads and that the
981 * task needs to have done some actual work before we bother with
982 * NUMA placement.
983 */
984 now = curr->se.sum_exec_runtime;
985 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
986
987 if (now - curr->node_stamp > period) {
988 if (!curr->node_stamp)
989 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
990 curr->node_stamp = now;
991
992 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
993 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
994 task_work_add(curr, work, true);
995 }
996 }
997}
998#else
999static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1000{
1001}
1002#endif /* CONFIG_NUMA_BALANCING */
1003
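Two conversions in the scanner above are easy to misread: sysctl_numa_balancing_scan_size is in MB and becomes a page budget via "pages <<= 20 - PAGE_SHIFT", and task_tick_numa() paces scans by accumulated task runtime (sum_exec_runtime), not wall-clock time. A hedged sketch of the same arithmetic in isolation, assuming 4 KiB pages:

	#include <linux/types.h>

	/* Illustrative only: unit conversions used by the NUMA PTE scanner. */
	#define EXAMPLE_PAGE_SHIFT	12		/* assume 4 KiB pages */

	static unsigned long example_mb_to_pages(unsigned long mb)
	{
		/* 1 MiB = 2^20 bytes; 256 MB -> 65536 pages with 4 KiB pages */
		return mb << (20 - EXAMPLE_PAGE_SHIFT);
	}

	static int example_scan_due(u64 sum_exec_runtime, u64 node_stamp,
				    unsigned int scan_period_ms)
	{
		u64 period = (u64)scan_period_ms * 1000000ULL;	/* ms -> ns */

		/* A scan is due once the task has run scan_period_ms of CPU time. */
		return sum_exec_runtime - node_stamp > period;
	}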
779static void 1004static void
780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 1005account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
781{ 1006{
@@ -801,72 +1026,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
801} 1026}
802 1027
803#ifdef CONFIG_FAIR_GROUP_SCHED 1028#ifdef CONFIG_FAIR_GROUP_SCHED
804/* we need this in update_cfs_load and load-balance functions below */
805static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
806# ifdef CONFIG_SMP 1029# ifdef CONFIG_SMP
807static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
808 int global_update)
809{
810 struct task_group *tg = cfs_rq->tg;
811 long load_avg;
812
813 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
814 load_avg -= cfs_rq->load_contribution;
815
816 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
817 atomic_add(load_avg, &tg->load_weight);
818 cfs_rq->load_contribution += load_avg;
819 }
820}
821
822static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
823{
824 u64 period = sysctl_sched_shares_window;
825 u64 now, delta;
826 unsigned long load = cfs_rq->load.weight;
827
828 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
829 return;
830
831 now = rq_of(cfs_rq)->clock_task;
832 delta = now - cfs_rq->load_stamp;
833
834 /* truncate load history at 4 idle periods */
835 if (cfs_rq->load_stamp > cfs_rq->load_last &&
836 now - cfs_rq->load_last > 4 * period) {
837 cfs_rq->load_period = 0;
838 cfs_rq->load_avg = 0;
839 delta = period - 1;
840 }
841
842 cfs_rq->load_stamp = now;
843 cfs_rq->load_unacc_exec_time = 0;
844 cfs_rq->load_period += delta;
845 if (load) {
846 cfs_rq->load_last = now;
847 cfs_rq->load_avg += delta * load;
848 }
849
850 /* consider updating load contribution on each fold or truncate */
851 if (global_update || cfs_rq->load_period > period
852 || !cfs_rq->load_period)
853 update_cfs_rq_load_contribution(cfs_rq, global_update);
854
855 while (cfs_rq->load_period > period) {
856 /*
857 * Inline assembly required to prevent the compiler
858 * optimising this loop into a divmod call.
859 * See __iter_div_u64_rem() for another example of this.
860 */
861 asm("" : "+rm" (cfs_rq->load_period));
862 cfs_rq->load_period /= 2;
863 cfs_rq->load_avg /= 2;
864 }
865
866 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
867 list_del_leaf_cfs_rq(cfs_rq);
868}
869
870static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) 1030static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
871{ 1031{
872 long tg_weight; 1032 long tg_weight;
@@ -876,8 +1036,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
876 * to gain a more accurate current total weight. See 1036 * to gain a more accurate current total weight. See
877 * update_cfs_rq_load_contribution(). 1037 * update_cfs_rq_load_contribution().
878 */ 1038 */
879 tg_weight = atomic_read(&tg->load_weight); 1039 tg_weight = atomic64_read(&tg->load_avg);
880 tg_weight -= cfs_rq->load_contribution; 1040 tg_weight -= cfs_rq->tg_load_contrib;
881 tg_weight += cfs_rq->load.weight; 1041 tg_weight += cfs_rq->load.weight;
882 1042
883 return tg_weight; 1043 return tg_weight;
@@ -901,27 +1061,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
901 1061
902 return shares; 1062 return shares;
903} 1063}
904
905static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
906{
907 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
908 update_cfs_load(cfs_rq, 0);
909 update_cfs_shares(cfs_rq);
910 }
911}
912# else /* CONFIG_SMP */ 1064# else /* CONFIG_SMP */
913static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
914{
915}
916
917static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 1065static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
918{ 1066{
919 return tg->shares; 1067 return tg->shares;
920} 1068}
921
922static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
923{
924}
925# endif /* CONFIG_SMP */ 1069# endif /* CONFIG_SMP */
926static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 1070static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
927 unsigned long weight) 1071 unsigned long weight)
@@ -939,6 +1083,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
939 account_entity_enqueue(cfs_rq, se); 1083 account_entity_enqueue(cfs_rq, se);
940} 1084}
941 1085
1086static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1087
942static void update_cfs_shares(struct cfs_rq *cfs_rq) 1088static void update_cfs_shares(struct cfs_rq *cfs_rq)
943{ 1089{
944 struct task_group *tg; 1090 struct task_group *tg;
@@ -958,18 +1104,477 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
958 reweight_entity(cfs_rq_of(se), se, shares); 1104 reweight_entity(cfs_rq_of(se), se, shares);
959} 1105}
960#else /* CONFIG_FAIR_GROUP_SCHED */ 1106#else /* CONFIG_FAIR_GROUP_SCHED */
961static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) 1107static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
962{ 1108{
963} 1109}
1110#endif /* CONFIG_FAIR_GROUP_SCHED */
964 1111
965static inline void update_cfs_shares(struct cfs_rq *cfs_rq) 1112/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1113#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1114/*
1115 * We choose a half-life close to 1 scheduling period.
1116 * Note: The tables below are dependent on this value.
1117 */
1118#define LOAD_AVG_PERIOD 32
1119#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1120#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1121
1122/* Precomputed fixed inverse multiplies for multiplication by y^n */
1123static const u32 runnable_avg_yN_inv[] = {
1124 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1125 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1126 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1127 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1128 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1129 0x85aac367, 0x82cd8698,
1130};
1131
1132/*
1133 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1134 * over-estimates when re-combining.
1135 */
1136static const u32 runnable_avg_yN_sum[] = {
1137 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1138 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1139 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1140};
1141
1142/*
1143 * Approximate:
1144 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1145 */
1146static __always_inline u64 decay_load(u64 val, u64 n)
966{ 1147{
1148 unsigned int local_n;
1149
1150 if (!n)
1151 return val;
1152 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1153 return 0;
1154
1155 /* after bounds checking we can collapse to 32-bit */
1156 local_n = n;
1157
1158 /*
1159 * As y^PERIOD = 1/2, we can combine
1160 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1161 * With a look-up table which covers k^n (n<PERIOD)
1162 *
1163 * To achieve constant time decay_load.
1164 */
1165 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1166 val >>= local_n / LOAD_AVG_PERIOD;
1167 local_n %= LOAD_AVG_PERIOD;
1168 }
1169
1170 val *= runnable_avg_yN_inv[local_n];
1171 /* We don't use SRR here since we always want to round down. */
1172 return val >> 32;
967} 1173}
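A hedged numeric check of decay_load(): runnable_avg_yN_inv[n] holds y^n scaled by 2^32 (the "inverse multiply" replaces a division), so val * inv[n] >> 32 is val * y^n rounded down. For instance decay_load(1024, 1) = (1024 * 0xfa83b2da) >> 32 = 1002, matching runnable_avg_yN_sum[1], and decay_load(1024, 32) halves val via the shift and then multiplies by inv[0] ~ 1.0, giving 511 after the floor (the intent being val/2). A small user-space replica for experimenting:

	/* Illustrative only: user-space replica of decay_load() for experiments. */
	#include <stdint.h>
	#include <stdio.h>

	/* First two entries of runnable_avg_yN_inv[]; enough for the calls below. */
	static const uint32_t yN_inv[] = { 0xffffffff, 0xfa83b2da };

	static uint64_t decay(uint64_t val, unsigned int n)
	{
		val >>= n / 32;				/* whole half-lives: y^32 = 1/2 */
		return (val * yN_inv[n % 32]) >> 32;	/* remaining y^(n%32) factor */
	}

	int main(void)
	{
		printf("%llu\n", (unsigned long long)decay(1024, 1));	/* 1002 */
		printf("%llu\n", (unsigned long long)decay(1024, 32));	/* 511 (~512) */
		return 0;
	}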
968 1174
969static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) 1175/*
1176 * For updates fully spanning n periods, the contribution to runnable
1177 * average will be: \Sum 1024*y^n
1178 *
1179 * We can compute this reasonably efficiently by combining:
1180 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1181 */
1182static u32 __compute_runnable_contrib(u64 n)
970{ 1183{
1184 u32 contrib = 0;
1185
1186 if (likely(n <= LOAD_AVG_PERIOD))
1187 return runnable_avg_yN_sum[n];
1188 else if (unlikely(n >= LOAD_AVG_MAX_N))
1189 return LOAD_AVG_MAX;
1190
1191 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1192 do {
1193 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1194 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1195
1196 n -= LOAD_AVG_PERIOD;
1197 } while (n > LOAD_AVG_PERIOD);
1198
1199 contrib = decay_load(contrib, n);
1200 return contrib + runnable_avg_yN_sum[n];
971} 1201}
972#endif /* CONFIG_FAIR_GROUP_SCHED */ 1202
1203/*
1204 * We can represent the historical contribution to runnable average as the
1205 * coefficients of a geometric series. To do this we sub-divide our runnable
1206 * history into segments of approximately 1ms (1024us); label the segment that
1207 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1208 *
1209 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1210 * p0 p1 p2
1211 * (now) (~1ms ago) (~2ms ago)
1212 *
1213 * Let u_i denote the fraction of p_i that the entity was runnable.
1214 *
 1215 * We then designate the fractions u_i as our coefficients, yielding the
 1216 * following representation of historical load:
 1217 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 1218 *
 1219 * We choose y based on the width of a reasonable scheduling period, fixing:
1220 * y^32 = 0.5
1221 *
1222 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1223 * approximately half as much as the contribution to load within the last ms
1224 * (u_0).
1225 *
1226 * When a period "rolls over" and we have new u_0`, multiplying the previous
1227 * sum again by y is sufficient to update:
1228 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1229 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1230 */
1231static __always_inline int __update_entity_runnable_avg(u64 now,
1232 struct sched_avg *sa,
1233 int runnable)
1234{
1235 u64 delta, periods;
1236 u32 runnable_contrib;
1237 int delta_w, decayed = 0;
1238
1239 delta = now - sa->last_runnable_update;
1240 /*
1241 * This should only happen when time goes backwards, which it
1242 * unfortunately does during sched clock init when we swap over to TSC.
1243 */
1244 if ((s64)delta < 0) {
1245 sa->last_runnable_update = now;
1246 return 0;
1247 }
1248
1249 /*
1250 * Use 1024ns as the unit of measurement since it's a reasonable
1251 * approximation of 1us and fast to compute.
1252 */
1253 delta >>= 10;
1254 if (!delta)
1255 return 0;
1256 sa->last_runnable_update = now;
1257
1258 /* delta_w is the amount already accumulated against our next period */
1259 delta_w = sa->runnable_avg_period % 1024;
1260 if (delta + delta_w >= 1024) {
1261 /* period roll-over */
1262 decayed = 1;
1263
1264 /*
1265 * Now that we know we're crossing a period boundary, figure
1266 * out how much from delta we need to complete the current
1267 * period and accrue it.
1268 */
1269 delta_w = 1024 - delta_w;
1270 if (runnable)
1271 sa->runnable_avg_sum += delta_w;
1272 sa->runnable_avg_period += delta_w;
1273
1274 delta -= delta_w;
1275
1276 /* Figure out how many additional periods this update spans */
1277 periods = delta / 1024;
1278 delta %= 1024;
1279
1280 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1281 periods + 1);
1282 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1283 periods + 1);
1284
1285 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1286 runnable_contrib = __compute_runnable_contrib(periods);
1287 if (runnable)
1288 sa->runnable_avg_sum += runnable_contrib;
1289 sa->runnable_avg_period += runnable_contrib;
1290 }
1291
1292 /* Remainder of delta accrued against u_0` */
1293 if (runnable)
1294 sa->runnable_avg_sum += delta;
1295 sa->runnable_avg_period += delta;
1296
1297 return decayed;
1298}
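A hedged walk-through of the splitting above with made-up numbers: suppose delta is 3000 (units of 1024 ns) and 300 units are already accumulated in the current window (delta_w = 300). The first 724 units close that window, two further complete periods (2048 units) are folded in through decay_load() and __compute_runnable_contrib(), and the remaining 228 units start the new u_0:

	/* Illustrative only: splitting a 3000-unit delta across 1024-unit periods. */
	static void example_split(void)
	{
		unsigned int delta = 3000, delta_w = 300;	/* assumed inputs */
		unsigned int finish = 1024 - delta_w;		/* 724 close the window   */
		unsigned int rest = delta - finish;		/* 2276 left over         */
		unsigned int periods = rest / 1024;		/* 2 full periods decayed */
		unsigned int new_u0 = rest % 1024;		/* 228 start the new u_0  */

		(void)periods;
		(void)new_u0;
	}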
1299
1300/* Synchronize an entity's decay with its parenting cfs_rq.*/
1301static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1302{
1303 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1304 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1305
1306 decays -= se->avg.decay_count;
1307 if (!decays)
1308 return 0;
1309
1310 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1311 se->avg.decay_count = 0;
1312
1313 return decays;
1314}
1315
1316#ifdef CONFIG_FAIR_GROUP_SCHED
1317static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1318 int force_update)
1319{
1320 struct task_group *tg = cfs_rq->tg;
1321 s64 tg_contrib;
1322
1323 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1324 tg_contrib -= cfs_rq->tg_load_contrib;
1325
1326 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1327 atomic64_add(tg_contrib, &tg->load_avg);
1328 cfs_rq->tg_load_contrib += tg_contrib;
1329 }
1330}
1331
1332/*
1333 * Aggregate cfs_rq runnable averages into an equivalent task_group
1334 * representation for computing load contributions.
1335 */
1336static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1337 struct cfs_rq *cfs_rq)
1338{
1339 struct task_group *tg = cfs_rq->tg;
1340 long contrib;
1341
1342 /* The fraction of a cpu used by this cfs_rq */
1343 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1344 sa->runnable_avg_period + 1);
1345 contrib -= cfs_rq->tg_runnable_contrib;
1346
1347 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1348 atomic_add(contrib, &tg->runnable_avg);
1349 cfs_rq->tg_runnable_contrib += contrib;
1350 }
1351}
1352
1353static inline void __update_group_entity_contrib(struct sched_entity *se)
1354{
1355 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1356 struct task_group *tg = cfs_rq->tg;
1357 int runnable_avg;
1358
1359 u64 contrib;
1360
1361 contrib = cfs_rq->tg_load_contrib * tg->shares;
1362 se->avg.load_avg_contrib = div64_u64(contrib,
1363 atomic64_read(&tg->load_avg) + 1);
1364
1365 /*
1366 * For group entities we need to compute a correction term in the case
1367 * that they are consuming <1 cpu so that we would contribute the same
1368 * load as a task of equal weight.
1369 *
1370 * Explicitly co-ordinating this measurement would be expensive, but
 1371 * fortunately the sum of each cpu's contribution forms a usable
1372 * lower-bound on the true value.
1373 *
 1374 * Consider the aggregate of 2 contributions. Either they are disjoint
 1375 * (and the sum represents the true value) or they overlap and we are
 1376 * understating by the aggregate of their overlap.
1377 *
1378 * Extending this to N cpus, for a given overlap, the maximum amount we
 1379 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
1380 * cpus that overlap for this interval and w_i is the interval width.
1381 *
 1382 * On a small machine, the first term is well-bounded, which bounds the
 1383 * total error since w_i is a subset of the period. Whereas on a
 1384 * larger machine, while this first term can be larger, if w_i is of
 1385 * consequential size it is guaranteed to see n_i*w_i quickly converge to
 1386 * our upper bound of 1-cpu.
1387 */
1388 runnable_avg = atomic_read(&tg->runnable_avg);
1389 if (runnable_avg < NICE_0_LOAD) {
1390 se->avg.load_avg_contrib *= runnable_avg;
1391 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1392 }
1393}
1394#else
1395static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1396 int force_update) {}
1397static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1398 struct cfs_rq *cfs_rq) {}
1399static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1400#endif
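A hedged numeric reading of __update_group_entity_contrib() above: with assumed values tg->shares = 1024, tg->load_avg = 4096 and this cfs_rq contributing tg_load_contrib = 1024 (a quarter of the group's load), the group entity's load_avg_contrib comes out as 1024 * 1024 / 4097 ~= 255, roughly a quarter of the group's shares, before the <1-cpu runnable_avg correction is applied:

	#include <linux/math64.h>

	/* Illustrative only: a group entity's share of tg->shares by load ratio. */
	static u64 example_group_contrib(u64 tg_load_contrib, u64 shares, u64 tg_load_avg)
	{
		/* 1024 * 1024 / (4096 + 1) ~= 255 for the numbers above */
		return div64_u64(tg_load_contrib * shares, tg_load_avg + 1);
	}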
1401
1402static inline void __update_task_entity_contrib(struct sched_entity *se)
1403{
1404 u32 contrib;
1405
1406 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1407 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1408 contrib /= (se->avg.runnable_avg_period + 1);
1409 se->avg.load_avg_contrib = scale_load(contrib);
1410}
1411
1412/* Compute the current contribution to load_avg by se, return any delta */
1413static long __update_entity_load_avg_contrib(struct sched_entity *se)
1414{
1415 long old_contrib = se->avg.load_avg_contrib;
1416
1417 if (entity_is_task(se)) {
1418 __update_task_entity_contrib(se);
1419 } else {
1420 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1421 __update_group_entity_contrib(se);
1422 }
1423
1424 return se->avg.load_avg_contrib - old_contrib;
1425}
1426
1427static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1428 long load_contrib)
1429{
1430 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1431 cfs_rq->blocked_load_avg -= load_contrib;
1432 else
1433 cfs_rq->blocked_load_avg = 0;
1434}
1435
1436static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1437
1438/* Update a sched_entity's runnable average */
1439static inline void update_entity_load_avg(struct sched_entity *se,
1440 int update_cfs_rq)
1441{
1442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1443 long contrib_delta;
1444 u64 now;
1445
1446 /*
1447 * For a group entity we need to use their owned cfs_rq_clock_task() in
1448 * case they are the parent of a throttled hierarchy.
1449 */
1450 if (entity_is_task(se))
1451 now = cfs_rq_clock_task(cfs_rq);
1452 else
1453 now = cfs_rq_clock_task(group_cfs_rq(se));
1454
1455 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1456 return;
1457
1458 contrib_delta = __update_entity_load_avg_contrib(se);
1459
1460 if (!update_cfs_rq)
1461 return;
1462
1463 if (se->on_rq)
1464 cfs_rq->runnable_load_avg += contrib_delta;
1465 else
1466 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1467}
1468
1469/*
1470 * Decay the load contributed by all blocked children and account this so that
 1471 * their contribution may be appropriately discounted when they wake up.
1472 */
1473static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1474{
1475 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1476 u64 decays;
1477
1478 decays = now - cfs_rq->last_decay;
1479 if (!decays && !force_update)
1480 return;
1481
1482 if (atomic64_read(&cfs_rq->removed_load)) {
1483 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1484 subtract_blocked_load_contrib(cfs_rq, removed_load);
1485 }
1486
1487 if (decays) {
1488 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1489 decays);
1490 atomic64_add(decays, &cfs_rq->decay_counter);
1491 cfs_rq->last_decay = now;
1492 }
1493
1494 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1495}
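One hedged note on the unit above: cfs_rq_clock_task(cfs_rq) >> 20 counts time in 2^20 ns (~1.05 ms) buckets, the same length as one load-tracking period (1024 slots of 1024 ns), so decays is simply the number of whole periods elapsed since last_decay:

	#include <linux/types.h>

	/* Illustrative only: nanoseconds -> whole ~1ms load-tracking periods. */
	static inline u64 example_ns_to_periods(u64 now_ns)
	{
		return now_ns >> 20;	/* e.g. 5,242,880 ns -> 5 periods */
	}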
1496
1497static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1498{
1499 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1500 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1501}
1502
1503/* Add the load generated by se into cfs_rq's child load-average */
1504static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1505 struct sched_entity *se,
1506 int wakeup)
1507{
1508 /*
1509 * We track migrations using entity decay_count <= 0, on a wake-up
1510 * migration we use a negative decay count to track the remote decays
1511 * accumulated while sleeping.
1512 */
1513 if (unlikely(se->avg.decay_count <= 0)) {
1514 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1515 if (se->avg.decay_count) {
1516 /*
1517 * In a wake-up migration we have to approximate the
1518 * time sleeping. This is because we can't synchronize
1519 * clock_task between the two cpus, and it is not
1520 * guaranteed to be read-safe. Instead, we can
1521 * approximate this using our carried decays, which are
1522 * explicitly atomically readable.
1523 */
1524 se->avg.last_runnable_update -= (-se->avg.decay_count)
1525 << 20;
1526 update_entity_load_avg(se, 0);
1527 /* Indicate that we're now synchronized and on-rq */
1528 se->avg.decay_count = 0;
1529 }
1530 wakeup = 0;
1531 } else {
1532 __synchronize_entity_decay(se);
1533 }
1534
1535 /* migrated tasks did not contribute to our blocked load */
1536 if (wakeup) {
1537 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1538 update_entity_load_avg(se, 0);
1539 }
1540
1541 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1542 /* we force update consideration on load-balancer moves */
1543 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1544}
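The (-se->avg.decay_count) << 20 rewind above is easier to see with numbers: one decay period in this scheme is 2^20 ns (roughly 1 ms, matching the >> 20 in update_cfs_rq_blocked_load() below), so an entity that missed, say, 48 remote decay periods while asleep has its last_runnable_update pushed back by 48 * 2^20 ns before update_entity_load_avg() re-decays it locally. A minimal stand-alone sketch of that arithmetic (plain C, all values made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define DECAY_PERIOD_SHIFT 20	/* one decay period is 2^20 ns, ~1 ms */

int main(void)
{
	int64_t carried_decays = 48;		/* hypothetical remote decays */
	uint64_t now = 1000000000ULL;		/* hypothetical clock_task, in ns */
	uint64_t last_update = now;

	/* Rewind last_runnable_update to approximate the time spent asleep. */
	last_update -= (uint64_t)carried_decays << DECAY_PERIOD_SHIFT;

	printf("approximated sleep time: %llu ns\n",
	       (unsigned long long)(now - last_update));	/* ~50.3 ms */
	return 0;
}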
1545
1546/*
 1547 * Remove se's load from this cfs_rq child load-average; if the entity is
1548 * transitioning to a blocked state we track its projected decay using
1549 * blocked_load_avg.
1550 */
1551static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1552 struct sched_entity *se,
1553 int sleep)
1554{
1555 update_entity_load_avg(se, 1);
1556 /* we force update consideration on load-balancer moves */
1557 update_cfs_rq_blocked_load(cfs_rq, !sleep);
1558
1559 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1560 if (sleep) {
1561 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564}
1565#else
1566static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {}
1568static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1569static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1570 struct sched_entity *se,
1571 int wakeup) {}
1572static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1573 struct sched_entity *se,
1574 int sleep) {}
1575static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1576 int force_update) {}
1577#endif
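Taken together, these helpers mean a blocked entity keeps contributing to blocked_load_avg, but with a contribution that halves roughly every 32 ms: update_cfs_rq_blocked_load() folds the missed ~1 ms periods into decay_load(). Below is a floating-point approximation of that behaviour, assuming the decay factor y is chosen so that y^32 = 1/2 as stated elsewhere in this series; the kernel itself uses fixed-point lookup tables, not libm (build the sketch with -lm):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Approximation only: the kernel uses precomputed fixed-point tables. */
static uint64_t decay_load_approx(uint64_t load, unsigned int periods)
{
	double y = pow(0.5, 1.0 / 32.0);	/* y^32 == 1/2 */
	return (uint64_t)(load * pow(y, (double)periods));
}

int main(void)
{
	uint64_t blocked = 1024;	/* hypothetical blocked_load_avg */

	printf("after 32 periods: %llu\n",	/* ~512 */
	       (unsigned long long)decay_load_approx(blocked, 32));
	printf("after 64 periods: %llu\n",	/* ~256 */
	       (unsigned long long)decay_load_approx(blocked, 64));
	return 0;
}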
973 1578
974static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 1579static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
975{ 1580{
@@ -1096,7 +1701,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1096 * Update run-time statistics of the 'current'. 1701 * Update run-time statistics of the 'current'.
1097 */ 1702 */
1098 update_curr(cfs_rq); 1703 update_curr(cfs_rq);
1099 update_cfs_load(cfs_rq, 0); 1704 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1100 account_entity_enqueue(cfs_rq, se); 1705 account_entity_enqueue(cfs_rq, se);
1101 update_cfs_shares(cfs_rq); 1706 update_cfs_shares(cfs_rq);
1102 1707
@@ -1171,6 +1776,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1171 * Update run-time statistics of the 'current'. 1776 * Update run-time statistics of the 'current'.
1172 */ 1777 */
1173 update_curr(cfs_rq); 1778 update_curr(cfs_rq);
1779 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1174 1780
1175 update_stats_dequeue(cfs_rq, se); 1781 update_stats_dequeue(cfs_rq, se);
1176 if (flags & DEQUEUE_SLEEP) { 1782 if (flags & DEQUEUE_SLEEP) {
@@ -1191,7 +1797,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1191 if (se != cfs_rq->curr) 1797 if (se != cfs_rq->curr)
1192 __dequeue_entity(cfs_rq, se); 1798 __dequeue_entity(cfs_rq, se);
1193 se->on_rq = 0; 1799 se->on_rq = 0;
1194 update_cfs_load(cfs_rq, 0);
1195 account_entity_dequeue(cfs_rq, se); 1800 account_entity_dequeue(cfs_rq, se);
1196 1801
1197 /* 1802 /*
@@ -1340,6 +1945,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1340 update_stats_wait_start(cfs_rq, prev); 1945 update_stats_wait_start(cfs_rq, prev);
1341 /* Put 'current' back into the tree. */ 1946 /* Put 'current' back into the tree. */
1342 __enqueue_entity(cfs_rq, prev); 1947 __enqueue_entity(cfs_rq, prev);
1948 /* in !on_rq case, update occurred at dequeue */
1949 update_entity_load_avg(prev, 1);
1343 } 1950 }
1344 cfs_rq->curr = NULL; 1951 cfs_rq->curr = NULL;
1345} 1952}
@@ -1353,9 +1960,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1353 update_curr(cfs_rq); 1960 update_curr(cfs_rq);
1354 1961
1355 /* 1962 /*
1356 * Update share accounting for long-running entities. 1963 * Ensure that runnable average is periodically updated.
1357 */ 1964 */
1358 update_entity_shares_tick(cfs_rq); 1965 update_entity_load_avg(curr, 1);
1966 update_cfs_rq_blocked_load(cfs_rq, 1);
1359 1967
1360#ifdef CONFIG_SCHED_HRTICK 1968#ifdef CONFIG_SCHED_HRTICK
1361 /* 1969 /*
@@ -1448,6 +2056,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1448 return &tg->cfs_bandwidth; 2056 return &tg->cfs_bandwidth;
1449} 2057}
1450 2058
 2059/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2060static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2061{
2062 if (unlikely(cfs_rq->throttle_count))
2063 return cfs_rq->throttled_clock_task;
2064
2065 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2066}
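With made-up numbers: if rq->clock_task has advanced 10 s and this group has accumulated 2 s of throttled_clock_task_time, cfs_rq_clock_task() reports 8 s; while the hierarchy is throttled it stays pinned at throttled_clock_task, so the runnable averages above do not decay for time spent throttled. A trivial sketch of the unthrottled path:

#include <stdio.h>

int main(void)
{
	unsigned long long clock_task = 10000000000ULL;		/* 10 s, hypothetical */
	unsigned long long throttled_time = 2000000000ULL;	/* 2 s spent throttled */

	/* Throttled time is simply subtracted from the task clock. */
	printf("cfs_rq task clock: %llu ns\n", clock_task - throttled_time);
	return 0;
}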
2067
1451/* returns 0 on failure to allocate runtime */ 2068/* returns 0 on failure to allocate runtime */
1452static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 2069static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1453{ 2070{
@@ -1592,14 +2209,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
1592 cfs_rq->throttle_count--; 2209 cfs_rq->throttle_count--;
1593#ifdef CONFIG_SMP 2210#ifdef CONFIG_SMP
1594 if (!cfs_rq->throttle_count) { 2211 if (!cfs_rq->throttle_count) {
1595 u64 delta = rq->clock_task - cfs_rq->load_stamp; 2212 /* adjust cfs_rq_clock_task() */
1596 2213 cfs_rq->throttled_clock_task_time += rq->clock_task -
1597 /* leaving throttled state, advance shares averaging windows */ 2214 cfs_rq->throttled_clock_task;
1598 cfs_rq->load_stamp += delta;
1599 cfs_rq->load_last += delta;
1600
1601 /* update entity weight now that we are on_rq again */
1602 update_cfs_shares(cfs_rq);
1603 } 2215 }
1604#endif 2216#endif
1605 2217
@@ -1611,9 +2223,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
1611 struct rq *rq = data; 2223 struct rq *rq = data;
1612 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 2224 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1613 2225
1614 /* group is entering throttled state, record last load */ 2226 /* group is entering throttled state, stop time */
1615 if (!cfs_rq->throttle_count) 2227 if (!cfs_rq->throttle_count)
1616 update_cfs_load(cfs_rq, 0); 2228 cfs_rq->throttled_clock_task = rq->clock_task;
1617 cfs_rq->throttle_count++; 2229 cfs_rq->throttle_count++;
1618 2230
1619 return 0; 2231 return 0;
@@ -1628,7 +2240,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1628 2240
1629 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 2241 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1630 2242
1631 /* account load preceding throttle */ 2243 /* freeze hierarchy runnable averages while throttled */
1632 rcu_read_lock(); 2244 rcu_read_lock();
1633 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); 2245 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1634 rcu_read_unlock(); 2246 rcu_read_unlock();
@@ -1652,7 +2264,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1652 rq->nr_running -= task_delta; 2264 rq->nr_running -= task_delta;
1653 2265
1654 cfs_rq->throttled = 1; 2266 cfs_rq->throttled = 1;
1655 cfs_rq->throttled_timestamp = rq->clock; 2267 cfs_rq->throttled_clock = rq->clock;
1656 raw_spin_lock(&cfs_b->lock); 2268 raw_spin_lock(&cfs_b->lock);
1657 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 2269 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1658 raw_spin_unlock(&cfs_b->lock); 2270 raw_spin_unlock(&cfs_b->lock);
@@ -1670,10 +2282,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1670 2282
1671 cfs_rq->throttled = 0; 2283 cfs_rq->throttled = 0;
1672 raw_spin_lock(&cfs_b->lock); 2284 raw_spin_lock(&cfs_b->lock);
1673 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; 2285 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
1674 list_del_rcu(&cfs_rq->throttled_list); 2286 list_del_rcu(&cfs_rq->throttled_list);
1675 raw_spin_unlock(&cfs_b->lock); 2287 raw_spin_unlock(&cfs_b->lock);
1676 cfs_rq->throttled_timestamp = 0;
1677 2288
1678 update_rq_clock(rq); 2289 update_rq_clock(rq);
1679 /* update hierarchical throttle state */ 2290 /* update hierarchical throttle state */
@@ -2073,8 +2684,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)
2073} 2684}
2074 2685
2075#else /* CONFIG_CFS_BANDWIDTH */ 2686#else /* CONFIG_CFS_BANDWIDTH */
2076static __always_inline 2687static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} 2688{
2689 return rq_of(cfs_rq)->clock_task;
2690}
2691
2692static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2693 unsigned long delta_exec) {}
2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2694static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 2695static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2696static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2207,12 +2823,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2207 if (cfs_rq_throttled(cfs_rq)) 2823 if (cfs_rq_throttled(cfs_rq))
2208 break; 2824 break;
2209 2825
2210 update_cfs_load(cfs_rq, 0);
2211 update_cfs_shares(cfs_rq); 2826 update_cfs_shares(cfs_rq);
2827 update_entity_load_avg(se, 1);
2212 } 2828 }
2213 2829
2214 if (!se) 2830 if (!se) {
2831 update_rq_runnable_avg(rq, rq->nr_running);
2215 inc_nr_running(rq); 2832 inc_nr_running(rq);
2833 }
2216 hrtick_update(rq); 2834 hrtick_update(rq);
2217} 2835}
2218 2836
@@ -2266,12 +2884,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2266 if (cfs_rq_throttled(cfs_rq)) 2884 if (cfs_rq_throttled(cfs_rq))
2267 break; 2885 break;
2268 2886
2269 update_cfs_load(cfs_rq, 0);
2270 update_cfs_shares(cfs_rq); 2887 update_cfs_shares(cfs_rq);
2888 update_entity_load_avg(se, 1);
2271 } 2889 }
2272 2890
2273 if (!se) 2891 if (!se) {
2274 dec_nr_running(rq); 2892 dec_nr_running(rq);
2893 update_rq_runnable_avg(rq, 1);
2894 }
2275 hrtick_update(rq); 2895 hrtick_update(rq);
2276} 2896}
2277 2897
@@ -2700,7 +3320,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2700 int prev_cpu = task_cpu(p); 3320 int prev_cpu = task_cpu(p);
2701 int new_cpu = cpu; 3321 int new_cpu = cpu;
2702 int want_affine = 0; 3322 int want_affine = 0;
2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 3323 int sync = wake_flags & WF_SYNC;
2705 3324
2706 if (p->nr_cpus_allowed == 1) 3325 if (p->nr_cpus_allowed == 1)
@@ -2718,48 +3337,21 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2718 continue; 3337 continue;
2719 3338
2720 /* 3339 /*
2721 * If power savings logic is enabled for a domain, see if we
2722 * are not overloaded, if so, don't balance wider.
2723 */
2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0;
2726 unsigned long nr_running = 0;
2727 unsigned long capacity;
2728 int i;
2729
2730 for_each_cpu(i, sched_domain_span(tmp)) {
2731 power += power_of(i);
2732 nr_running += cpu_rq(i)->cfs.nr_running;
2733 }
2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736
2737 if (nr_running < capacity)
2738 want_sd = 0;
2739 }
2740
2741 /*
2742 * If both cpu and prev_cpu are part of this domain, 3340 * If both cpu and prev_cpu are part of this domain,
2743 * cpu is a valid SD_WAKE_AFFINE target. 3341 * cpu is a valid SD_WAKE_AFFINE target.
2744 */ 3342 */
2745 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 3343 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
2746 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 3344 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
2747 affine_sd = tmp; 3345 affine_sd = tmp;
2748 want_affine = 0;
2749 }
2750
2751 if (!want_sd && !want_affine)
2752 break; 3346 break;
3347 }
2753 3348
2754 if (!(tmp->flags & sd_flag)) 3349 if (tmp->flags & sd_flag)
2755 continue;
2756
2757 if (want_sd)
2758 sd = tmp; 3350 sd = tmp;
2759 } 3351 }
2760 3352
2761 if (affine_sd) { 3353 if (affine_sd) {
2762 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 3354 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
2763 prev_cpu = cpu; 3355 prev_cpu = cpu;
2764 3356
2765 new_cpu = select_idle_sibling(p, prev_cpu); 3357 new_cpu = select_idle_sibling(p, prev_cpu);
@@ -2809,6 +3401,37 @@ unlock:
2809 3401
2810 return new_cpu; 3402 return new_cpu;
2811} 3403}
3404
3405/*
 3406 * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
 3407 * may be removed once load-tracking is useful to applications beyond shares
 3408 * distribution (e.g. load-balance).
3409 */
3410#ifdef CONFIG_FAIR_GROUP_SCHED
3411/*
3412 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3413 * cfs_rq_of(p) references at time of call are still valid and identify the
3414 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3415 * other assumptions, including the state of rq->lock, should be made.
3416 */
3417static void
3418migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3419{
3420 struct sched_entity *se = &p->se;
3421 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3422
3423 /*
3424 * Load tracking: accumulate removed load so that it can be processed
3425 * when we next update owning cfs_rq under rq->lock. Tasks contribute
3426 * to blocked load iff they have a positive decay-count. It can never
3427 * be negative here since on-rq tasks have decay-count == 0.
3428 */
3429 if (se->avg.decay_count) {
3430 se->avg.decay_count = -__synchronize_entity_decay(se);
3431 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3432 }
3433}
3434#endif
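The interesting part of migrate_task_rq_fair() is that it runs with only p->pi_lock held: the departing task publishes its contribution with an atomic add to removed_load, and the next update_cfs_rq_blocked_load() on the old cpu drains it with an exchange under rq->lock (the atomic64_xchg() earlier in this patch). A generic user-space sketch of that hand-off pattern, using C11 atomics purely for illustration:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long long removed_load;

static void migrate_out(long long contrib)
{
	atomic_fetch_add(&removed_load, contrib);	/* no rq->lock needed */
}

static long long drain_removed_load(void)
{
	return atomic_exchange(&removed_load, 0);	/* done under rq->lock */
}

int main(void)
{
	migrate_out(300);
	migrate_out(200);
	printf("drained: %lld\n", drain_removed_load());	/* 500, counted exactly once */
	return 0;
}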
2812#endif /* CONFIG_SMP */ 3435#endif /* CONFIG_SMP */
2813 3436
2814static unsigned long 3437static unsigned long
@@ -2935,7 +3558,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
2935 * Batch and idle tasks do not preempt non-idle tasks (their preemption 3558 * Batch and idle tasks do not preempt non-idle tasks (their preemption
2936 * is driven by the tick): 3559 * is driven by the tick):
2937 */ 3560 */
2938 if (unlikely(p->policy != SCHED_NORMAL)) 3561 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
2939 return; 3562 return;
2940 3563
2941 find_matching_se(&se, &pse); 3564 find_matching_se(&se, &pse);
@@ -3061,8 +3684,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3061 3684
3062#ifdef CONFIG_SMP 3685#ifdef CONFIG_SMP
3063/************************************************** 3686/**************************************************
3064 * Fair scheduling class load-balancing methods: 3687 * Fair scheduling class load-balancing methods.
3065 */ 3688 *
3689 * BASICS
3690 *
3691 * The purpose of load-balancing is to achieve the same basic fairness the
3692 * per-cpu scheduler provides, namely provide a proportional amount of compute
3693 * time to each task. This is expressed in the following equation:
3694 *
3695 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3696 *
3697 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3698 * W_i,0 is defined as:
3699 *
3700 * W_i,0 = \Sum_j w_i,j (2)
3701 *
3702 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3703 * is derived from the nice value as per prio_to_weight[].
3704 *
3705 * The weight average is an exponential decay average of the instantaneous
3706 * weight:
3707 *
3708 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3709 *
 3710 * P_i is the cpu power (or compute capacity) of cpu i; typically it is the
3711 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3712 * can also include other factors [XXX].
3713 *
3714 * To achieve this balance we define a measure of imbalance which follows
3715 * directly from (1):
3716 *
3717 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3718 *
 3719 * We then move tasks around to minimize the imbalance. In the continuous
 3720 * function space it is obvious this converges; in the discrete case we get
3721 * a few fun cases generally called infeasible weight scenarios.
3722 *
3723 * [XXX expand on:
3724 * - infeasible weights;
3725 * - local vs global optima in the discrete case. ]
3726 *
3727 *
3728 * SCHED DOMAINS
3729 *
3730 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3731 * for all i,j solution, we create a tree of cpus that follows the hardware
3732 * topology where each level pairs two lower groups (or better). This results
3733 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3734 * tree to only the first of the previous level and we decrease the frequency
 3735 * of load-balance at each level inversely proportional to the number of cpus in
3736 * the groups.
3737 *
3738 * This yields:
3739 *
 3740 *    \Sum_{i = 0}^{log_2 n}  (1/2^i) * (n/2^i) * 2^i  =  O(n)          (5)
 3741 *
 3742 *   where, at each level i of the tree:
 3743 *     1/2^i  - the load-balance frequency,
 3744 *     n/2^i  - the number of cpus doing load-balance,
 3745 *     2^i    - the size of each group,
 3746 *   and the sum runs over all levels.
3747 *
3748 * Coupled with a limit on how many tasks we can migrate every balance pass,
3749 * this makes (5) the runtime complexity of the balancer.
3750 *
3751 * An important property here is that each CPU is still (indirectly) connected
3752 * to every other cpu in at most O(log n) steps:
3753 *
3754 * The adjacency matrix of the resulting graph is given by:
3755 *
 3756 *
 3757 *   A_i,j = \Union_{k = 0}^{log_2 n} (i % 2^k == 0) && i/2^(k+1) == j/2^(k+1)   (6)
 3758 *
3759 *
3760 * And you'll find that:
3761 *
3762 * A^(log_2 n)_i,j != 0 for all i,j (7)
3763 *
3764 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3765 * The task movement gives a factor of O(m), giving a convergence complexity
3766 * of:
3767 *
3768 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3769 *
3770 *
3771 * WORK CONSERVING
3772 *
3773 * In order to avoid CPUs going idle while there's still work to do, new idle
3774 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3775 * tree itself instead of relying on other CPUs to bring it work.
3776 *
3777 * This adds some complexity to both (5) and (8) but it reduces the total idle
3778 * time.
3779 *
3780 * [XXX more?]
3781 *
3782 *
3783 * CGROUPS
3784 *
 3785 * Cgroups make a horror show out of (2); instead of a simple sum we get:
3786 *
 3787 *
 3788 *   W_i,0 = \Sum_j \Prod_k w_k * (s_k,i / S_k)                          (9)
 3789 *
3790 *
3791 * Where
3792 *
3793 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3794 *
3795 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3796 *
 3797 * The big problem is S_k: it's a global sum needed to compute a local (W_i)
3798 * property.
3799 *
3800 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3801 * rewrite all of this once again.]
3802 */
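A small worked example of imbalance equation (4), with made-up numbers: two cpus of equal power P = 1024 (one SCHED_POWER_SCALE unit), one running three nice-0 tasks (W = 3072) and one running a single nice-0 task (W = 1024). Then avg(W/P) = 2 and imb = max{2, 3} - min{2, 1} = 2, so moving one task's worth of weight restores (1). The same computation as a stand-alone sketch:

#include <stdio.h>

int main(void)
{
	double W1 = 3 * 1024.0, P1 = 1024.0;	/* three nice-0 tasks */
	double W2 = 1 * 1024.0, P2 = 1024.0;	/* one nice-0 task */
	double avg = (W1 / P1 + W2 / P2) / 2.0;

	double imb = (W1 / P1 > avg ? W1 / P1 : avg) -
		     (W2 / P2 < avg ? W2 / P2 : avg);

	printf("imbalance = %.1f\n", imb);	/* 2.0 */
	return 0;
}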
3066 3803
3067static unsigned long __read_mostly max_load_balance_interval = HZ/10; 3804static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3068 3805
@@ -3328,52 +4065,58 @@ next:
3328/* 4065/*
3329 * update tg->load_weight by folding this cpu's load_avg 4066 * update tg->load_weight by folding this cpu's load_avg
3330 */ 4067 */
3331static int update_shares_cpu(struct task_group *tg, int cpu) 4068static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
3332{ 4069{
3333 struct cfs_rq *cfs_rq; 4070 struct sched_entity *se = tg->se[cpu];
3334 unsigned long flags; 4071 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
3335 struct rq *rq;
3336
3337 if (!tg->se[cpu])
3338 return 0;
3339
3340 rq = cpu_rq(cpu);
3341 cfs_rq = tg->cfs_rq[cpu];
3342
3343 raw_spin_lock_irqsave(&rq->lock, flags);
3344
3345 update_rq_clock(rq);
3346 update_cfs_load(cfs_rq, 1);
3347 4072
3348 /* 4073 /* throttled entities do not contribute to load */
3349 * We need to update shares after updating tg->load_weight in 4074 if (throttled_hierarchy(cfs_rq))
3350 * order to adjust the weight of groups with long running tasks. 4075 return;
3351 */
3352 update_cfs_shares(cfs_rq);
3353 4076
3354 raw_spin_unlock_irqrestore(&rq->lock, flags); 4077 update_cfs_rq_blocked_load(cfs_rq, 1);
3355 4078
3356 return 0; 4079 if (se) {
4080 update_entity_load_avg(se, 1);
4081 /*
4082 * We pivot on our runnable average having decayed to zero for
4083 * list removal. This generally implies that all our children
4084 * have also been removed (modulo rounding error or bandwidth
4085 * control); however, such cases are rare and we can fix these
4086 * at enqueue.
4087 *
4088 * TODO: fix up out-of-order children on enqueue.
4089 */
4090 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4091 list_del_leaf_cfs_rq(cfs_rq);
4092 } else {
4093 struct rq *rq = rq_of(cfs_rq);
4094 update_rq_runnable_avg(rq, rq->nr_running);
4095 }
3357} 4096}
3358 4097
3359static void update_shares(int cpu) 4098static void update_blocked_averages(int cpu)
3360{ 4099{
3361 struct cfs_rq *cfs_rq;
3362 struct rq *rq = cpu_rq(cpu); 4100 struct rq *rq = cpu_rq(cpu);
4101 struct cfs_rq *cfs_rq;
4102 unsigned long flags;
3363 4103
3364 rcu_read_lock(); 4104 raw_spin_lock_irqsave(&rq->lock, flags);
4105 update_rq_clock(rq);
3365 /* 4106 /*
3366 * Iterates the task_group tree in a bottom up fashion, see 4107 * Iterates the task_group tree in a bottom up fashion, see
3367 * list_add_leaf_cfs_rq() for details. 4108 * list_add_leaf_cfs_rq() for details.
3368 */ 4109 */
3369 for_each_leaf_cfs_rq(rq, cfs_rq) { 4110 for_each_leaf_cfs_rq(rq, cfs_rq) {
3370 /* throttled entities do not contribute to load */ 4111 /*
3371 if (throttled_hierarchy(cfs_rq)) 4112 * Note: We may want to consider periodically releasing
 3372			continue; 4113		 * rq->lock around these updates so that creating many task
3373 4114 * groups does not result in continually extending hold time.
3374 update_shares_cpu(cfs_rq->tg, cpu); 4115 */
4116 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
3375 } 4117 }
3376 rcu_read_unlock(); 4118
4119 raw_spin_unlock_irqrestore(&rq->lock, flags);
3377} 4120}
3378 4121
3379/* 4122/*
@@ -3425,7 +4168,7 @@ static unsigned long task_h_load(struct task_struct *p)
3425 return load; 4168 return load;
3426} 4169}
3427#else 4170#else
3428static inline void update_shares(int cpu) 4171static inline void update_blocked_averages(int cpu)
3429{ 4172{
3430} 4173}
3431 4174
@@ -4295,7 +5038,7 @@ redo:
4295 goto out_balanced; 5038 goto out_balanced;
4296 } 5039 }
4297 5040
4298 BUG_ON(busiest == this_rq); 5041 BUG_ON(busiest == env.dst_rq);
4299 5042
4300 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 5043 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4301 5044
@@ -4316,7 +5059,7 @@ redo:
4316 update_h_load(env.src_cpu); 5059 update_h_load(env.src_cpu);
4317more_balance: 5060more_balance:
4318 local_irq_save(flags); 5061 local_irq_save(flags);
4319 double_rq_lock(this_rq, busiest); 5062 double_rq_lock(env.dst_rq, busiest);
4320 5063
4321 /* 5064 /*
4322 * cur_ld_moved - load moved in current iteration 5065 * cur_ld_moved - load moved in current iteration
@@ -4324,7 +5067,7 @@ more_balance:
4324 */ 5067 */
4325 cur_ld_moved = move_tasks(&env); 5068 cur_ld_moved = move_tasks(&env);
4326 ld_moved += cur_ld_moved; 5069 ld_moved += cur_ld_moved;
4327 double_rq_unlock(this_rq, busiest); 5070 double_rq_unlock(env.dst_rq, busiest);
4328 local_irq_restore(flags); 5071 local_irq_restore(flags);
4329 5072
4330 if (env.flags & LBF_NEED_BREAK) { 5073 if (env.flags & LBF_NEED_BREAK) {
@@ -4360,8 +5103,7 @@ more_balance:
4360 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && 5103 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4361 lb_iterations++ < max_lb_iterations) { 5104 lb_iterations++ < max_lb_iterations) {
4362 5105
4363 this_rq = cpu_rq(env.new_dst_cpu); 5106 env.dst_rq = cpu_rq(env.new_dst_cpu);
4364 env.dst_rq = this_rq;
4365 env.dst_cpu = env.new_dst_cpu; 5107 env.dst_cpu = env.new_dst_cpu;
4366 env.flags &= ~LBF_SOME_PINNED; 5108 env.flags &= ~LBF_SOME_PINNED;
4367 env.loop = 0; 5109 env.loop = 0;
@@ -4486,12 +5228,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
4486 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5228 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4487 return; 5229 return;
4488 5230
5231 update_rq_runnable_avg(this_rq, 1);
5232
4489 /* 5233 /*
4490 * Drop the rq->lock, but keep IRQ/preempt disabled. 5234 * Drop the rq->lock, but keep IRQ/preempt disabled.
4491 */ 5235 */
4492 raw_spin_unlock(&this_rq->lock); 5236 raw_spin_unlock(&this_rq->lock);
4493 5237
4494 update_shares(this_cpu); 5238 update_blocked_averages(this_cpu);
4495 rcu_read_lock(); 5239 rcu_read_lock();
4496 for_each_domain(this_cpu, sd) { 5240 for_each_domain(this_cpu, sd) {
4497 unsigned long interval; 5241 unsigned long interval;
@@ -4646,7 +5390,7 @@ static void nohz_balancer_kick(int cpu)
4646 return; 5390 return;
4647} 5391}
4648 5392
4649static inline void clear_nohz_tick_stopped(int cpu) 5393static inline void nohz_balance_exit_idle(int cpu)
4650{ 5394{
4651 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 5395 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
4652 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 5396 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
@@ -4686,28 +5430,23 @@ void set_cpu_sd_state_idle(void)
4686} 5430}
4687 5431
4688/* 5432/*
4689 * This routine will record that this cpu is going idle with tick stopped. 5433 * This routine will record that the cpu is going idle with tick stopped.
4690 * This info will be used in performing idle load balancing in the future. 5434 * This info will be used in performing idle load balancing in the future.
4691 */ 5435 */
4692void select_nohz_load_balancer(int stop_tick) 5436void nohz_balance_enter_idle(int cpu)
4693{ 5437{
4694 int cpu = smp_processor_id();
4695
4696 /* 5438 /*
4697 * If this cpu is going down, then nothing needs to be done. 5439 * If this cpu is going down, then nothing needs to be done.
4698 */ 5440 */
4699 if (!cpu_active(cpu)) 5441 if (!cpu_active(cpu))
4700 return; 5442 return;
4701 5443
4702 if (stop_tick) { 5444 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4703 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 5445 return;
4704 return;
4705 5446
4706 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 5447 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4707 atomic_inc(&nohz.nr_cpus); 5448 atomic_inc(&nohz.nr_cpus);
4708 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 5449 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4709 }
4710 return;
4711} 5450}
4712 5451
4713static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, 5452static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
@@ -4715,7 +5454,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4715{ 5454{
4716 switch (action & ~CPU_TASKS_FROZEN) { 5455 switch (action & ~CPU_TASKS_FROZEN) {
4717 case CPU_DYING: 5456 case CPU_DYING:
4718 clear_nohz_tick_stopped(smp_processor_id()); 5457 nohz_balance_exit_idle(smp_processor_id());
4719 return NOTIFY_OK; 5458 return NOTIFY_OK;
4720 default: 5459 default:
4721 return NOTIFY_DONE; 5460 return NOTIFY_DONE;
@@ -4751,7 +5490,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4751 int update_next_balance = 0; 5490 int update_next_balance = 0;
4752 int need_serialize; 5491 int need_serialize;
4753 5492
4754 update_shares(cpu); 5493 update_blocked_averages(cpu);
4755 5494
4756 rcu_read_lock(); 5495 rcu_read_lock();
4757 for_each_domain(cpu, sd) { 5496 for_each_domain(cpu, sd) {
@@ -4837,14 +5576,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4837 if (need_resched()) 5576 if (need_resched())
4838 break; 5577 break;
4839 5578
4840 raw_spin_lock_irq(&this_rq->lock); 5579 rq = cpu_rq(balance_cpu);
4841 update_rq_clock(this_rq); 5580
4842 update_idle_cpu_load(this_rq); 5581 raw_spin_lock_irq(&rq->lock);
4843 raw_spin_unlock_irq(&this_rq->lock); 5582 update_rq_clock(rq);
5583 update_idle_cpu_load(rq);
5584 raw_spin_unlock_irq(&rq->lock);
4844 5585
4845 rebalance_domains(balance_cpu, CPU_IDLE); 5586 rebalance_domains(balance_cpu, CPU_IDLE);
4846 5587
4847 rq = cpu_rq(balance_cpu);
4848 if (time_after(this_rq->next_balance, rq->next_balance)) 5588 if (time_after(this_rq->next_balance, rq->next_balance))
4849 this_rq->next_balance = rq->next_balance; 5589 this_rq->next_balance = rq->next_balance;
4850 } 5590 }
@@ -4875,7 +5615,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
4875 * busy tick after returning from idle, we will update the busy stats. 5615 * busy tick after returning from idle, we will update the busy stats.
4876 */ 5616 */
4877 set_cpu_sd_state_busy(); 5617 set_cpu_sd_state_busy();
4878 clear_nohz_tick_stopped(cpu); 5618 nohz_balance_exit_idle(cpu);
4879 5619
4880 /* 5620 /*
4881 * None are in tickless mode and hence no need for NOHZ idle load 5621 * None are in tickless mode and hence no need for NOHZ idle load
@@ -4987,6 +5727,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4987 cfs_rq = cfs_rq_of(se); 5727 cfs_rq = cfs_rq_of(se);
4988 entity_tick(cfs_rq, se, queued); 5728 entity_tick(cfs_rq, se, queued);
4989 } 5729 }
5730
5731 if (sched_feat_numa(NUMA))
5732 task_tick_numa(rq, curr);
5733
5734 update_rq_runnable_avg(rq, 1);
4990} 5735}
4991 5736
4992/* 5737/*
@@ -5079,6 +5824,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5079 place_entity(cfs_rq, se, 0); 5824 place_entity(cfs_rq, se, 0);
5080 se->vruntime -= cfs_rq->min_vruntime; 5825 se->vruntime -= cfs_rq->min_vruntime;
5081 } 5826 }
5827
5828#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5829 /*
5830 * Remove our load from contribution when we leave sched_fair
5831 * and ensure we don't carry in an old decay_count if we
5832 * switch back.
5833 */
5834 if (p->se.avg.decay_count) {
5835 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5836 __synchronize_entity_decay(&p->se);
5837 subtract_blocked_load_contrib(cfs_rq,
5838 p->se.avg.load_avg_contrib);
5839 }
5840#endif
5082} 5841}
5083 5842
5084/* 5843/*
@@ -5125,11 +5884,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
5125#ifndef CONFIG_64BIT 5884#ifndef CONFIG_64BIT
5126 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5885 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5127#endif 5886#endif
5887#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5888 atomic64_set(&cfs_rq->decay_counter, 1);
5889 atomic64_set(&cfs_rq->removed_load, 0);
5890#endif
5128} 5891}
5129 5892
5130#ifdef CONFIG_FAIR_GROUP_SCHED 5893#ifdef CONFIG_FAIR_GROUP_SCHED
5131static void task_move_group_fair(struct task_struct *p, int on_rq) 5894static void task_move_group_fair(struct task_struct *p, int on_rq)
5132{ 5895{
5896 struct cfs_rq *cfs_rq;
5133 /* 5897 /*
5134 * If the task was not on the rq at the time of this cgroup movement 5898 * If the task was not on the rq at the time of this cgroup movement
5135 * it must have been asleep, sleeping tasks keep their ->vruntime 5899 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5161,8 +5925,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5161 if (!on_rq) 5925 if (!on_rq)
5162 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5926 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5163 set_task_rq(p, task_cpu(p)); 5927 set_task_rq(p, task_cpu(p));
5164 if (!on_rq) 5928 if (!on_rq) {
5165 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5929 cfs_rq = cfs_rq_of(&p->se);
5930 p->se.vruntime += cfs_rq->min_vruntime;
5931#ifdef CONFIG_SMP
5932 /*
5933 * migrate_task_rq_fair() will have removed our previous
5934 * contribution, but we must synchronize for ongoing future
5935 * decay.
5936 */
5937 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5938 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5939#endif
5940 }
5166} 5941}
5167 5942
5168void free_fair_sched_group(struct task_group *tg) 5943void free_fair_sched_group(struct task_group *tg)
@@ -5247,10 +6022,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5247 6022
5248 cfs_rq->tg = tg; 6023 cfs_rq->tg = tg;
5249 cfs_rq->rq = rq; 6024 cfs_rq->rq = rq;
5250#ifdef CONFIG_SMP
5251 /* allow initial update_cfs_load() to truncate */
5252 cfs_rq->load_stamp = 1;
5253#endif
5254 init_cfs_rq_runtime(cfs_rq); 6025 init_cfs_rq_runtime(cfs_rq);
5255 6026
5256 tg->cfs_rq[cpu] = cfs_rq; 6027 tg->cfs_rq[cpu] = cfs_rq;
@@ -5352,7 +6123,9 @@ const struct sched_class fair_sched_class = {
5352 6123
5353#ifdef CONFIG_SMP 6124#ifdef CONFIG_SMP
5354 .select_task_rq = select_task_rq_fair, 6125 .select_task_rq = select_task_rq_fair,
5355 6126#ifdef CONFIG_FAIR_GROUP_SCHED
6127 .migrate_task_rq = migrate_task_rq_fair,
6128#endif
5356 .rq_online = rq_online_fair, 6129 .rq_online = rq_online_fair,
5357 .rq_offline = rq_offline_fair, 6130 .rq_offline = rq_offline_fair,
5358 6131
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index de00a486c5c6..1ad1d2b5395f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
12SCHED_FEAT(START_DEBIT, true) 12SCHED_FEAT(START_DEBIT, true)
13 13
14/* 14/*
15 * Based on load and program behaviour, see if it makes sense to place
16 * a newly woken task on the same cpu as the task that woke it --
17 * improve cache locality. Typically used with SYNC wakeups as
18 * generated by pipes and the like, see also SYNC_WAKEUPS.
19 */
20SCHED_FEAT(AFFINE_WAKEUPS, true)
21
22/*
23 * Prefer to schedule the task we woke last (assuming it failed 15 * Prefer to schedule the task we woke last (assuming it failed
24 * wakeup-preemption), since it's likely going to consume data we 16 * wakeup-preemption), since it's likely going to consume data we
25 * touched, which increases cache locality. 17 * touched, which increases cache locality.
@@ -40,9 +32,14 @@ SCHED_FEAT(LAST_BUDDY, true)
40SCHED_FEAT(CACHE_HOT_BUDDY, true) 32SCHED_FEAT(CACHE_HOT_BUDDY, true)
41 33
42/* 34/*
35 * Allow wakeup-time preemption of the current task:
36 */
37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38
39/*
43 * Use arch dependent cpu power functions 40 * Use arch dependent cpu power functions
44 */ 41 */
45SCHED_FEAT(ARCH_POWER, false) 42SCHED_FEAT(ARCH_POWER, true)
46 43
47SCHED_FEAT(HRTICK, false) 44SCHED_FEAT(HRTICK, false)
48SCHED_FEAT(DOUBLE_TICK, false) 45SCHED_FEAT(DOUBLE_TICK, false)
@@ -69,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true)
69SCHED_FEAT(FORCE_SD_OVERLAP, false) 66SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, true) 67SCHED_FEAT(RT_RUNTIME_SHARE, true)
71SCHED_FEAT(LB_MIN, false) 68SCHED_FEAT(LB_MIN, false)
69
70/*
71 * Apply the automatic NUMA scheduling policy. Enabled automatically
72 * at runtime if running on a NUMA machine. Can be controlled via
73 * numa_balancing=. Allow PTE scanning to be forced on UMA machines
74 * for debugging the core machinery.
75 */
76#ifdef CONFIG_NUMA_BALANCING
77SCHED_FEAT(NUMA, false)
78SCHED_FEAT(NUMA_FORCE, false)
79#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e0b7ba9c040f..418feb01344e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq)
1632 if (!next_task) 1632 if (!next_task)
1633 return 0; 1633 return 0;
1634 1634
1635#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1636 if (unlikely(task_running(rq, next_task)))
1637 return 0;
1638#endif
1639
1640retry: 1635retry:
1641 if (unlikely(next_task == rq->curr)) { 1636 if (unlikely(next_task == rq->curr)) {
1642 WARN_ON(1); 1637 WARN_ON(1);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0848fa36c383..fc886441436a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,8 @@ struct task_group {
112 unsigned long shares; 112 unsigned long shares;
113 113
114 atomic_t load_weight; 114 atomic_t load_weight;
115 atomic64_t load_avg;
116 atomic_t runnable_avg;
115#endif 117#endif
116 118
117#ifdef CONFIG_RT_GROUP_SCHED 119#ifdef CONFIG_RT_GROUP_SCHED
@@ -222,22 +224,29 @@ struct cfs_rq {
222 unsigned int nr_spread_over; 224 unsigned int nr_spread_over;
223#endif 225#endif
224 226
227#ifdef CONFIG_SMP
228/*
 229 * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
 230 * may be removed once load-tracking is useful to applications beyond shares
 231 * distribution (e.g. load-balance).
232 */
225#ifdef CONFIG_FAIR_GROUP_SCHED 233#ifdef CONFIG_FAIR_GROUP_SCHED
226 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
227
228 /* 234 /*
229 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 235 * CFS Load tracking
230 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 236 * Under CFS, load is tracked on a per-entity basis and aggregated up.
231 * (like users, containers etc.) 237 * This allows for the description of both thread and group usage (in
232 * 238 * the FAIR_GROUP_SCHED case).
233 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
234 * list is used during load balance.
235 */ 239 */
236 int on_list; 240 u64 runnable_load_avg, blocked_load_avg;
237 struct list_head leaf_cfs_rq_list; 241 atomic64_t decay_counter, removed_load;
238 struct task_group *tg; /* group that "owns" this runqueue */ 242 u64 last_decay;
243#endif /* CONFIG_FAIR_GROUP_SCHED */
244/* These always depend on CONFIG_FAIR_GROUP_SCHED */
245#ifdef CONFIG_FAIR_GROUP_SCHED
246 u32 tg_runnable_contrib;
247 u64 tg_load_contrib;
248#endif /* CONFIG_FAIR_GROUP_SCHED */
239 249
240#ifdef CONFIG_SMP
241 /* 250 /*
242 * h_load = weight * f(tg) 251 * h_load = weight * f(tg)
243 * 252 *
@@ -245,26 +254,30 @@ struct cfs_rq {
245 * this group. 254 * this group.
246 */ 255 */
247 unsigned long h_load; 256 unsigned long h_load;
257#endif /* CONFIG_SMP */
258
259#ifdef CONFIG_FAIR_GROUP_SCHED
260 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
248 261
249 /* 262 /*
250 * Maintaining per-cpu shares distribution for group scheduling 263 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
264 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
265 * (like users, containers etc.)
251 * 266 *
252 * load_stamp is the last time we updated the load average 267 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
253 * load_last is the last time we updated the load average and saw load 268 * list is used during load balance.
254 * load_unacc_exec_time is currently unaccounted execution time
255 */ 269 */
256 u64 load_avg; 270 int on_list;
257 u64 load_period; 271 struct list_head leaf_cfs_rq_list;
258 u64 load_stamp, load_last, load_unacc_exec_time; 272 struct task_group *tg; /* group that "owns" this runqueue */
259 273
260 unsigned long load_contribution;
261#endif /* CONFIG_SMP */
262#ifdef CONFIG_CFS_BANDWIDTH 274#ifdef CONFIG_CFS_BANDWIDTH
263 int runtime_enabled; 275 int runtime_enabled;
264 u64 runtime_expires; 276 u64 runtime_expires;
265 s64 runtime_remaining; 277 s64 runtime_remaining;
266 278
267 u64 throttled_timestamp; 279 u64 throttled_clock, throttled_clock_task;
280 u64 throttled_clock_task_time;
268 int throttled, throttle_count; 281 int throttled, throttle_count;
269 struct list_head throttled_list; 282 struct list_head throttled_list;
270#endif /* CONFIG_CFS_BANDWIDTH */ 283#endif /* CONFIG_CFS_BANDWIDTH */
@@ -467,6 +480,8 @@ struct rq {
467#ifdef CONFIG_SMP 480#ifdef CONFIG_SMP
468 struct llist_head wake_list; 481 struct llist_head wake_list;
469#endif 482#endif
483
484 struct sched_avg avg;
470}; 485};
471 486
472static inline int cpu_of(struct rq *rq) 487static inline int cpu_of(struct rq *rq)
@@ -648,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
648#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 663#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
649#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 664#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
650 665
666#ifdef CONFIG_NUMA_BALANCING
667#define sched_feat_numa(x) sched_feat(x)
668#ifdef CONFIG_SCHED_DEBUG
669#define numabalancing_enabled sched_feat_numa(NUMA)
670#else
671extern bool numabalancing_enabled;
672#endif /* CONFIG_SCHED_DEBUG */
673#else
674#define sched_feat_numa(x) (0)
675#define numabalancing_enabled (0)
676#endif /* CONFIG_NUMA_BALANCING */
677
651static inline u64 global_rt_period(void) 678static inline u64 global_rt_period(void)
652{ 679{
653 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 680 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
@@ -737,11 +764,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
737 */ 764 */
738 next->on_cpu = 1; 765 next->on_cpu = 1;
739#endif 766#endif
740#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
741 raw_spin_unlock_irq(&rq->lock);
742#else
743 raw_spin_unlock(&rq->lock); 767 raw_spin_unlock(&rq->lock);
744#endif
745} 768}
746 769
747static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 770static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
@@ -755,9 +778,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
755 smp_wmb(); 778 smp_wmb();
756 prev->on_cpu = 0; 779 prev->on_cpu = 0;
757#endif 780#endif
758#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
759 local_irq_enable(); 781 local_irq_enable();
760#endif
761} 782}
762#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 783#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
763 784
@@ -891,6 +912,9 @@ struct cpuacct {
891 struct kernel_cpustat __percpu *cpustat; 912 struct kernel_cpustat __percpu *cpustat;
892}; 913};
893 914
915extern struct cgroup_subsys cpuacct_subsys;
916extern struct cpuacct root_cpuacct;
917
894/* return cpu accounting group corresponding to this container */ 918/* return cpu accounting group corresponding to this container */
895static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 919static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
896{ 920{
@@ -917,6 +941,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
917static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 941static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
918#endif 942#endif
919 943
944#ifdef CONFIG_PARAVIRT
945static inline u64 steal_ticks(u64 steal)
946{
947 if (unlikely(steal > NSEC_PER_SEC))
948 return div_u64(steal, TICK_NSEC);
949
950 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
951}
952#endif
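steal_ticks() converts an amount of stolen time, in nanoseconds, into scheduler ticks: the common case (less than a second of steal) uses the cheap iterative divide, while the rare large case falls back to div_u64(). For example, assuming HZ=1000 (TICK_NSEC of roughly 1,000,000 ns), 2.5 ms of steal accounts for 2 ticks with 0.5 ms left over. A user-space sketch of the fast path:

#include <stdio.h>

int main(void)
{
	unsigned long long tick_nsec = 1000000ULL;	/* assumes HZ=1000 */
	unsigned long long steal = 2500000ULL;		/* 2.5 ms stolen, hypothetical */
	unsigned long long ticks = 0;

	/* Same idea as __iter_div_u64_rem(): cheap for small quotients. */
	while (steal >= tick_nsec) {
		steal -= tick_nsec;
		ticks++;
	}
	printf("%llu ticks, %llu ns remaining\n", ticks, steal);	/* 2, 500000 */
	return 0;
}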
953
920static inline void inc_nr_running(struct rq *rq) 954static inline void inc_nr_running(struct rq *rq)
921{ 955{
922 rq->nr_running++; 956 rq->nr_running++;
@@ -1156,3 +1190,52 @@ enum rq_nohz_flag_bits {
1156 1190
1157#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 1191#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1158#endif 1192#endif
1193
1194#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1195
1196DECLARE_PER_CPU(u64, cpu_hardirq_time);
1197DECLARE_PER_CPU(u64, cpu_softirq_time);
1198
1199#ifndef CONFIG_64BIT
1200DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1201
1202static inline void irq_time_write_begin(void)
1203{
1204 __this_cpu_inc(irq_time_seq.sequence);
1205 smp_wmb();
1206}
1207
1208static inline void irq_time_write_end(void)
1209{
1210 smp_wmb();
1211 __this_cpu_inc(irq_time_seq.sequence);
1212}
1213
1214static inline u64 irq_time_read(int cpu)
1215{
1216 u64 irq_time;
1217 unsigned seq;
1218
1219 do {
1220 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1221 irq_time = per_cpu(cpu_softirq_time, cpu) +
1222 per_cpu(cpu_hardirq_time, cpu);
1223 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1224
1225 return irq_time;
1226}
1227#else /* CONFIG_64BIT */
1228static inline void irq_time_write_begin(void)
1229{
1230}
1231
1232static inline void irq_time_write_end(void)
1233{
1234}
1235
1236static inline u64 irq_time_read(int cpu)
1237{
1238 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1239}
1240#endif /* CONFIG_64BIT */
1241#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
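On 32-bit, the two u64 counters cannot be read atomically, so irq_time_read() retries whenever a writer raced with it; the writer brackets its updates with irq_time_write_begin()/end(), leaving the sequence odd while an update is in flight. A simplified single-writer sketch of that seqcount pattern, in user-space C11 atomics with the finer memory-ordering details glossed over:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int seq;
static unsigned long long hardirq_ns, softirq_ns;

static void write_irq_time(unsigned long long h, unsigned long long s)
{
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);	/* odd: update in flight */
	hardirq_ns = h;
	softirq_ns = s;
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);	/* even: stable again */
}

static unsigned long long read_irq_time(void)
{
	unsigned int s1, s2;
	unsigned long long sum;

	do {
		s1 = atomic_load_explicit(&seq, memory_order_acquire);
		sum = hardirq_ns + softirq_ns;
		s2 = atomic_load_explicit(&seq, memory_order_acquire);
	} while (s1 != s2 || (s1 & 1));		/* retry if a write raced */
	return sum;
}

int main(void)
{
	write_irq_time(100, 250);
	printf("%llu\n", read_irq_time());	/* 350 */
	return 0;
}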
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ee376beedaf9..5af44b593770 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall)
396#ifdef CONFIG_SECCOMP_FILTER 396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: { 397 case SECCOMP_MODE_FILTER: {
398 int data; 398 int data;
399 struct pt_regs *regs = task_pt_regs(current);
399 ret = seccomp_run_filters(this_syscall); 400 ret = seccomp_run_filters(this_syscall);
400 data = ret & SECCOMP_RET_DATA; 401 data = ret & SECCOMP_RET_DATA;
401 ret &= SECCOMP_RET_ACTION; 402 ret &= SECCOMP_RET_ACTION;
402 switch (ret) { 403 switch (ret) {
403 case SECCOMP_RET_ERRNO: 404 case SECCOMP_RET_ERRNO:
 404 /* Set the low-order 16 bits as an errno. */ 405 /* Set the low-order 16 bits as an errno. */
405 syscall_set_return_value(current, task_pt_regs(current), 406 syscall_set_return_value(current, regs,
406 -data, 0); 407 -data, 0);
407 goto skip; 408 goto skip;
408 case SECCOMP_RET_TRAP: 409 case SECCOMP_RET_TRAP:
409 /* Show the handler the original registers. */ 410 /* Show the handler the original registers. */
410 syscall_rollback(current, task_pt_regs(current)); 411 syscall_rollback(current, regs);
411 /* Let the filter pass back 16 bits of data. */ 412 /* Let the filter pass back 16 bits of data. */
412 seccomp_send_sigsys(this_syscall, data); 413 seccomp_send_sigsys(this_syscall, data);
413 goto skip; 414 goto skip;
414 case SECCOMP_RET_TRACE: 415 case SECCOMP_RET_TRACE:
415 /* Skip these calls if there is no tracer. */ 416 /* Skip these calls if there is no tracer. */
416 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) 417 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
418 syscall_set_return_value(current, regs,
419 -ENOSYS, 0);
417 goto skip; 420 goto skip;
421 }
418 /* Allow the BPF to provide the event message */ 422 /* Allow the BPF to provide the event message */
419 ptrace_event(PTRACE_EVENT_SECCOMP, data); 423 ptrace_event(PTRACE_EVENT_SECCOMP, data);
420 /* 424 /*
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall)
425 */ 429 */
426 if (fatal_signal_pending(current)) 430 if (fatal_signal_pending(current))
427 break; 431 break;
432 if (syscall_get_nr(current, regs) < 0)
433 goto skip; /* Explicit request to skip. */
434
428 return 0; 435 return 0;
429 case SECCOMP_RET_ALLOW: 436 case SECCOMP_RET_ALLOW:
430 return 0; 437 return 0;
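The behavioural change in this hunk: a filter that returns SECCOMP_RET_TRACE while no ptracer is attached now has the skipped syscall fail with -ENOSYS, instead of leaving whatever value happened to be in the return register. A minimal user-space sketch of such a filter, with header locations and the PR_SET_NO_NEW_PRIVS/SECCOMP_MODE_FILTER constants assumed from the contemporary uapi headers:

#include <stddef.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter filter[] = {
		/* Load the syscall number. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* Hand getpid() to the tracer, allow everything else. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE | 42),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
	    prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		perror("prctl");

	/* With no tracer attached, this now returns -1 with errno ENOSYS. */
	printf("getpid() = %ld\n", (long)syscall(__NR_getpid));
	return 0;
}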
diff --git a/kernel/signal.c b/kernel/signal.c
index be4f856d52f8..7aaa51d8e5b8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -17,6 +17,7 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/tty.h> 18#include <linux/tty.h>
19#include <linux/binfmts.h> 19#include <linux/binfmts.h>
20#include <linux/coredump.h>
20#include <linux/security.h> 21#include <linux/security.h>
21#include <linux/syscalls.h> 22#include <linux/syscalls.h>
22#include <linux/ptrace.h> 23#include <linux/ptrace.h>
@@ -30,6 +31,7 @@
30#include <linux/nsproxy.h> 31#include <linux/nsproxy.h>
31#include <linux/user_namespace.h> 32#include <linux/user_namespace.h>
32#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/compat.h>
33#define CREATE_TRACE_POINTS 35#define CREATE_TRACE_POINTS
34#include <trace/events/signal.h> 36#include <trace/events/signal.h>
35 37
@@ -1158,8 +1160,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1158 return __send_signal(sig, info, t, group, from_ancestor_ns); 1160 return __send_signal(sig, info, t, group, from_ancestor_ns);
1159} 1161}
1160 1162
1161static void print_fatal_signal(struct pt_regs *regs, int signr) 1163static void print_fatal_signal(int signr)
1162{ 1164{
1165 struct pt_regs *regs = signal_pt_regs();
1163 printk("%s/%d: potentially unexpected fatal signal %d.\n", 1166 printk("%s/%d: potentially unexpected fatal signal %d.\n",
1164 current->comm, task_pid_nr(current), signr); 1167 current->comm, task_pid_nr(current), signr);
1165 1168
@@ -1751,7 +1754,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1751 * see comment in do_notify_parent() about the following 4 lines 1754 * see comment in do_notify_parent() about the following 4 lines
1752 */ 1755 */
1753 rcu_read_lock(); 1756 rcu_read_lock();
1754 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1757 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
1755 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1758 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
1756 rcu_read_unlock(); 1759 rcu_read_unlock();
1757 1760
@@ -1907,7 +1910,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1907 preempt_disable(); 1910 preempt_disable();
1908 read_unlock(&tasklist_lock); 1911 read_unlock(&tasklist_lock);
1909 preempt_enable_no_resched(); 1912 preempt_enable_no_resched();
1910 schedule(); 1913 freezable_schedule();
1911 } else { 1914 } else {
1912 /* 1915 /*
1913 * By the time we got the lock, our tracer went away. 1916 * By the time we got the lock, our tracer went away.
@@ -1929,13 +1932,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1929 } 1932 }
1930 1933
1931 /* 1934 /*
1932 * While in TASK_TRACED, we were considered "frozen enough".
1933 * Now that we woke up, it's crucial if we're supposed to be
1934 * frozen that we freeze now before running anything substantial.
1935 */
1936 try_to_freeze();
1937
1938 /*
1939 * We are back. Now reacquire the siglock before touching 1935 * We are back. Now reacquire the siglock before touching
1940 * last_siginfo, so that we are sure to have synchronized with 1936 * last_siginfo, so that we are sure to have synchronized with
1941 * any signal-sending on another CPU that wants to examine it. 1937 * any signal-sending on another CPU that wants to examine it.
@@ -1971,13 +1967,8 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1971void ptrace_notify(int exit_code) 1967void ptrace_notify(int exit_code)
1972{ 1968{
1973 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); 1969 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1974 if (unlikely(current->task_works)) { 1970 if (unlikely(current->task_works))
1975 if (test_and_clear_ti_thread_flag(current_thread_info(), 1971 task_work_run();
1976 TIF_NOTIFY_RESUME)) {
1977 smp_mb__after_clear_bit();
1978 task_work_run();
1979 }
1980 }
1981 1972
1982 spin_lock_irq(&current->sighand->siglock); 1973 spin_lock_irq(&current->sighand->siglock);
1983 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); 1974 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2096,7 +2087,7 @@ static bool do_signal_stop(int signr)
2096 } 2087 }
2097 2088
2098 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2089 /* Now we don't run again until woken by SIGCONT or SIGKILL */
2099 schedule(); 2090 freezable_schedule();
2100 return true; 2091 return true;
2101 } else { 2092 } else {
2102 /* 2093 /*
@@ -2142,10 +2133,9 @@ static void do_jobctl_trap(void)
2142 } 2133 }
2143} 2134}
2144 2135
2145static int ptrace_signal(int signr, siginfo_t *info, 2136static int ptrace_signal(int signr, siginfo_t *info)
2146 struct pt_regs *regs, void *cookie)
2147{ 2137{
2148 ptrace_signal_deliver(regs, cookie); 2138 ptrace_signal_deliver();
2149 /* 2139 /*
2150 * We do not check sig_kernel_stop(signr) but set this marker 2140 * We do not check sig_kernel_stop(signr) but set this marker
2151 * unconditionally because we do not know whether debugger will 2141 * unconditionally because we do not know whether debugger will
@@ -2198,26 +2188,20 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2198 struct signal_struct *signal = current->signal; 2188 struct signal_struct *signal = current->signal;
2199 int signr; 2189 int signr;
2200 2190
2201 if (unlikely(current->task_works)) { 2191 if (unlikely(current->task_works))
2202 if (test_and_clear_ti_thread_flag(current_thread_info(), 2192 task_work_run();
2203 TIF_NOTIFY_RESUME)) {
2204 smp_mb__after_clear_bit();
2205 task_work_run();
2206 }
2207 }
2208 2193
2209 if (unlikely(uprobe_deny_signal())) 2194 if (unlikely(uprobe_deny_signal()))
2210 return 0; 2195 return 0;
2211 2196
2212relock:
2213 /* 2197 /*
2214 * We'll jump back here after any time we were stopped in TASK_STOPPED. 2198 * Do this once, we can't return to user-mode if freezing() == T.
2215 * While in TASK_STOPPED, we were considered "frozen enough". 2199 * do_signal_stop() and ptrace_stop() do freezable_schedule() and
2216 * Now that we woke up, it's crucial if we're supposed to be 2200 * thus do not need another check after return.
2217 * frozen that we freeze now before running anything substantial.
2218 */ 2201 */
2219 try_to_freeze(); 2202 try_to_freeze();
2220 2203
2204relock:
2221 spin_lock_irq(&sighand->siglock); 2205 spin_lock_irq(&sighand->siglock);
2222 /* 2206 /*
2223 * Every stopped thread goes here after wakeup. Check to see if 2207 * Every stopped thread goes here after wakeup. Check to see if
@@ -2274,8 +2258,7 @@ relock:
2274 break; /* will return 0 */ 2258 break; /* will return 0 */
2275 2259
2276 if (unlikely(current->ptrace) && signr != SIGKILL) { 2260 if (unlikely(current->ptrace) && signr != SIGKILL) {
2277 signr = ptrace_signal(signr, info, 2261 signr = ptrace_signal(signr, info);
2278 regs, cookie);
2279 if (!signr) 2262 if (!signr)
2280 continue; 2263 continue;
2281 } 2264 }
@@ -2360,7 +2343,7 @@ relock:
2360 2343
2361 if (sig_kernel_coredump(signr)) { 2344 if (sig_kernel_coredump(signr)) {
2362 if (print_fatal_signals) 2345 if (print_fatal_signals)
2363 print_fatal_signal(regs, info->si_signo); 2346 print_fatal_signal(info->si_signo);
2364 /* 2347 /*
2365 * If it was able to dump core, this kills all 2348 * If it was able to dump core, this kills all
2366 * other threads in the group and synchronizes with 2349 * other threads in the group and synchronizes with
@@ -2369,7 +2352,7 @@ relock:
2369 * first and our do_group_exit call below will use 2352 * first and our do_group_exit call below will use
2370 * that value and ignore the one we pass it. 2353 * that value and ignore the one we pass it.
2371 */ 2354 */
2372 do_coredump(info->si_signo, info->si_signo, regs); 2355 do_coredump(info);
2373 } 2356 }
2374 2357
2375 /* 2358 /*
@@ -3112,6 +3095,79 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
3112out: 3095out:
3113 return error; 3096 return error;
3114} 3097}
3098#ifdef CONFIG_GENERIC_SIGALTSTACK
3099SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
3100{
3101 return do_sigaltstack(uss, uoss, current_user_stack_pointer());
3102}
3103#endif
3104
3105int restore_altstack(const stack_t __user *uss)
3106{
3107 int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
3108 /* squash all but EFAULT for now */
3109 return err == -EFAULT ? err : 0;
3110}
3111
3112int __save_altstack(stack_t __user *uss, unsigned long sp)
3113{
3114 struct task_struct *t = current;
3115 return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
3116 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3117 __put_user(t->sas_ss_size, &uss->ss_size);
3118}
3119
3120#ifdef CONFIG_COMPAT
3121#ifdef CONFIG_GENERIC_SIGALTSTACK
3122asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr,
3123 compat_stack_t __user *uoss_ptr)
3124{
3125 stack_t uss, uoss;
3126 int ret;
3127 mm_segment_t seg;
3128
3129 if (uss_ptr) {
3130 compat_stack_t uss32;
3131
3132 memset(&uss, 0, sizeof(stack_t));
3133 if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
3134 return -EFAULT;
3135 uss.ss_sp = compat_ptr(uss32.ss_sp);
3136 uss.ss_flags = uss32.ss_flags;
3137 uss.ss_size = uss32.ss_size;
3138 }
3139 seg = get_fs();
3140 set_fs(KERNEL_DS);
3141 ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
3142 (stack_t __force __user *) &uoss,
3143 compat_user_stack_pointer());
3144 set_fs(seg);
3145 if (ret >= 0 && uoss_ptr) {
3146 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
3147 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
3148 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
3149 __put_user(uoss.ss_size, &uoss_ptr->ss_size))
3150 ret = -EFAULT;
3151 }
3152 return ret;
3153}
3154
3155int compat_restore_altstack(const compat_stack_t __user *uss)
3156{
3157 int err = compat_sys_sigaltstack(uss, NULL);
3158 /* squash all but -EFAULT for now */
3159 return err == -EFAULT ? err : 0;
3160}
3161
3162int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3163{
3164 struct task_struct *t = current;
3165 return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
3166 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3167 __put_user(t->sas_ss_size, &uss->ss_size);
3168}
3169#endif
3170#endif
3115 3171
3116#ifdef __ARCH_WANT_SYS_SIGPENDING 3172#ifdef __ARCH_WANT_SYS_SIGPENDING
3117 3173
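The new restore_altstack()/__save_altstack() helpers above are meant to be called from per-architecture signal-frame code. A minimal sketch of such a caller, assuming a hypothetical arch whose struct rt_sigframe embeds a ucontext and assuming the helpers are declared in <linux/signal.h> as in this series (frame layout, names and error handling here are illustrative, not part of the patch):

#include <linux/signal.h>
#include <linux/uaccess.h>
#include <asm/ucontext.h>

/* Hypothetical arch frame; only the ucontext member matters here. */
struct rt_sigframe {
        struct ucontext uc;
        /* siginfo, return trampoline, saved registers, ... */
};

/* Frame-setup path: record the current altstack in the user ucontext. */
static int demo_setup_altstack(struct rt_sigframe __user *frame,
                               unsigned long user_sp)
{
        return __save_altstack(&frame->uc.uc_stack, user_sp);
}

/* rt_sigreturn path: re-establish the altstack saved at delivery time.
 * restore_altstack() squashes everything except -EFAULT, so a failure
 * here means the frame itself was unreadable. */
static int demo_restore_altstack(struct rt_sigframe __user *frame)
{
        return restore_altstack(&frame->uc.uc_stack);
}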
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 98f60c5caa1b..d6c5fc054242 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -1,14 +1,22 @@
1/* 1/*
2 * Common SMP CPU bringup/teardown functions 2 * Common SMP CPU bringup/teardown functions
3 */ 3 */
4#include <linux/cpu.h>
4#include <linux/err.h> 5#include <linux/err.h>
5#include <linux/smp.h> 6#include <linux/smp.h>
6#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/list.h>
9#include <linux/slab.h>
7#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/export.h>
8#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/kthread.h>
14#include <linux/smpboot.h>
9 15
10#include "smpboot.h" 16#include "smpboot.h"
11 17
18#ifdef CONFIG_SMP
19
12#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD 20#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
13/* 21/*
14 * For the hotplug case we keep the task structs around and reuse 22 * For the hotplug case we keep the task structs around and reuse
@@ -65,3 +73,228 @@ void __init idle_threads_init(void)
65 } 73 }
66} 74}
67#endif 75#endif
76
77#endif /* #ifdef CONFIG_SMP */
78
79static LIST_HEAD(hotplug_threads);
80static DEFINE_MUTEX(smpboot_threads_lock);
81
82struct smpboot_thread_data {
83 unsigned int cpu;
84 unsigned int status;
85 struct smp_hotplug_thread *ht;
86};
87
88enum {
89 HP_THREAD_NONE = 0,
90 HP_THREAD_ACTIVE,
91 HP_THREAD_PARKED,
92};
93
94/**
95 * smpboot_thread_fn - percpu hotplug thread loop function
96 * @data: thread data pointer
97 *
98 * Checks for thread stop and park conditions. Calls the necessary
99 * setup, cleanup, park and unpark functions for the registered
100 * thread.
101 *
102 * Returns 1 when the thread should exit, 0 otherwise.
103 */
104static int smpboot_thread_fn(void *data)
105{
106 struct smpboot_thread_data *td = data;
107 struct smp_hotplug_thread *ht = td->ht;
108
109 while (1) {
110 set_current_state(TASK_INTERRUPTIBLE);
111 preempt_disable();
112 if (kthread_should_stop()) {
113 set_current_state(TASK_RUNNING);
114 preempt_enable();
115 if (ht->cleanup)
116 ht->cleanup(td->cpu, cpu_online(td->cpu));
117 kfree(td);
118 return 0;
119 }
120
121 if (kthread_should_park()) {
122 __set_current_state(TASK_RUNNING);
123 preempt_enable();
124 if (ht->park && td->status == HP_THREAD_ACTIVE) {
125 BUG_ON(td->cpu != smp_processor_id());
126 ht->park(td->cpu);
127 td->status = HP_THREAD_PARKED;
128 }
129 kthread_parkme();
130 /* We might have been woken for stop */
131 continue;
132 }
133
134 BUG_ON(td->cpu != smp_processor_id());
135
136 /* Check for state change setup */
137 switch (td->status) {
138 case HP_THREAD_NONE:
139 preempt_enable();
140 if (ht->setup)
141 ht->setup(td->cpu);
142 td->status = HP_THREAD_ACTIVE;
143 preempt_disable();
144 break;
145 case HP_THREAD_PARKED:
146 preempt_enable();
147 if (ht->unpark)
148 ht->unpark(td->cpu);
149 td->status = HP_THREAD_ACTIVE;
150 preempt_disable();
151 break;
152 }
153
154 if (!ht->thread_should_run(td->cpu)) {
155 preempt_enable();
156 schedule();
157 } else {
158 set_current_state(TASK_RUNNING);
159 preempt_enable();
160 ht->thread_fn(td->cpu);
161 }
162 }
163}
164
165static int
166__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
167{
168 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
169 struct smpboot_thread_data *td;
170
171 if (tsk)
172 return 0;
173
174 td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
175 if (!td)
176 return -ENOMEM;
177 td->cpu = cpu;
178 td->ht = ht;
179
180 tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
181 ht->thread_comm);
182 if (IS_ERR(tsk)) {
183 kfree(td);
184 return PTR_ERR(tsk);
185 }
186
187 get_task_struct(tsk);
188 *per_cpu_ptr(ht->store, cpu) = tsk;
189 return 0;
190}
191
192int smpboot_create_threads(unsigned int cpu)
193{
194 struct smp_hotplug_thread *cur;
195 int ret = 0;
196
197 mutex_lock(&smpboot_threads_lock);
198 list_for_each_entry(cur, &hotplug_threads, list) {
199 ret = __smpboot_create_thread(cur, cpu);
200 if (ret)
201 break;
202 }
203 mutex_unlock(&smpboot_threads_lock);
204 return ret;
205}
206
207static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
208{
209 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
210
211 kthread_unpark(tsk);
212}
213
214void smpboot_unpark_threads(unsigned int cpu)
215{
216 struct smp_hotplug_thread *cur;
217
218 mutex_lock(&smpboot_threads_lock);
219 list_for_each_entry(cur, &hotplug_threads, list)
220 smpboot_unpark_thread(cur, cpu);
221 mutex_unlock(&smpboot_threads_lock);
222}
223
224static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
225{
226 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
227
228 if (tsk)
229 kthread_park(tsk);
230}
231
232void smpboot_park_threads(unsigned int cpu)
233{
234 struct smp_hotplug_thread *cur;
235
236 mutex_lock(&smpboot_threads_lock);
237 list_for_each_entry_reverse(cur, &hotplug_threads, list)
238 smpboot_park_thread(cur, cpu);
239 mutex_unlock(&smpboot_threads_lock);
240}
241
242static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
243{
244 unsigned int cpu;
245
 246 /* We also need to destroy the parked threads of offline cpus */
247 for_each_possible_cpu(cpu) {
248 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
249
250 if (tsk) {
251 kthread_stop(tsk);
252 put_task_struct(tsk);
253 *per_cpu_ptr(ht->store, cpu) = NULL;
254 }
255 }
256}
257
258/**
259 * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
260 * @plug_thread: Hotplug thread descriptor
261 *
262 * Creates and starts the threads on all online cpus.
263 */
264int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
265{
266 unsigned int cpu;
267 int ret = 0;
268
269 mutex_lock(&smpboot_threads_lock);
270 for_each_online_cpu(cpu) {
271 ret = __smpboot_create_thread(plug_thread, cpu);
272 if (ret) {
273 smpboot_destroy_threads(plug_thread);
274 goto out;
275 }
276 smpboot_unpark_thread(plug_thread, cpu);
277 }
278 list_add(&plug_thread->list, &hotplug_threads);
279out:
280 mutex_unlock(&smpboot_threads_lock);
281 return ret;
282}
283EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
284
285/**
286 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
287 * @plug_thread: Hotplug thread descriptor
288 *
289 * Stops all threads on all possible cpus.
290 */
291void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
292{
293 get_online_cpus();
294 mutex_lock(&smpboot_threads_lock);
295 list_del(&plug_thread->list);
296 smpboot_destroy_threads(plug_thread);
297 mutex_unlock(&smpboot_threads_lock);
298 put_online_cpus();
299}
300EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
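The ksoftirqd conversion in kernel/softirq.c further down in this patch is the first in-tree user of this interface; a stripped-down sketch of what a client looks like (the "demo" names below are invented for illustration, not part of the patch):

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, demo_thread);

/* Called with preemption disabled; return non-zero when thread_fn
 * should be invoked instead of going back to sleep. */
static int demo_thread_should_run(unsigned int cpu)
{
        return 0;               /* no per-cpu work pending in this sketch */
}

/* Invoked from the per-cpu kthread whenever thread_should_run() said so. */
static void demo_thread_fn(unsigned int cpu)
{
        /* process one batch of per-cpu work, then return to the loop */
}

static struct smp_hotplug_thread demo_threads = {
        .store                  = &demo_thread,
        .thread_should_run      = demo_thread_should_run,
        .thread_fn              = demo_thread_fn,
        .thread_comm            = "demo/%u",
};

static int __init demo_init(void)
{
        /* Spawns and unparks a "demo/N" kthread on every online cpu;
         * the hotplug glue above parks/unparks them as cpus go down/up. */
        return smpboot_register_percpu_thread(&demo_threads);
}
early_initcall(demo_init);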
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 6ef9433e1c70..72415a0eb955 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_cpu(void) { }
13static inline void idle_threads_init(void) { } 13static inline void idle_threads_init(void) { }
14#endif 14#endif
15 15
16int smpboot_create_threads(unsigned int cpu);
17void smpboot_park_threads(unsigned int cpu);
18void smpboot_unpark_threads(unsigned int cpu);
19
16#endif 20#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b73e681df09e..ed567babe789 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/smp.h> 25#include <linux/smp.h>
26#include <linux/smpboot.h>
26#include <linux/tick.h> 27#include <linux/tick.h>
27 28
28#define CREATE_TRACE_POINTS 29#define CREATE_TRACE_POINTS
@@ -220,7 +221,7 @@ asmlinkage void __do_softirq(void)
220 current->flags &= ~PF_MEMALLOC; 221 current->flags &= ~PF_MEMALLOC;
221 222
222 pending = local_softirq_pending(); 223 pending = local_softirq_pending();
223 account_system_vtime(current); 224 vtime_account_irq_enter(current);
224 225
225 __local_bh_disable((unsigned long)__builtin_return_address(0), 226 __local_bh_disable((unsigned long)__builtin_return_address(0),
226 SOFTIRQ_OFFSET); 227 SOFTIRQ_OFFSET);
@@ -271,7 +272,7 @@ restart:
271 272
272 lockdep_softirq_exit(); 273 lockdep_softirq_exit();
273 274
274 account_system_vtime(current); 275 vtime_account_irq_exit(current);
275 __local_bh_enable(SOFTIRQ_OFFSET); 276 __local_bh_enable(SOFTIRQ_OFFSET);
276 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 277 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
277} 278}
@@ -340,7 +341,7 @@ static inline void invoke_softirq(void)
340 */ 341 */
341void irq_exit(void) 342void irq_exit(void)
342{ 343{
343 account_system_vtime(current); 344 vtime_account_irq_exit(current);
344 trace_hardirq_exit(); 345 trace_hardirq_exit();
345 sub_preempt_count(IRQ_EXIT_OFFSET); 346 sub_preempt_count(IRQ_EXIT_OFFSET);
346 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
@@ -742,49 +743,22 @@ void __init softirq_init(void)
742 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 743 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
743} 744}
744 745
745static int run_ksoftirqd(void * __bind_cpu) 746static int ksoftirqd_should_run(unsigned int cpu)
746{ 747{
747 set_current_state(TASK_INTERRUPTIBLE); 748 return local_softirq_pending();
748 749}
749 while (!kthread_should_stop()) {
750 preempt_disable();
751 if (!local_softirq_pending()) {
752 schedule_preempt_disabled();
753 }
754
755 __set_current_state(TASK_RUNNING);
756
757 while (local_softirq_pending()) {
758 /* Preempt disable stops cpu going offline.
759 If already offline, we'll be on wrong CPU:
760 don't process */
761 if (cpu_is_offline((long)__bind_cpu))
762 goto wait_to_die;
763 local_irq_disable();
764 if (local_softirq_pending())
765 __do_softirq();
766 local_irq_enable();
767 sched_preempt_enable_no_resched();
768 cond_resched();
769 preempt_disable();
770 rcu_note_context_switch((long)__bind_cpu);
771 }
772 preempt_enable();
773 set_current_state(TASK_INTERRUPTIBLE);
774 }
775 __set_current_state(TASK_RUNNING);
776 return 0;
777 750
778wait_to_die: 751static void run_ksoftirqd(unsigned int cpu)
779 preempt_enable(); 752{
780 /* Wait for kthread_stop */ 753 local_irq_disable();
781 set_current_state(TASK_INTERRUPTIBLE); 754 if (local_softirq_pending()) {
782 while (!kthread_should_stop()) { 755 __do_softirq();
783 schedule(); 756 rcu_note_context_switch(cpu);
784 set_current_state(TASK_INTERRUPTIBLE); 757 local_irq_enable();
758 cond_resched();
759 return;
785 } 760 }
786 __set_current_state(TASK_RUNNING); 761 local_irq_enable();
787 return 0;
788} 762}
789 763
790#ifdef CONFIG_HOTPLUG_CPU 764#ifdef CONFIG_HOTPLUG_CPU
@@ -850,50 +824,14 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
850 unsigned long action, 824 unsigned long action,
851 void *hcpu) 825 void *hcpu)
852{ 826{
853 int hotcpu = (unsigned long)hcpu;
854 struct task_struct *p;
855
856 switch (action) { 827 switch (action) {
857 case CPU_UP_PREPARE:
858 case CPU_UP_PREPARE_FROZEN:
859 p = kthread_create_on_node(run_ksoftirqd,
860 hcpu,
861 cpu_to_node(hotcpu),
862 "ksoftirqd/%d", hotcpu);
863 if (IS_ERR(p)) {
864 printk("ksoftirqd for %i failed\n", hotcpu);
865 return notifier_from_errno(PTR_ERR(p));
866 }
867 kthread_bind(p, hotcpu);
868 per_cpu(ksoftirqd, hotcpu) = p;
869 break;
870 case CPU_ONLINE:
871 case CPU_ONLINE_FROZEN:
872 wake_up_process(per_cpu(ksoftirqd, hotcpu));
873 break;
874#ifdef CONFIG_HOTPLUG_CPU 828#ifdef CONFIG_HOTPLUG_CPU
875 case CPU_UP_CANCELED:
876 case CPU_UP_CANCELED_FROZEN:
877 if (!per_cpu(ksoftirqd, hotcpu))
878 break;
879 /* Unbind so it can run. Fall thru. */
880 kthread_bind(per_cpu(ksoftirqd, hotcpu),
881 cpumask_any(cpu_online_mask));
882 case CPU_DEAD: 829 case CPU_DEAD:
883 case CPU_DEAD_FROZEN: { 830 case CPU_DEAD_FROZEN:
884 static const struct sched_param param = { 831 takeover_tasklets((unsigned long)hcpu);
885 .sched_priority = MAX_RT_PRIO-1
886 };
887
888 p = per_cpu(ksoftirqd, hotcpu);
889 per_cpu(ksoftirqd, hotcpu) = NULL;
890 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
891 kthread_stop(p);
892 takeover_tasklets(hotcpu);
893 break; 832 break;
894 }
895#endif /* CONFIG_HOTPLUG_CPU */ 833#endif /* CONFIG_HOTPLUG_CPU */
896 } 834 }
897 return NOTIFY_OK; 835 return NOTIFY_OK;
898} 836}
899 837
@@ -901,14 +839,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
901 .notifier_call = cpu_callback 839 .notifier_call = cpu_callback
902}; 840};
903 841
842static struct smp_hotplug_thread softirq_threads = {
843 .store = &ksoftirqd,
844 .thread_should_run = ksoftirqd_should_run,
845 .thread_fn = run_ksoftirqd,
846 .thread_comm = "ksoftirqd/%u",
847};
848
904static __init int spawn_ksoftirqd(void) 849static __init int spawn_ksoftirqd(void)
905{ 850{
906 void *cpu = (void *)(long)smp_processor_id();
907 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
908
909 BUG_ON(err != NOTIFY_OK);
910 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
911 register_cpu_notifier(&cpu_nfb); 851 register_cpu_notifier(&cpu_nfb);
852
853 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
854
912 return 0; 855 return 0;
913} 856}
914early_initcall(spawn_ksoftirqd); 857early_initcall(spawn_ksoftirqd);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2095be3318d5..2b859828cdc3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,8 +16,10 @@
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
19 * 20 *
20 * Author: Paul McKenney <paulmck@us.ibm.com> 21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
21 * 23 *
22 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
@@ -34,6 +36,10 @@
34#include <linux/delay.h> 36#include <linux/delay.h>
35#include <linux/srcu.h> 37#include <linux/srcu.h>
36 38
39#include <trace/events/rcu.h>
40
41#include "rcu.h"
42
37/* 43/*
38 * Initialize an rcu_batch structure to empty. 44 * Initialize an rcu_batch structure to empty.
39 */ 45 */
@@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
92 } 98 }
93} 99}
94 100
95/* single-thread state-machine */
96static void process_srcu(struct work_struct *work);
97
98static int init_srcu_struct_fields(struct srcu_struct *sp) 101static int init_srcu_struct_fields(struct srcu_struct *sp)
99{ 102{
100 sp->completed = 0; 103 sp->completed = 0;
@@ -379,7 +382,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
379 rcu_batch_queue(&sp->batch_queue, head); 382 rcu_batch_queue(&sp->batch_queue, head);
380 if (!sp->running) { 383 if (!sp->running) {
381 sp->running = true; 384 sp->running = true;
382 queue_delayed_work(system_nrt_wq, &sp->work, 0); 385 schedule_delayed_work(&sp->work, 0);
383 } 386 }
384 spin_unlock_irqrestore(&sp->queue_lock, flags); 387 spin_unlock_irqrestore(&sp->queue_lock, flags);
385} 388}
@@ -464,7 +467,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
464 */ 467 */
465void synchronize_srcu(struct srcu_struct *sp) 468void synchronize_srcu(struct srcu_struct *sp)
466{ 469{
467 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); 470 __synchronize_srcu(sp, rcu_expedited
471 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
472 : SYNCHRONIZE_SRCU_TRYCOUNT);
468} 473}
469EXPORT_SYMBOL_GPL(synchronize_srcu); 474EXPORT_SYMBOL_GPL(synchronize_srcu);
470 475
@@ -631,13 +636,13 @@ static void srcu_reschedule(struct srcu_struct *sp)
631 } 636 }
632 637
633 if (pending) 638 if (pending)
634 queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); 639 schedule_delayed_work(&sp->work, SRCU_INTERVAL);
635} 640}
636 641
637/* 642/*
638 * This is the work-queue function that handles SRCU grace periods. 643 * This is the work-queue function that handles SRCU grace periods.
639 */ 644 */
640static void process_srcu(struct work_struct *work) 645void process_srcu(struct work_struct *work)
641{ 646{
642 struct srcu_struct *sp; 647 struct srcu_struct *sp;
643 648
@@ -648,3 +653,4 @@ static void process_srcu(struct work_struct *work)
648 srcu_invoke_callbacks(sp); 653 srcu_invoke_callbacks(sp);
649 srcu_reschedule(sp); 654 srcu_reschedule(sp);
650} 655}
656EXPORT_SYMBOL_GPL(process_srcu);
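For reference, a minimal sketch of the read/update pattern the expedited-vs-normal choice above applies to. The "demo" names and the kfree()-based reclamation are illustrative, not from this patch, and init_srcu_struct(&demo_srcu) is assumed to have been called during setup:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

static struct srcu_struct demo_srcu;    /* init_srcu_struct() at boot */
static int __rcu *demo_data;

static int demo_read(void)
{
        int idx, val = 0;
        int *p;

        idx = srcu_read_lock(&demo_srcu);
        p = srcu_dereference(demo_data, &demo_srcu);
        if (p)
                val = *p;
        srcu_read_unlock(&demo_srcu, idx);
        return val;
}

static void demo_update(int *newp)
{
        int *old = rcu_dereference_protected(demo_data, 1);

        rcu_assign_pointer(demo_data, newp);
        /* Blocks for a full SRCU grace period; with the change above the
         * expedited variant is used when the global rcu_expedited is set. */
        synchronize_srcu(&demo_srcu);
        kfree(old);
}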
diff --git a/kernel/sys.c b/kernel/sys.c
index 241507f23eca..265b37690421 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
368void kernel_restart(char *cmd) 368void kernel_restart(char *cmd)
369{ 369{
370 kernel_restart_prepare(cmd); 370 kernel_restart_prepare(cmd);
371 disable_nonboot_cpus();
371 if (!cmd) 372 if (!cmd)
372 printk(KERN_EMERG "Restarting system.\n"); 373 printk(KERN_EMERG "Restarting system.\n");
373 else 374 else
@@ -1045,7 +1046,7 @@ void do_sys_times(struct tms *tms)
1045 cputime_t tgutime, tgstime, cutime, cstime; 1046 cputime_t tgutime, tgstime, cutime, cstime;
1046 1047
1047 spin_lock_irq(&current->sighand->siglock); 1048 spin_lock_irq(&current->sighand->siglock);
1048 thread_group_times(current, &tgutime, &tgstime); 1049 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
1049 cutime = current->signal->cutime; 1050 cutime = current->signal->cutime;
1050 cstime = current->signal->cstime; 1051 cstime = current->signal->cstime;
1051 spin_unlock_irq(&current->sighand->siglock); 1052 spin_unlock_irq(&current->sighand->siglock);
@@ -1264,15 +1265,16 @@ DECLARE_RWSEM(uts_sem);
1264 * Work around broken programs that cannot handle "Linux 3.0". 1265 * Work around broken programs that cannot handle "Linux 3.0".
1265 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 1266 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1266 */ 1267 */
1267static int override_release(char __user *release, int len) 1268static int override_release(char __user *release, size_t len)
1268{ 1269{
1269 int ret = 0; 1270 int ret = 0;
1270 char buf[65];
1271 1271
1272 if (current->personality & UNAME26) { 1272 if (current->personality & UNAME26) {
1273 char *rest = UTS_RELEASE; 1273 const char *rest = UTS_RELEASE;
1274 char buf[65] = { 0 };
1274 int ndots = 0; 1275 int ndots = 0;
1275 unsigned v; 1276 unsigned v;
1277 size_t copy;
1276 1278
1277 while (*rest) { 1279 while (*rest) {
1278 if (*rest == '.' && ++ndots >= 3) 1280 if (*rest == '.' && ++ndots >= 3)
@@ -1282,8 +1284,9 @@ static int override_release(char __user *release, int len)
1282 rest++; 1284 rest++;
1283 } 1285 }
1284 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; 1286 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
1285 snprintf(buf, len, "2.6.%u%s", v, rest); 1287 copy = clamp_t(size_t, len, 1, sizeof(buf));
1286 ret = copy_to_user(release, buf, len); 1288 copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
1289 ret = copy_to_user(release, buf, copy + 1);
1287 } 1290 }
1288 return ret; 1291 return ret;
1289} 1292}
@@ -1701,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1701 utime = stime = 0; 1704 utime = stime = 0;
1702 1705
1703 if (who == RUSAGE_THREAD) { 1706 if (who == RUSAGE_THREAD) {
1704 task_times(current, &utime, &stime); 1707 task_cputime_adjusted(current, &utime, &stime);
1705 accumulate_thread_rusage(p, r); 1708 accumulate_thread_rusage(p, r);
1706 maxrss = p->signal->maxrss; 1709 maxrss = p->signal->maxrss;
1707 goto out; 1710 goto out;
@@ -1727,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1727 break; 1730 break;
1728 1731
1729 case RUSAGE_SELF: 1732 case RUSAGE_SELF:
1730 thread_group_times(p, &tgutime, &tgstime); 1733 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1731 utime += tgutime; 1734 utime += tgutime;
1732 stime += tgstime; 1735 stime += tgstime;
1733 r->ru_nvcsw += p->signal->nvcsw; 1736 r->ru_nvcsw += p->signal->nvcsw;
@@ -1788,15 +1791,15 @@ SYSCALL_DEFINE1(umask, int, mask)
1788#ifdef CONFIG_CHECKPOINT_RESTORE 1791#ifdef CONFIG_CHECKPOINT_RESTORE
1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1792static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1790{ 1793{
1791 struct file *exe_file; 1794 struct fd exe;
1792 struct dentry *dentry; 1795 struct dentry *dentry;
1793 int err; 1796 int err;
1794 1797
1795 exe_file = fget(fd); 1798 exe = fdget(fd);
1796 if (!exe_file) 1799 if (!exe.file)
1797 return -EBADF; 1800 return -EBADF;
1798 1801
1799 dentry = exe_file->f_path.dentry; 1802 dentry = exe.file->f_path.dentry;
1800 1803
1801 /* 1804 /*
1802 * Because the original mm->exe_file points to executable file, make 1805 * Because the original mm->exe_file points to executable file, make
@@ -1805,7 +1808,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1805 */ 1808 */
1806 err = -EACCES; 1809 err = -EACCES;
1807 if (!S_ISREG(dentry->d_inode->i_mode) || 1810 if (!S_ISREG(dentry->d_inode->i_mode) ||
1808 exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) 1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1809 goto exit; 1812 goto exit;
1810 1813
1811 err = inode_permission(dentry->d_inode, MAY_EXEC); 1814 err = inode_permission(dentry->d_inode, MAY_EXEC);
@@ -1839,12 +1842,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1839 goto exit_unlock; 1842 goto exit_unlock;
1840 1843
1841 err = 0; 1844 err = 0;
1842 set_mm_exe_file(mm, exe_file); 1845 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
1843exit_unlock: 1846exit_unlock:
1844 up_write(&mm->mmap_sem); 1847 up_write(&mm->mmap_sem);
1845 1848
1846exit: 1849exit:
1847 fput(exe_file); 1850 fdput(exe);
1848 return err; 1851 return err;
1849} 1852}
1850 1853
@@ -2204,7 +2207,7 @@ static int __orderly_poweroff(void)
2204 return -ENOMEM; 2207 return -ENOMEM;
2205 } 2208 }
2206 2209
2207 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, 2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
2208 NULL, argv_cleanup, NULL); 2211 NULL, argv_cleanup, NULL);
2209 if (ret == -ENOMEM) 2212 if (ret == -ENOMEM)
2210 argv_free(argv); 2213 argv_free(argv);
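The fget()/fput() to fdget()/fdput() conversion above (and the matching one in kernel/taskstats.c further down) follows a common pattern; a minimal sketch with an invented helper name, mirroring the regular-file check done by prctl_set_mm_exe_file():

#include <linux/file.h>
#include <linux/fs.h>
#include <linux/stat.h>

static int demo_check_fd(unsigned int fd)
{
        struct fd f = fdget(fd);
        int err;

        if (!f.file)
                return -EBADF;

        err = S_ISREG(f.file->f_path.dentry->d_inode->i_mode) ? 0 : -EACCES;

        /* fdput() drops the reference only if fdget() actually took one. */
        fdput(f);
        return err;
}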
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index dbff751e4086..395084d4ce16 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapoff);
25cond_syscall(sys_kexec_load); 25cond_syscall(sys_kexec_load);
26cond_syscall(compat_sys_kexec_load); 26cond_syscall(compat_sys_kexec_load);
27cond_syscall(sys_init_module); 27cond_syscall(sys_init_module);
28cond_syscall(sys_finit_module);
28cond_syscall(sys_delete_module); 29cond_syscall(sys_delete_module);
29cond_syscall(sys_socketpair); 30cond_syscall(sys_socketpair);
30cond_syscall(sys_bind); 31cond_syscall(sys_bind);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 87174ef59161..c88878db491e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,10 +97,12 @@
97extern int sysctl_overcommit_memory; 97extern int sysctl_overcommit_memory;
98extern int sysctl_overcommit_ratio; 98extern int sysctl_overcommit_ratio;
99extern int max_threads; 99extern int max_threads;
100extern int core_uses_pid;
101extern int suid_dumpable; 100extern int suid_dumpable;
101#ifdef CONFIG_COREDUMP
102extern int core_uses_pid;
102extern char core_pattern[]; 103extern char core_pattern[];
103extern unsigned int core_pipe_limit; 104extern unsigned int core_pipe_limit;
105#endif
104extern int pid_max; 106extern int pid_max;
105extern int min_free_kbytes; 107extern int min_free_kbytes;
106extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
@@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
177 179
178static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, 180static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
179 void __user *buffer, size_t *lenp, loff_t *ppos); 181 void __user *buffer, size_t *lenp, loff_t *ppos);
182#ifdef CONFIG_COREDUMP
180static int proc_dostring_coredump(struct ctl_table *table, int write, 183static int proc_dostring_coredump(struct ctl_table *table, int write,
181 void __user *buffer, size_t *lenp, loff_t *ppos); 184 void __user *buffer, size_t *lenp, loff_t *ppos);
185#endif
182 186
183#ifdef CONFIG_MAGIC_SYSRQ 187#ifdef CONFIG_MAGIC_SYSRQ
184/* Note: sysrq code uses it's own private copy */ 188/* Note: sysrq code uses it's own private copy */
@@ -252,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
252static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 256static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
253static int min_wakeup_granularity_ns; /* 0 usecs */ 257static int min_wakeup_granularity_ns; /* 0 usecs */
254static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 258static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
259#ifdef CONFIG_SMP
255static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
256static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
257#endif 262#endif /* CONFIG_SMP */
263#endif /* CONFIG_SCHED_DEBUG */
258 264
259#ifdef CONFIG_COMPACTION 265#ifdef CONFIG_COMPACTION
260static int min_extfrag_threshold; 266static int min_extfrag_threshold;
@@ -297,6 +303,7 @@ static struct ctl_table kern_table[] = {
297 .extra1 = &min_wakeup_granularity_ns, 303 .extra1 = &min_wakeup_granularity_ns,
298 .extra2 = &max_wakeup_granularity_ns, 304 .extra2 = &max_wakeup_granularity_ns,
299 }, 305 },
306#ifdef CONFIG_SMP
300 { 307 {
301 .procname = "sched_tunable_scaling", 308 .procname = "sched_tunable_scaling",
302 .data = &sysctl_sched_tunable_scaling, 309 .data = &sysctl_sched_tunable_scaling,
@@ -307,7 +314,7 @@ static struct ctl_table kern_table[] = {
307 .extra2 = &max_sched_tunable_scaling, 314 .extra2 = &max_sched_tunable_scaling,
308 }, 315 },
309 { 316 {
310 .procname = "sched_migration_cost", 317 .procname = "sched_migration_cost_ns",
311 .data = &sysctl_sched_migration_cost, 318 .data = &sysctl_sched_migration_cost,
312 .maxlen = sizeof(unsigned int), 319 .maxlen = sizeof(unsigned int),
313 .mode = 0644, 320 .mode = 0644,
@@ -321,14 +328,14 @@ static struct ctl_table kern_table[] = {
321 .proc_handler = proc_dointvec, 328 .proc_handler = proc_dointvec,
322 }, 329 },
323 { 330 {
324 .procname = "sched_time_avg", 331 .procname = "sched_time_avg_ms",
325 .data = &sysctl_sched_time_avg, 332 .data = &sysctl_sched_time_avg,
326 .maxlen = sizeof(unsigned int), 333 .maxlen = sizeof(unsigned int),
327 .mode = 0644, 334 .mode = 0644,
328 .proc_handler = proc_dointvec, 335 .proc_handler = proc_dointvec,
329 }, 336 },
330 { 337 {
331 .procname = "sched_shares_window", 338 .procname = "sched_shares_window_ns",
332 .data = &sysctl_sched_shares_window, 339 .data = &sysctl_sched_shares_window,
333 .maxlen = sizeof(unsigned int), 340 .maxlen = sizeof(unsigned int),
334 .mode = 0644, 341 .mode = 0644,
@@ -343,7 +350,45 @@ static struct ctl_table kern_table[] = {
343 .extra1 = &zero, 350 .extra1 = &zero,
344 .extra2 = &one, 351 .extra2 = &one,
345 }, 352 },
346#endif 353#endif /* CONFIG_SMP */
354#ifdef CONFIG_NUMA_BALANCING
355 {
356 .procname = "numa_balancing_scan_delay_ms",
357 .data = &sysctl_numa_balancing_scan_delay,
358 .maxlen = sizeof(unsigned int),
359 .mode = 0644,
360 .proc_handler = proc_dointvec,
361 },
362 {
363 .procname = "numa_balancing_scan_period_min_ms",
364 .data = &sysctl_numa_balancing_scan_period_min,
365 .maxlen = sizeof(unsigned int),
366 .mode = 0644,
367 .proc_handler = proc_dointvec,
368 },
369 {
370 .procname = "numa_balancing_scan_period_reset",
371 .data = &sysctl_numa_balancing_scan_period_reset,
372 .maxlen = sizeof(unsigned int),
373 .mode = 0644,
374 .proc_handler = proc_dointvec,
375 },
376 {
377 .procname = "numa_balancing_scan_period_max_ms",
378 .data = &sysctl_numa_balancing_scan_period_max,
379 .maxlen = sizeof(unsigned int),
380 .mode = 0644,
381 .proc_handler = proc_dointvec,
382 },
383 {
384 .procname = "numa_balancing_scan_size_mb",
385 .data = &sysctl_numa_balancing_scan_size,
386 .maxlen = sizeof(unsigned int),
387 .mode = 0644,
388 .proc_handler = proc_dointvec,
389 },
390#endif /* CONFIG_NUMA_BALANCING */
391#endif /* CONFIG_SCHED_DEBUG */
347 { 392 {
348 .procname = "sched_rt_period_us", 393 .procname = "sched_rt_period_us",
349 .data = &sysctl_sched_rt_period, 394 .data = &sysctl_sched_rt_period,
@@ -404,6 +449,7 @@ static struct ctl_table kern_table[] = {
404 .mode = 0644, 449 .mode = 0644,
405 .proc_handler = proc_dointvec, 450 .proc_handler = proc_dointvec,
406 }, 451 },
452#ifdef CONFIG_COREDUMP
407 { 453 {
408 .procname = "core_uses_pid", 454 .procname = "core_uses_pid",
409 .data = &core_uses_pid, 455 .data = &core_uses_pid,
@@ -425,6 +471,7 @@ static struct ctl_table kern_table[] = {
425 .mode = 0644, 471 .mode = 0644,
426 .proc_handler = proc_dointvec, 472 .proc_handler = proc_dointvec,
427 }, 473 },
474#endif
428#ifdef CONFIG_PROC_SYSCTL 475#ifdef CONFIG_PROC_SYSCTL
429 { 476 {
430 .procname = "tainted", 477 .procname = "tainted",
@@ -559,7 +606,7 @@ static struct ctl_table kern_table[] = {
559 .extra2 = &one, 606 .extra2 = &one,
560 }, 607 },
561#endif 608#endif
562#ifdef CONFIG_HOTPLUG 609
563 { 610 {
564 .procname = "hotplug", 611 .procname = "hotplug",
565 .data = &uevent_helper, 612 .data = &uevent_helper,
@@ -567,7 +614,7 @@ static struct ctl_table kern_table[] = {
567 .mode = 0644, 614 .mode = 0644,
568 .proc_handler = proc_dostring, 615 .proc_handler = proc_dostring,
569 }, 616 },
570#endif 617
571#ifdef CONFIG_CHR_DEV_SG 618#ifdef CONFIG_CHR_DEV_SG
572 { 619 {
573 .procname = "sg-big-buff", 620 .procname = "sg-big-buff",
@@ -1543,8 +1590,7 @@ static struct ctl_table fs_table[] = {
1543}; 1590};
1544 1591
1545static struct ctl_table debug_table[] = { 1592static struct ctl_table debug_table[] = {
1546#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ 1593#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
1547 defined(CONFIG_S390) || defined(CONFIG_TILE)
1548 { 1594 {
1549 .procname = "exception-trace", 1595 .procname = "exception-trace",
1550 .data = &show_unhandled_signals, 1596 .data = &show_unhandled_signals,
@@ -2036,12 +2082,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2036 2082
2037static void validate_coredump_safety(void) 2083static void validate_coredump_safety(void)
2038{ 2084{
2085#ifdef CONFIG_COREDUMP
2039 if (suid_dumpable == SUID_DUMPABLE_SAFE && 2086 if (suid_dumpable == SUID_DUMPABLE_SAFE &&
2040 core_pattern[0] != '/' && core_pattern[0] != '|') { 2087 core_pattern[0] != '/' && core_pattern[0] != '|') {
2041 printk(KERN_WARNING "Unsafe core_pattern used with "\ 2088 printk(KERN_WARNING "Unsafe core_pattern used with "\
2042 "suid_dumpable=2. Pipe handler or fully qualified "\ 2089 "suid_dumpable=2. Pipe handler or fully qualified "\
2043 "core dump path required.\n"); 2090 "core dump path required.\n");
2044 } 2091 }
2092#endif
2045} 2093}
2046 2094
2047static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, 2095static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
@@ -2053,6 +2101,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
2053 return error; 2101 return error;
2054} 2102}
2055 2103
2104#ifdef CONFIG_COREDUMP
2056static int proc_dostring_coredump(struct ctl_table *table, int write, 2105static int proc_dostring_coredump(struct ctl_table *table, int write,
2057 void __user *buffer, size_t *lenp, loff_t *ppos) 2106 void __user *buffer, size_t *lenp, loff_t *ppos)
2058{ 2107{
@@ -2061,6 +2110,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
2061 validate_coredump_safety(); 2110 validate_coredump_safety();
2062 return error; 2111 return error;
2063} 2112}
2113#endif
2064 2114
2065static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2115static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2066 void __user *buffer, 2116 void __user *buffer,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 65bdcf198d4e..5a6384450501 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1344 goto out_putname; 1344 goto out_putname;
1345 } 1345 }
1346 1346
1347 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = task_active_pid_ns(current)->proc_mnt;
1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1349 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1350 if (IS_ERR(file)) 1350 if (IS_ERR(file))
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d320d44903bd..65bd3c92d6f3 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -2,26 +2,20 @@
2#include <linux/task_work.h> 2#include <linux/task_work.h>
3#include <linux/tracehook.h> 3#include <linux/tracehook.h>
4 4
5static struct callback_head work_exited; /* all we need is ->next == NULL */
6
5int 7int
6task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) 8task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
7{ 9{
8 struct callback_head *last, *first; 10 struct callback_head *head;
9 unsigned long flags;
10 11
11 /* 12 do {
12 * Not inserting the new work if the task has already passed 13 head = ACCESS_ONCE(task->task_works);
13 * exit_task_work() is the responisbility of callers. 14 if (unlikely(head == &work_exited))
14 */ 15 return -ESRCH;
15 raw_spin_lock_irqsave(&task->pi_lock, flags); 16 work->next = head;
16 last = task->task_works; 17 } while (cmpxchg(&task->task_works, head, work) != head);
17 first = last ? last->next : twork;
18 twork->next = first;
19 if (last)
20 last->next = twork;
21 task->task_works = twork;
22 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
23 18
24 /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */
25 if (notify) 19 if (notify)
26 set_notify_resume(task); 20 set_notify_resume(task);
27 return 0; 21 return 0;
@@ -30,52 +24,69 @@ task_work_add(struct task_struct *task, struct callback_head *twork, bool notify
30struct callback_head * 24struct callback_head *
31task_work_cancel(struct task_struct *task, task_work_func_t func) 25task_work_cancel(struct task_struct *task, task_work_func_t func)
32{ 26{
27 struct callback_head **pprev = &task->task_works;
28 struct callback_head *work = NULL;
33 unsigned long flags; 29 unsigned long flags;
34 struct callback_head *last, *res = NULL; 30 /*
35 31 * If cmpxchg() fails we continue without updating pprev.
32 * Either we raced with task_work_add() which added the
33 * new entry before this work, we will find it again. Or
34 * we raced with task_work_run(), *pprev == NULL/exited.
35 */
36 raw_spin_lock_irqsave(&task->pi_lock, flags); 36 raw_spin_lock_irqsave(&task->pi_lock, flags);
37 last = task->task_works; 37 while ((work = ACCESS_ONCE(*pprev))) {
38 if (last) { 38 read_barrier_depends();
39 struct callback_head *q = last, *p = q->next; 39 if (work->func != func)
40 while (1) { 40 pprev = &work->next;
41 if (p->func == func) { 41 else if (cmpxchg(pprev, work, work->next) == work)
42 q->next = p->next; 42 break;
43 if (p == last)
44 task->task_works = q == p ? NULL : q;
45 res = p;
46 break;
47 }
48 if (p == last)
49 break;
50 q = p;
51 p = q->next;
52 }
53 } 43 }
54 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 44 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
55 return res; 45
46 return work;
56} 47}
57 48
58void task_work_run(void) 49void task_work_run(void)
59{ 50{
60 struct task_struct *task = current; 51 struct task_struct *task = current;
61 struct callback_head *p, *q; 52 struct callback_head *work, *head, *next;
53
54 for (;;) {
55 /*
56 * work->func() can do task_work_add(), do not set
57 * work_exited unless the list is empty.
58 */
59 do {
60 work = ACCESS_ONCE(task->task_works);
61 head = !work && (task->flags & PF_EXITING) ?
62 &work_exited : NULL;
63 } while (cmpxchg(&task->task_works, work, head) != work);
62 64
63 while (1) { 65 if (!work)
64 raw_spin_lock_irq(&task->pi_lock); 66 break;
65 p = task->task_works; 67 /*
66 task->task_works = NULL; 68 * Synchronize with task_work_cancel(). It can't remove
67 raw_spin_unlock_irq(&task->pi_lock); 69 * the first entry == work, cmpxchg(task_works) should
70 * fail, but it can play with *work and other entries.
71 */
72 raw_spin_unlock_wait(&task->pi_lock);
73 smp_mb();
68 74
69 if (unlikely(!p)) 75 /* Reverse the list to run the works in fifo order */
70 return; 76 head = NULL;
77 do {
78 next = work->next;
79 work->next = head;
80 head = work;
81 work = next;
82 } while (work);
71 83
72 q = p->next; /* head */ 84 work = head;
73 p->next = NULL; /* cut it */ 85 do {
74 while (q) { 86 next = work->next;
75 p = q->next; 87 work->func(work);
76 q->func(q); 88 work = next;
77 q = p;
78 cond_resched(); 89 cond_resched();
79 } 90 } while (work);
80 } 91 }
81} 92}
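A minimal sketch of a caller of the reworked API (invented "demo" names, not part of the patch). The behavioural change visible to callers is the -ESRCH return once the target task has run its final task_work_run(); the caller is assumed to hold a reference on the task:

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/task_work.h>

struct demo_work {
        struct callback_head cb;
        int payload;
};

static void demo_func(struct callback_head *cb)
{
        struct demo_work *dw = container_of(cb, struct demo_work, cb);

        /* Runs from task_work_run() in the target task's context;
         * consume dw->payload here, then free the work item. */
        kfree(dw);
}

static int demo_queue(struct task_struct *task, int payload)
{
        struct demo_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL);

        if (!dw)
                return -ENOMEM;
        dw->payload = payload;
        init_task_work(&dw->cb, demo_func);
        /* Fails with -ESRCH once task_works has been set to work_exited. */
        if (task_work_add(task, &dw->cb, true)) {
                kfree(dw);
                return -ESRCH;
        }
        return 0;
}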
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d0a32796550f..145bb4d3bd4d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -27,6 +27,7 @@
27#include <linux/cgroup.h> 27#include <linux/cgroup.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <linux/pid_namespace.h>
30#include <net/genetlink.h> 31#include <net/genetlink.h>
31#include <linux/atomic.h> 32#include <linux/atomic.h>
32 33
@@ -174,7 +175,9 @@ static void send_cpu_listeners(struct sk_buff *skb,
174 up_write(&listeners->sem); 175 up_write(&listeners->sem);
175} 176}
176 177
177static void fill_stats(struct task_struct *tsk, struct taskstats *stats) 178static void fill_stats(struct user_namespace *user_ns,
179 struct pid_namespace *pid_ns,
180 struct task_struct *tsk, struct taskstats *stats)
178{ 181{
179 memset(stats, 0, sizeof(*stats)); 182 memset(stats, 0, sizeof(*stats));
180 /* 183 /*
@@ -190,7 +193,7 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
190 stats->version = TASKSTATS_VERSION; 193 stats->version = TASKSTATS_VERSION;
191 stats->nvcsw = tsk->nvcsw; 194 stats->nvcsw = tsk->nvcsw;
192 stats->nivcsw = tsk->nivcsw; 195 stats->nivcsw = tsk->nivcsw;
193 bacct_add_tsk(stats, tsk); 196 bacct_add_tsk(user_ns, pid_ns, stats, tsk);
194 197
195 /* fill in extended acct fields */ 198 /* fill in extended acct fields */
196 xacct_add_tsk(stats, tsk); 199 xacct_add_tsk(stats, tsk);
@@ -207,7 +210,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
207 rcu_read_unlock(); 210 rcu_read_unlock();
208 if (!tsk) 211 if (!tsk)
209 return -ESRCH; 212 return -ESRCH;
210 fill_stats(tsk, stats); 213 fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats);
211 put_task_struct(tsk); 214 put_task_struct(tsk);
212 return 0; 215 return 0;
213} 216}
@@ -291,6 +294,12 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
291 if (!cpumask_subset(mask, cpu_possible_mask)) 294 if (!cpumask_subset(mask, cpu_possible_mask))
292 return -EINVAL; 295 return -EINVAL;
293 296
297 if (current_user_ns() != &init_user_ns)
298 return -EINVAL;
299
300 if (task_active_pid_ns(current) != &init_pid_ns)
301 return -EINVAL;
302
294 if (isadd == REGISTER) { 303 if (isadd == REGISTER) {
295 for_each_cpu(cpu, mask) { 304 for_each_cpu(cpu, mask) {
296 s = kmalloc_node(sizeof(struct listener), 305 s = kmalloc_node(sizeof(struct listener),
@@ -415,16 +424,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
415 struct nlattr *na; 424 struct nlattr *na;
416 size_t size; 425 size_t size;
417 u32 fd; 426 u32 fd;
418 struct file *file; 427 struct fd f;
419 int fput_needed;
420 428
421 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 429 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
422 if (!na) 430 if (!na)
423 return -EINVAL; 431 return -EINVAL;
424 432
425 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 433 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
426 file = fget_light(fd, &fput_needed); 434 f = fdget(fd);
427 if (!file) 435 if (!f.file)
428 return 0; 436 return 0;
429 437
430 size = nla_total_size(sizeof(struct cgroupstats)); 438 size = nla_total_size(sizeof(struct cgroupstats));
@@ -437,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 445 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
438 sizeof(struct cgroupstats)); 446 sizeof(struct cgroupstats));
439 if (na == NULL) { 447 if (na == NULL) {
448 nlmsg_free(rep_skb);
440 rc = -EMSGSIZE; 449 rc = -EMSGSIZE;
441 goto err; 450 goto err;
442 } 451 }
@@ -444,7 +453,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
444 stats = nla_data(na); 453 stats = nla_data(na);
445 memset(stats, 0, sizeof(*stats)); 454 memset(stats, 0, sizeof(*stats));
446 455
447 rc = cgroupstats_build(stats, file->f_dentry); 456 rc = cgroupstats_build(stats, f.file->f_dentry);
448 if (rc < 0) { 457 if (rc < 0) {
449 nlmsg_free(rep_skb); 458 nlmsg_free(rep_skb);
450 goto err; 459 goto err;
@@ -453,7 +462,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
453 rc = send_reply(rep_skb, info); 462 rc = send_reply(rep_skb, info);
454 463
455err: 464err:
456 fput_light(file, fput_needed); 465 fdput(f);
457 return rc; 466 return rc;
458} 467}
459 468
@@ -467,7 +476,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info)
467 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 476 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
468 if (rc < 0) 477 if (rc < 0)
469 goto out; 478 goto out;
470 rc = add_del_listener(info->snd_pid, mask, REGISTER); 479 rc = add_del_listener(info->snd_portid, mask, REGISTER);
471out: 480out:
472 free_cpumask_var(mask); 481 free_cpumask_var(mask);
473 return rc; 482 return rc;
@@ -483,7 +492,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info)
483 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 492 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
484 if (rc < 0) 493 if (rc < 0)
485 goto out; 494 goto out;
486 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 495 rc = add_del_listener(info->snd_portid, mask, DEREGISTER);
487out: 496out:
488 free_cpumask_var(mask); 497 free_cpumask_var(mask);
489 return rc; 498 return rc;
@@ -631,11 +640,12 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
631 if (rc < 0) 640 if (rc < 0)
632 return; 641 return;
633 642
634 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); 643 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID,
644 task_pid_nr_ns(tsk, &init_pid_ns));
635 if (!stats) 645 if (!stats)
636 goto err; 646 goto err;
637 647
638 fill_stats(tsk, stats); 648 fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);
639 649
640 /* 650 /*
641 * Doesn't matter if tsk is the leader or the last group member leaving 651 * Doesn't matter if tsk is the leader or the last group member leaving
@@ -643,7 +653,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
643 if (!is_thread_group || !group_dead) 653 if (!is_thread_group || !group_dead)
644 goto send; 654 goto send;
645 655
646 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); 656 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID,
657 task_tgid_nr_ns(tsk, &init_pid_ns));
647 if (!stats) 658 if (!stats)
648 goto err; 659 goto err;
649 660
diff --git a/kernel/time.c b/kernel/time.c
index ba744cf80696..d226c6a3fd28 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -30,7 +30,7 @@
30#include <linux/export.h> 30#include <linux/export.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/clocksource.h> 33#include <linux/timekeeper_internal.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fd42bd452b75..8601f0db1261 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA
16config GENERIC_TIME_VSYSCALL 16config GENERIC_TIME_VSYSCALL
17 bool 17 bool
18 18
19# Timekeeping vsyscall support
20config GENERIC_TIME_VSYSCALL_OLD
21 bool
22
19# ktime_t scalar 64bit nsec representation 23# ktime_t scalar 64bit nsec representation
20config KTIME_SCALAR 24config KTIME_SCALAR
21 bool 25 bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e2fd74b8e8c2..ff7d9d2ab504 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 2obj-y += timeconv.o posix-clock.o alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index aa27d391bfc8..f11d83b12949 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -37,7 +37,6 @@
37static struct alarm_base { 37static struct alarm_base {
38 spinlock_t lock; 38 spinlock_t lock;
39 struct timerqueue_head timerqueue; 39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void); 40 ktime_t (*gettime)(void);
42 clockid_t base_clockid; 41 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE]; 42} alarm_bases[ALARM_NUMTYPE];
@@ -46,6 +45,8 @@ static struct alarm_base {
46static ktime_t freezer_delta; 45static ktime_t freezer_delta;
47static DEFINE_SPINLOCK(freezer_delta_lock); 46static DEFINE_SPINLOCK(freezer_delta_lock);
48 47
48static struct wakeup_source *ws;
49
49#ifdef CONFIG_RTC_CLASS 50#ifdef CONFIG_RTC_CLASS
50/* rtc timer and device for setting alarm wakeups at suspend */ 51/* rtc timer and device for setting alarm wakeups at suspend */
51static struct rtc_timer rtctimer; 52static struct rtc_timer rtctimer;
@@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { }
130 * @base: pointer to the base where the timer is being run 131 * @base: pointer to the base where the timer is being run
131 * @alarm: pointer to alarm being enqueued. 132 * @alarm: pointer to alarm being enqueued.
132 * 133 *
 133 * Adds alarm to a alarm_base timerqueue and if necessary sets 134 * Adds alarm to an alarm_base timerqueue
134 * an hrtimer to run.
135 * 135 *
136 * Must hold base->lock when calling. 136 * Must hold base->lock when calling.
137 */ 137 */
138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) 138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
139{ 139{
140 if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
141 timerqueue_del(&base->timerqueue, &alarm->node);
142
140 timerqueue_add(&base->timerqueue, &alarm->node); 143 timerqueue_add(&base->timerqueue, &alarm->node);
141 alarm->state |= ALARMTIMER_STATE_ENQUEUED; 144 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
142
143 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
144 hrtimer_try_to_cancel(&base->timer);
145 hrtimer_start(&base->timer, alarm->node.expires,
146 HRTIMER_MODE_ABS);
147 }
148} 145}
149 146
150/** 147/**
151 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue 148 * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue
152 * @base: pointer to the base where the timer is running 149 * @base: pointer to the base where the timer is running
153 * @alarm: pointer to alarm being removed 150 * @alarm: pointer to alarm being removed
154 * 151 *
 155 * Removes alarm to a alarm_base timerqueue and if necessary sets 152 * Removes alarm from an alarm_base timerqueue
156 * a new timer to run.
157 * 153 *
158 * Must hold base->lock when calling. 154 * Must hold base->lock when calling.
159 */ 155 */
160static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) 156static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
161{ 157{
162 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
163
164 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) 158 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
165 return; 159 return;
166 160
167 timerqueue_del(&base->timerqueue, &alarm->node); 161 timerqueue_del(&base->timerqueue, &alarm->node);
168 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; 162 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
169
170 if (next == &alarm->node) {
171 hrtimer_try_to_cancel(&base->timer);
172 next = timerqueue_getnext(&base->timerqueue);
173 if (!next)
174 return;
175 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
176 }
177} 163}
178 164
179 165
@@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
188 */ 174 */
189static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) 175static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
190{ 176{
191 struct alarm_base *base = container_of(timer, struct alarm_base, timer); 177 struct alarm *alarm = container_of(timer, struct alarm, timer);
192 struct timerqueue_node *next; 178 struct alarm_base *base = &alarm_bases[alarm->type];
193 unsigned long flags; 179 unsigned long flags;
194 ktime_t now;
195 int ret = HRTIMER_NORESTART; 180 int ret = HRTIMER_NORESTART;
196 int restart = ALARMTIMER_NORESTART; 181 int restart = ALARMTIMER_NORESTART;
197 182
198 spin_lock_irqsave(&base->lock, flags); 183 spin_lock_irqsave(&base->lock, flags);
199 now = base->gettime(); 184 alarmtimer_dequeue(base, alarm);
200 while ((next = timerqueue_getnext(&base->timerqueue))) { 185 spin_unlock_irqrestore(&base->lock, flags);
201 struct alarm *alarm;
202 ktime_t expired = next->expires;
203
204 if (expired.tv64 > now.tv64)
205 break;
206
207 alarm = container_of(next, struct alarm, node);
208
209 timerqueue_del(&base->timerqueue, &alarm->node);
210 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
211
212 alarm->state |= ALARMTIMER_STATE_CALLBACK;
213 spin_unlock_irqrestore(&base->lock, flags);
214 if (alarm->function)
215 restart = alarm->function(alarm, now);
216 spin_lock_irqsave(&base->lock, flags);
217 alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
218 186
219 if (restart != ALARMTIMER_NORESTART) { 187 if (alarm->function)
220 timerqueue_add(&base->timerqueue, &alarm->node); 188 restart = alarm->function(alarm, base->gettime());
221 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
222 }
223 }
224 189
225 if (next) { 190 spin_lock_irqsave(&base->lock, flags);
226 hrtimer_set_expires(&base->timer, next->expires); 191 if (restart != ALARMTIMER_NORESTART) {
192 hrtimer_set_expires(&alarm->timer, alarm->node.expires);
193 alarmtimer_enqueue(base, alarm);
227 ret = HRTIMER_RESTART; 194 ret = HRTIMER_RESTART;
228 } 195 }
229 spin_unlock_irqrestore(&base->lock, flags); 196 spin_unlock_irqrestore(&base->lock, flags);
@@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev)
250 unsigned long flags; 217 unsigned long flags;
251 struct rtc_device *rtc; 218 struct rtc_device *rtc;
252 int i; 219 int i;
220 int ret;
253 221
254 spin_lock_irqsave(&freezer_delta_lock, flags); 222 spin_lock_irqsave(&freezer_delta_lock, flags);
255 min = freezer_delta; 223 min = freezer_delta;
@@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev)
279 if (min.tv64 == 0) 247 if (min.tv64 == 0)
280 return 0; 248 return 0;
281 249
282 /* XXX - Should we enforce a minimum sleep time? */ 250 if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
283 WARN_ON(min.tv64 < NSEC_PER_SEC); 251 __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
252 return -EBUSY;
253 }
284 254
285 /* Setup an rtc timer to fire that far in the future */ 255 /* Setup an rtc timer to fire that far in the future */
286 rtc_timer_cancel(rtc, &rtctimer); 256 rtc_timer_cancel(rtc, &rtctimer);
@@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev)
288 now = rtc_tm_to_ktime(tm); 258 now = rtc_tm_to_ktime(tm);
289 now = ktime_add(now, min); 259 now = ktime_add(now, min);
290 260
291 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); 261 /* Set alarm, if in the past reject suspend briefly to handle */
292 262 ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
293 return 0; 263 if (ret < 0)
264 __pm_wakeup_event(ws, MSEC_PER_SEC);
265 return ret;
294} 266}
295#else 267#else
296static int alarmtimer_suspend(struct device *dev) 268static int alarmtimer_suspend(struct device *dev)
@@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
324 enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) 296 enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
325{ 297{
326 timerqueue_init(&alarm->node); 298 timerqueue_init(&alarm->node);
299 hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
300 HRTIMER_MODE_ABS);
301 alarm->timer.function = alarmtimer_fired;
327 alarm->function = function; 302 alarm->function = function;
328 alarm->type = type; 303 alarm->type = type;
329 alarm->state = ALARMTIMER_STATE_INACTIVE; 304 alarm->state = ALARMTIMER_STATE_INACTIVE;
@@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
334 * @alarm: ptr to alarm to set 309 * @alarm: ptr to alarm to set
335 * @start: time to run the alarm 310 * @start: time to run the alarm
336 */ 311 */
337void alarm_start(struct alarm *alarm, ktime_t start) 312int alarm_start(struct alarm *alarm, ktime_t start)
338{ 313{
339 struct alarm_base *base = &alarm_bases[alarm->type]; 314 struct alarm_base *base = &alarm_bases[alarm->type];
340 unsigned long flags; 315 unsigned long flags;
316 int ret;
341 317
342 spin_lock_irqsave(&base->lock, flags); 318 spin_lock_irqsave(&base->lock, flags);
343 if (alarmtimer_active(alarm))
344 alarmtimer_remove(base, alarm);
345 alarm->node.expires = start; 319 alarm->node.expires = start;
346 alarmtimer_enqueue(base, alarm); 320 alarmtimer_enqueue(base, alarm);
321 ret = hrtimer_start(&alarm->timer, alarm->node.expires,
322 HRTIMER_MODE_ABS);
347 spin_unlock_irqrestore(&base->lock, flags); 323 spin_unlock_irqrestore(&base->lock, flags);
324 return ret;
348} 325}
349 326
350/** 327/**
@@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm)
358{ 335{
359 struct alarm_base *base = &alarm_bases[alarm->type]; 336 struct alarm_base *base = &alarm_bases[alarm->type];
360 unsigned long flags; 337 unsigned long flags;
361 int ret = -1; 338 int ret;
362 spin_lock_irqsave(&base->lock, flags);
363
364 if (alarmtimer_callback_running(alarm))
365 goto out;
366 339
367 if (alarmtimer_is_queued(alarm)) { 340 spin_lock_irqsave(&base->lock, flags);
368 alarmtimer_remove(base, alarm); 341 ret = hrtimer_try_to_cancel(&alarm->timer);
369 ret = 1; 342 if (ret >= 0)
370 } else 343 alarmtimer_dequeue(base, alarm);
371 ret = 0;
372out:
373 spin_unlock_irqrestore(&base->lock, flags); 344 spin_unlock_irqrestore(&base->lock, flags);
374 return ret; 345 return ret;
375} 346}
@@ -802,10 +773,6 @@ static int __init alarmtimer_init(void)
802 for (i = 0; i < ALARM_NUMTYPE; i++) { 773 for (i = 0; i < ALARM_NUMTYPE; i++) {
803 timerqueue_init_head(&alarm_bases[i].timerqueue); 774 timerqueue_init_head(&alarm_bases[i].timerqueue);
804 spin_lock_init(&alarm_bases[i].lock); 775 spin_lock_init(&alarm_bases[i].lock);
805 hrtimer_init(&alarm_bases[i].timer,
806 alarm_bases[i].base_clockid,
807 HRTIMER_MODE_ABS);
808 alarm_bases[i].timer.function = alarmtimer_fired;
809 } 776 }
810 777
811 error = alarmtimer_rtc_interface_setup(); 778 error = alarmtimer_rtc_interface_setup();
@@ -821,6 +788,7 @@ static int __init alarmtimer_init(void)
821 error = PTR_ERR(pdev); 788 error = PTR_ERR(pdev);
822 goto out_drv; 789 goto out_drv;
823 } 790 }
791 ws = wakeup_source_register("alarmtimer");
824 return 0; 792 return 0;
825 793
826out_drv: 794out_drv:
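With each struct alarm now carrying its own hrtimer, a client looks roughly like the sketch below (invented names, not part of the patch); note that alarm_start() now returns the hrtimer_start() result:

#include <linux/alarmtimer.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct alarm demo_alarm;

static enum alarmtimer_restart demo_alarm_fired(struct alarm *alarm,
                                                ktime_t now)
{
        /* Called from the per-alarm hrtimer; one-shot in this sketch. */
        return ALARMTIMER_NORESTART;
}

static void demo_alarm_arm(void)
{
        alarm_init(&demo_alarm, ALARM_REALTIME, demo_alarm_fired);
        /* Absolute CLOCK_REALTIME expiry, roughly five seconds out. */
        alarm_start(&demo_alarm, ktime_add(ktime_get_real(),
                                           ktime_set(5, 0)));
}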
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 7e1ce012a851..30b6de0d977c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -397,6 +397,30 @@ void clockevents_exchange_device(struct clock_event_device *old,
397 local_irq_restore(flags); 397 local_irq_restore(flags);
398} 398}
399 399
400/**
401 * clockevents_suspend - suspend clock devices
402 */
403void clockevents_suspend(void)
404{
405 struct clock_event_device *dev;
406
407 list_for_each_entry_reverse(dev, &clockevent_devices, list)
408 if (dev->suspend)
409 dev->suspend(dev);
410}
411
412/**
413 * clockevents_resume - resume clock devices
414 */
415void clockevents_resume(void)
416{
417 struct clock_event_device *dev;
418
419 list_for_each_entry(dev, &clockevent_devices, list)
420 if (dev->resume)
421 dev->resume(dev);
422}
423
400#ifdef CONFIG_GENERIC_CLOCKEVENTS 424#ifdef CONFIG_GENERIC_CLOCKEVENTS
401/** 425/**
402 * clockevents_notify - notification about relevant events 426 * clockevents_notify - notification about relevant events
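
The two helpers added above suspend clock event devices by walking the registration list backwards and resume them walking forwards, so resume order mirrors registration order. A throwaway userspace sketch of that ordering discipline; struct dev and the helpers are invented for the example and are not kernel API:

/* Illustrative only: suspend devices newest-first, resume oldest-first,
 * mirroring the list_for_each_entry_reverse()/list_for_each_entry()
 * pairing in the hunk above. All names are made up. */
#include <stdio.h>

struct dev {
	const char *name;
	struct dev *next;   /* forward: registration order */
	struct dev *prev;   /* backward: reverse order */
};

static void suspend_all(struct dev *tail)
{
	for (struct dev *d = tail; d; d = d->prev)   /* newest first */
		printf("suspend %s\n", d->name);
}

static void resume_all(struct dev *head)
{
	for (struct dev *d = head; d; d = d->next)   /* oldest first */
		printf("resume %s\n", d->name);
}

int main(void)
{
	struct dev a = { "lapic", NULL, NULL };
	struct dev b = { "hpet", NULL, &a };

	a.next = &b;
	suspend_all(&b);   /* hpet, then lapic */
	resume_all(&a);    /* lapic, then hpet */
	return 0;
}
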
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 46da0537c10b..7a925ba456fb 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
37 * requested HZ value. It is also not recommended 37 * requested HZ value. It is also not recommended
38 * for "tick-less" systems. 38 * for "tick-less" systems.
39 */ 39 */
40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) 40#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ)
41 41
42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier 42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
43 * conversion, the .shift value could be zero. However 43 * conversion, the .shift value could be zero. However
@@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs)
58 return (cycle_t) jiffies; 58 return (cycle_t) jiffies;
59} 59}
60 60
61struct clocksource clocksource_jiffies = { 61static struct clocksource clocksource_jiffies = {
62 .name = "jiffies", 62 .name = "jiffies",
63 .rating = 1, /* lowest valid rating*/ 63 .rating = 1, /* lowest valid rating*/
64 .read = jiffies_read, 64 .read = jiffies_read,
@@ -67,6 +67,8 @@ struct clocksource clocksource_jiffies = {
67 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
68}; 68};
69 69
70__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
71
70#if (BITS_PER_LONG < 64) 72#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void) 73u64 get_jiffies_64(void)
72{ 74{
@@ -74,9 +76,9 @@ u64 get_jiffies_64(void)
74 u64 ret; 76 u64 ret;
75 77
76 do { 78 do {
77 seq = read_seqbegin(&xtime_lock); 79 seq = read_seqbegin(&jiffies_lock);
78 ret = jiffies_64; 80 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq)); 81 } while (read_seqretry(&jiffies_lock, seq));
80 return ret; 82 return ret;
81} 83}
82EXPORT_SYMBOL(get_jiffies_64); 84EXPORT_SYMBOL(get_jiffies_64);
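
get_jiffies_64() now samples the new jiffies_lock seqlock instead of xtime_lock: read the sequence count, copy the value, and retry if a writer ran in between. A rough C11 sketch of that retry loop, assuming a single writer; the protected value is atomic here only to keep the example well defined, and this is not the kernel's seqlock implementation:

/* Sketch of the read_seqbegin()/read_seqretry() pattern used above. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;
static _Atomic uint64_t jiffies64;

static void writer_tick(void)
{
	atomic_fetch_add(&seq, 1);        /* odd: update in progress */
	atomic_fetch_add(&jiffies64, 1);
	atomic_fetch_add(&seq, 1);        /* even: consistent again */
}

static uint64_t read_jiffies64(void)
{
	unsigned int s;
	uint64_t ret;

	do {
		s = atomic_load(&seq);
		ret = atomic_load(&jiffies64);
		/* retry if a writer was active (odd) or ran in between */
	} while ((s & 1) || atomic_load(&seq) != s);
	return ret;
}

int main(void)
{
	writer_tick();
	printf("jiffies64 = %llu\n", (unsigned long long)read_jiffies64());
	return 0;
}
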
@@ -95,3 +97,33 @@ struct clocksource * __init __weak clocksource_default_clock(void)
95{ 97{
96 return &clocksource_jiffies; 98 return &clocksource_jiffies;
97} 99}
100
101struct clocksource refined_jiffies;
102
103int register_refined_jiffies(long cycles_per_second)
104{
105 u64 nsec_per_tick, shift_hz;
106 long cycles_per_tick;
107
108
109
110 refined_jiffies = clocksource_jiffies;
111 refined_jiffies.name = "refined-jiffies";
112 refined_jiffies.rating++;
113
114 /* Calc cycles per tick */
115 cycles_per_tick = (cycles_per_second + HZ/2)/HZ;
116 /* shift_hz stores hz<<8 for extra accuracy */
117 shift_hz = (u64)cycles_per_second << 8;
118 shift_hz += cycles_per_tick/2;
119 do_div(shift_hz, cycles_per_tick);
120 /* Calculate nsec_per_tick using shift_hz */
121 nsec_per_tick = (u64)NSEC_PER_SEC << 8;
122 nsec_per_tick += (u32)shift_hz/2;
123 do_div(nsec_per_tick, (u32)shift_hz);
124
125 refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
126
127 clocksource_register(&refined_jiffies);
128 return 0;
129}
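
register_refined_jiffies() carries HZ shifted left by 8 bits for extra precision while deriving nanoseconds per tick, then folds the result into the clocksource mult with JIFFIES_SHIFT. The same arithmetic as a standalone program; the 1193182 Hz input, HZ=100 and JIFFIES_SHIFT=8 are assumed values chosen only to make the numbers concrete:

/* Worked version of the register_refined_jiffies() math. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC  1000000000ULL
#define HZ            100
#define JIFFIES_SHIFT 8

int main(void)
{
	long cycles_per_second = 1193182;   /* example input clock */
	long cycles_per_tick;
	uint64_t shift_hz, nsec_per_tick, mult;

	/* Cycles per tick, rounded to nearest */
	cycles_per_tick = (cycles_per_second + HZ / 2) / HZ;

	/* shift_hz stores hz << 8 for extra accuracy */
	shift_hz  = (uint64_t)cycles_per_second << 8;
	shift_hz += cycles_per_tick / 2;
	shift_hz /= cycles_per_tick;

	/* Nanoseconds per tick, still carrying the extra 8 bits */
	nsec_per_tick  = NSEC_PER_SEC << 8;
	nsec_per_tick += (uint32_t)shift_hz / 2;
	nsec_per_tick /= (uint32_t)shift_hz;

	mult = (uint32_t)nsec_per_tick << JIFFIES_SHIFT;

	printf("cycles/tick=%ld shift_hz=%llu nsec/tick(<<8)=%llu mult=%llu\n",
	       cycles_per_tick,
	       (unsigned long long)shift_hz,
	       (unsigned long long)nsec_per_tick,
	       (unsigned long long)mult);
	return 0;
}
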
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index da6c9ecad4e4..b1600a6973f4 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -63,13 +63,13 @@ int tick_is_oneshot_available(void)
63static void tick_periodic(int cpu) 63static void tick_periodic(int cpu)
64{ 64{
65 if (tick_do_timer_cpu == cpu) { 65 if (tick_do_timer_cpu == cpu) {
66 write_seqlock(&xtime_lock); 66 write_seqlock(&jiffies_lock);
67 67
68 /* Keep track of the next tick event */ 68 /* Keep track of the next tick event */
69 tick_next_period = ktime_add(tick_next_period, tick_period); 69 tick_next_period = ktime_add(tick_next_period, tick_period);
70 70
71 do_timer(1); 71 do_timer(1);
72 write_sequnlock(&xtime_lock); 72 write_sequnlock(&jiffies_lock);
73 } 73 }
74 74
75 update_process_times(user_mode(get_irq_regs())); 75 update_process_times(user_mode(get_irq_regs()));
@@ -130,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
130 ktime_t next; 130 ktime_t next;
131 131
132 do { 132 do {
133 seq = read_seqbegin(&xtime_lock); 133 seq = read_seqbegin(&jiffies_lock);
134 next = tick_next_period; 134 next = tick_next_period;
135 } while (read_seqretry(&xtime_lock, seq)); 135 } while (read_seqretry(&jiffies_lock, seq));
136 136
137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
138 138
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 4e265b901fed..cf3e59ed6dc0 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -141,4 +141,3 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
141#endif 141#endif
142 142
143extern void do_timer(unsigned long ticks); 143extern void do_timer(unsigned long ticks);
144extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3a9e5d5c1091..d58e552d9fd1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -31,7 +31,7 @@
31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
32 32
33/* 33/*
34 * The time, when the last jiffy update happened. Protected by xtime_lock. 34 * The time, when the last jiffy update happened. Protected by jiffies_lock.
35 */ 35 */
36static ktime_t last_jiffies_update; 36static ktime_t last_jiffies_update;
37 37
@@ -49,14 +49,14 @@ static void tick_do_update_jiffies64(ktime_t now)
49 ktime_t delta; 49 ktime_t delta;
50 50
51 /* 51 /*
52 * Do a quick check without holding xtime_lock: 52 * Do a quick check without holding jiffies_lock:
53 */ 53 */
54 delta = ktime_sub(now, last_jiffies_update); 54 delta = ktime_sub(now, last_jiffies_update);
55 if (delta.tv64 < tick_period.tv64) 55 if (delta.tv64 < tick_period.tv64)
56 return; 56 return;
57 57
 58 * Reevaluate with xtime_lock held 58 * Reevaluate with jiffies_lock held
59 write_seqlock(&xtime_lock); 59 write_seqlock(&jiffies_lock);
60 60
61 delta = ktime_sub(now, last_jiffies_update); 61 delta = ktime_sub(now, last_jiffies_update);
62 if (delta.tv64 >= tick_period.tv64) { 62 if (delta.tv64 >= tick_period.tv64) {
@@ -79,7 +79,7 @@ static void tick_do_update_jiffies64(ktime_t now)
79 /* Keep the tick_next_period variable up to date */ 79 /* Keep the tick_next_period variable up to date */
80 tick_next_period = ktime_add(last_jiffies_update, tick_period); 80 tick_next_period = ktime_add(last_jiffies_update, tick_period);
81 } 81 }
82 write_sequnlock(&xtime_lock); 82 write_sequnlock(&jiffies_lock);
83} 83}
84 84
85/* 85/*
@@ -89,15 +89,58 @@ static ktime_t tick_init_jiffy_update(void)
89{ 89{
90 ktime_t period; 90 ktime_t period;
91 91
92 write_seqlock(&xtime_lock); 92 write_seqlock(&jiffies_lock);
93 /* Did we start the jiffies update yet ? */ 93 /* Did we start the jiffies update yet ? */
94 if (last_jiffies_update.tv64 == 0) 94 if (last_jiffies_update.tv64 == 0)
95 last_jiffies_update = tick_next_period; 95 last_jiffies_update = tick_next_period;
96 period = last_jiffies_update; 96 period = last_jiffies_update;
97 write_sequnlock(&xtime_lock); 97 write_sequnlock(&jiffies_lock);
98 return period; 98 return period;
99} 99}
100 100
101
102static void tick_sched_do_timer(ktime_t now)
103{
104 int cpu = smp_processor_id();
105
106#ifdef CONFIG_NO_HZ
107 /*
108 * Check if the do_timer duty was dropped. We don't care about
109 * concurrency: This happens only when the cpu in charge went
110 * into a long sleep. If two cpus happen to assign themselves to
111 * this duty, then the jiffies update is still serialized by
112 * jiffies_lock.
113 */
114 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
115 tick_do_timer_cpu = cpu;
116#endif
117
118 /* Check if the jiffies need an update */
119 if (tick_do_timer_cpu == cpu)
120 tick_do_update_jiffies64(now);
121}
122
123static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
124{
125#ifdef CONFIG_NO_HZ
126 /*
127 * When we are idle and the tick is stopped, we have to touch
128 * the watchdog as we might not schedule for a really long
129 * time. This happens on complete idle SMP systems while
130 * waiting on the login prompt. We also increment the "start of
131 * idle" jiffy stamp so the idle accounting adjustment we do
132 * when we go busy again does not account too many ticks.
133 */
134 if (ts->tick_stopped) {
135 touch_softlockup_watchdog();
136 if (is_idle_task(current))
137 ts->idle_jiffies++;
138 }
139#endif
140 update_process_times(user_mode(regs));
141 profile_tick(CPU_PROFILING);
142}
143
101/* 144/*
102 * NOHZ - aka dynamic tick functionality 145 * NOHZ - aka dynamic tick functionality
103 */ 146 */
@@ -282,11 +325,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
282 325
283 /* Read jiffies and the time when jiffies were updated last */ 326 /* Read jiffies and the time when jiffies were updated last */
284 do { 327 do {
285 seq = read_seqbegin(&xtime_lock); 328 seq = read_seqbegin(&jiffies_lock);
286 last_update = last_jiffies_update; 329 last_update = last_jiffies_update;
287 last_jiffies = jiffies; 330 last_jiffies = jiffies;
288 time_delta = timekeeping_max_deferment(); 331 time_delta = timekeeping_max_deferment();
289 } while (read_seqretry(&xtime_lock, seq)); 332 } while (read_seqretry(&jiffies_lock, seq));
290 333
291 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || 334 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
292 arch_needs_cpu(cpu)) { 335 arch_needs_cpu(cpu)) {
@@ -372,7 +415,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
372 * the scheduler tick in nohz_restart_sched_tick. 415 * the scheduler tick in nohz_restart_sched_tick.
373 */ 416 */
374 if (!ts->tick_stopped) { 417 if (!ts->tick_stopped) {
375 select_nohz_load_balancer(1); 418 nohz_balance_enter_idle(cpu);
376 calc_load_enter_idle(); 419 calc_load_enter_idle();
377 420
378 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 421 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -436,7 +479,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
436 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 479 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
437 static int ratelimit; 480 static int ratelimit;
438 481
439 if (ratelimit < 10) { 482 if (ratelimit < 10 &&
483 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
440 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 484 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
441 (unsigned int) local_softirq_pending()); 485 (unsigned int) local_softirq_pending());
442 ratelimit++; 486 ratelimit++;
@@ -525,6 +569,8 @@ void tick_nohz_irq_exit(void)
525 if (!ts->inidle) 569 if (!ts->inidle)
526 return; 570 return;
527 571
572 /* Cancel the timer because the CPU has already woken up from the C-states */
573 menu_hrtimer_cancel();
528 __tick_nohz_idle_enter(ts); 574 __tick_nohz_idle_enter(ts);
529} 575}
530 576
@@ -569,7 +615,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
569static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) 615static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
570{ 616{
571 /* Update jiffies first */ 617 /* Update jiffies first */
572 select_nohz_load_balancer(0);
573 tick_do_update_jiffies64(now); 618 tick_do_update_jiffies64(now);
574 update_cpu_load_nohz(); 619 update_cpu_load_nohz();
575 620
@@ -621,6 +666,8 @@ void tick_nohz_idle_exit(void)
621 666
622 ts->inidle = 0; 667 ts->inidle = 0;
623 668
669 /* Cancel the timer because the CPU has already woken up from the C-states */
670 menu_hrtimer_cancel();
624 if (ts->idle_active || ts->tick_stopped) 671 if (ts->idle_active || ts->tick_stopped)
625 now = ktime_get(); 672 now = ktime_get();
626 673
@@ -648,40 +695,12 @@ static void tick_nohz_handler(struct clock_event_device *dev)
648{ 695{
649 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 696 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
650 struct pt_regs *regs = get_irq_regs(); 697 struct pt_regs *regs = get_irq_regs();
651 int cpu = smp_processor_id();
652 ktime_t now = ktime_get(); 698 ktime_t now = ktime_get();
653 699
654 dev->next_event.tv64 = KTIME_MAX; 700 dev->next_event.tv64 = KTIME_MAX;
655 701
656 /* 702 tick_sched_do_timer(now);
657 * Check if the do_timer duty was dropped. We don't care about 703 tick_sched_handle(ts, regs);
658 * concurrency: This happens only when the cpu in charge went
659 * into a long sleep. If two cpus happen to assign themself to
660 * this duty, then the jiffies update is still serialized by
661 * xtime_lock.
662 */
663 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
664 tick_do_timer_cpu = cpu;
665
666 /* Check, if the jiffies need an update */
667 if (tick_do_timer_cpu == cpu)
668 tick_do_update_jiffies64(now);
669
670 /*
671 * When we are idle and the tick is stopped, we have to touch
672 * the watchdog as we might not schedule for a really long
673 * time. This happens on complete idle SMP systems while
674 * waiting on the login prompt. We also increment the "start
675 * of idle" jiffy stamp so the idle accounting adjustment we
676 * do when we go busy again does not account too much ticks.
677 */
678 if (ts->tick_stopped) {
679 touch_softlockup_watchdog();
680 ts->idle_jiffies++;
681 }
682
683 update_process_times(user_mode(regs));
684 profile_tick(CPU_PROFILING);
685 704
686 while (tick_nohz_reprogram(ts, now)) { 705 while (tick_nohz_reprogram(ts, now)) {
687 now = ktime_get(); 706 now = ktime_get();
@@ -794,7 +813,7 @@ void tick_check_idle(int cpu)
794#ifdef CONFIG_HIGH_RES_TIMERS 813#ifdef CONFIG_HIGH_RES_TIMERS
795/* 814/*
796 * We rearm the timer until we get disabled by the idle code. 815 * We rearm the timer until we get disabled by the idle code.
797 * Called with interrupts disabled and timer->base->cpu_base->lock held. 816 * Called with interrupts disabled.
798 */ 817 */
799static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 818static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
800{ 819{
@@ -802,45 +821,15 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
802 container_of(timer, struct tick_sched, sched_timer); 821 container_of(timer, struct tick_sched, sched_timer);
803 struct pt_regs *regs = get_irq_regs(); 822 struct pt_regs *regs = get_irq_regs();
804 ktime_t now = ktime_get(); 823 ktime_t now = ktime_get();
805 int cpu = smp_processor_id();
806 824
807#ifdef CONFIG_NO_HZ 825 tick_sched_do_timer(now);
808 /*
809 * Check if the do_timer duty was dropped. We don't care about
810 * concurrency: This happens only when the cpu in charge went
811 * into a long sleep. If two cpus happen to assign themself to
812 * this duty, then the jiffies update is still serialized by
813 * xtime_lock.
814 */
815 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
816 tick_do_timer_cpu = cpu;
817#endif
818
819 /* Check, if the jiffies need an update */
820 if (tick_do_timer_cpu == cpu)
821 tick_do_update_jiffies64(now);
822 826
823 /* 827 /*
824 * Do not call, when we are not in irq context and have 828 * Do not call, when we are not in irq context and have
825 * no valid regs pointer 829 * no valid regs pointer
826 */ 830 */
827 if (regs) { 831 if (regs)
828 /* 832 tick_sched_handle(ts, regs);
829 * When we are idle and the tick is stopped, we have to touch
830 * the watchdog as we might not schedule for a really long
831 * time. This happens on complete idle SMP systems while
832 * waiting on the login prompt. We also increment the "start of
833 * idle" jiffy stamp so the idle accounting adjustment we do
834 * when we go busy again does not account too much ticks.
835 */
836 if (ts->tick_stopped) {
837 touch_softlockup_watchdog();
838 if (idle_cpu(cpu))
839 ts->idle_jiffies++;
840 }
841 update_process_times(user_mode(regs));
842 profile_tick(CPU_PROFILING);
843 }
844 833
845 hrtimer_forward(timer, now, tick_period); 834 hrtimer_forward(timer, now, tick_period);
846 835
@@ -874,7 +863,7 @@ void tick_setup_sched_timer(void)
874 /* Get the next period (per cpu) */ 863 /* Get the next period (per cpu) */
875 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 864 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
876 865
877 /* Offset the tick to avert xtime_lock contention. */ 866 /* Offset the tick to avert jiffies_lock contention. */
878 if (sched_skew_tick) { 867 if (sched_skew_tick) {
879 u64 offset = ktime_to_ns(tick_period) >> 1; 868 u64 offset = ktime_to_ns(tick_period) >> 1;
880 do_div(offset, num_possible_cpus()); 869 do_div(offset, num_possible_cpus());
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
deleted file mode 100644
index a9ae369925ce..000000000000
--- a/kernel/time/timecompare.c
+++ /dev/null
@@ -1,193 +0,0 @@
1/*
2 * Copyright (C) 2009 Intel Corporation.
3 * Author: Patrick Ohly <patrick.ohly@intel.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20#include <linux/timecompare.h>
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/math64.h>
24#include <linux/kernel.h>
25
26/*
27 * fixed point arithmetic scale factor for skew
28 *
29 * Usually one would measure skew in ppb (parts per billion, 1e9), but
30 * using a factor of 2 simplifies the math.
31 */
32#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
33
34ktime_t timecompare_transform(struct timecompare *sync,
35 u64 source_tstamp)
36{
37 u64 nsec;
38
39 nsec = source_tstamp + sync->offset;
40 nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
41 TIMECOMPARE_SKEW_RESOLUTION;
42
43 return ns_to_ktime(nsec);
44}
45EXPORT_SYMBOL_GPL(timecompare_transform);
46
47int timecompare_offset(struct timecompare *sync,
48 s64 *offset,
49 u64 *source_tstamp)
50{
51 u64 start_source = 0, end_source = 0;
52 struct {
53 s64 offset;
54 s64 duration_target;
55 } buffer[10], sample, *samples;
56 int counter = 0, i;
57 int used;
58 int index;
59 int num_samples = sync->num_samples;
60
61 if (num_samples > ARRAY_SIZE(buffer)) {
62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
63 if (!samples) {
64 samples = buffer;
65 num_samples = ARRAY_SIZE(buffer);
66 }
67 } else {
68 samples = buffer;
69 }
70
71 /* run until we have enough valid samples, but do not try forever */
72 i = 0;
73 counter = 0;
74 while (1) {
75 u64 ts;
76 ktime_t start, end;
77
78 start = sync->target();
79 ts = timecounter_read(sync->source);
80 end = sync->target();
81
82 if (!i)
83 start_source = ts;
84
85 /* ignore negative durations */
86 sample.duration_target = ktime_to_ns(ktime_sub(end, start));
87 if (sample.duration_target >= 0) {
88 /*
89 * assume symetric delay to and from source:
90 * average target time corresponds to measured
91 * source time
92 */
93 sample.offset =
94 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
95 ts;
96
97 /* simple insertion sort based on duration */
98 index = counter - 1;
99 while (index >= 0) {
100 if (samples[index].duration_target <
101 sample.duration_target)
102 break;
103 samples[index + 1] = samples[index];
104 index--;
105 }
106 samples[index + 1] = sample;
107 counter++;
108 }
109
110 i++;
111 if (counter >= num_samples || i >= 100000) {
112 end_source = ts;
113 break;
114 }
115 }
116
117 *source_tstamp = (end_source + start_source) / 2;
118
119 /* remove outliers by only using 75% of the samples */
120 used = counter * 3 / 4;
121 if (!used)
122 used = counter;
123 if (used) {
124 /* calculate average */
125 s64 off = 0;
126 for (index = 0; index < used; index++)
127 off += samples[index].offset;
128 *offset = div_s64(off, used);
129 }
130
131 if (samples && samples != buffer)
132 kfree(samples);
133
134 return used;
135}
136EXPORT_SYMBOL_GPL(timecompare_offset);
137
138void __timecompare_update(struct timecompare *sync,
139 u64 source_tstamp)
140{
141 s64 offset;
142 u64 average_time;
143
144 if (!timecompare_offset(sync, &offset, &average_time))
145 return;
146
147 if (!sync->last_update) {
148 sync->last_update = average_time;
149 sync->offset = offset;
150 sync->skew = 0;
151 } else {
152 s64 delta_nsec = average_time - sync->last_update;
153
154 /* avoid division by negative or small deltas */
155 if (delta_nsec >= 10000) {
156 s64 delta_offset_nsec = offset - sync->offset;
157 s64 skew; /* delta_offset_nsec *
158 TIMECOMPARE_SKEW_RESOLUTION /
159 delta_nsec */
160 u64 divisor;
161
162 /* div_s64() is limited to 32 bit divisor */
163 skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
164 divisor = delta_nsec;
165 while (unlikely(divisor >= ((s64)1) << 32)) {
166 /* divide both by 2; beware, right shift
167 of negative value has undefined
168 behavior and can only be used for
169 the positive divisor */
170 skew = div_s64(skew, 2);
171 divisor >>= 1;
172 }
173 skew = div_s64(skew, divisor);
174
175 /*
176 * Calculate new overall skew as 4/16 the
177 * old value and 12/16 the new one. This is
178 * a rather arbitrary tradeoff between
179 * only using the latest measurement (0/16 and
180 * 16/16) and even more weight on past measurements.
181 */
182#define TIMECOMPARE_NEW_SKEW_PER_16 12
183 sync->skew =
184 div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
185 sync->skew +
186 TIMECOMPARE_NEW_SKEW_PER_16 * skew,
187 16);
188 sync->last_update = average_time;
189 sync->offset = offset;
190 }
191 }
192}
193EXPORT_SYMBOL_GPL(__timecompare_update);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d3b91e75cecd..cbc6acb0db3f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/timekeeper_internal.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
13#include <linux/percpu.h> 14#include <linux/percpu.h>
@@ -20,71 +21,11 @@
20#include <linux/time.h> 21#include <linux/time.h>
21#include <linux/tick.h> 22#include <linux/tick.h>
22#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h>
23 25
24/* Structure holding internal timekeeping values. */
25struct timekeeper {
26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock;
28 /* NTP adjusted clock multiplier */
29 u32 mult;
30 /* The shift value of the current clocksource. */
31 u32 shift;
32 /* Number of clock cycles in one NTP interval. */
33 cycle_t cycle_interval;
34 /* Number of clock shifted nano seconds in one NTP interval. */
35 u64 xtime_interval;
36 /* shifted nano seconds left over when rounding cycle_interval */
37 s64 xtime_remainder;
38 /* Raw nano seconds accumulated per NTP interval. */
39 u32 raw_interval;
40
41 /* Current CLOCK_REALTIME time in seconds */
42 u64 xtime_sec;
43 /* Clock shifted nano seconds */
44 u64 xtime_nsec;
45
46 /* Difference between accumulated time and NTP time in ntp
47 * shifted nano seconds. */
48 s64 ntp_error;
49 /* Shift conversion between clock shifted nano seconds and
50 * ntp shifted nano seconds. */
51 u32 ntp_error_shift;
52
53 /*
54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
56 * at zero at system boot time, so wall_to_monotonic will be negative,
57 * however, we will ALWAYS keep the tv_nsec part positive so we can use
58 * the usual normalization.
59 *
60 * wall_to_monotonic is moved after resume from suspend for the
61 * monotonic time not to jump. We need to add total_sleep_time to
62 * wall_to_monotonic to get the real boot based time offset.
63 *
64 * - wall_to_monotonic is no longer the boot time, getboottime must be
65 * used instead.
66 */
67 struct timespec wall_to_monotonic;
68 /* Offset clock monotonic -> clock realtime */
69 ktime_t offs_real;
70 /* time spent in suspend */
71 struct timespec total_sleep_time;
72 /* Offset clock monotonic -> clock boottime */
73 ktime_t offs_boot;
74 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
75 struct timespec raw_time;
76 /* Seqlock for all timekeeper values */
77 seqlock_t lock;
78};
79 26
80static struct timekeeper timekeeper; 27static struct timekeeper timekeeper;
81 28
82/*
83 * This read-write spinlock protects us from races in SMP while
84 * playing with xtime.
85 */
86__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
87
88/* flag for if timekeeping is suspended */ 29/* flag for if timekeeping is suspended */
89int __read_mostly timekeeping_suspended; 30int __read_mostly timekeeping_suspended;
90 31
@@ -96,15 +37,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
96 } 37 }
97} 38}
98 39
99static struct timespec tk_xtime(struct timekeeper *tk)
100{
101 struct timespec ts;
102
103 ts.tv_sec = tk->xtime_sec;
104 ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
105 return ts;
106}
107
108static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) 40static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
109{ 41{
110 tk->xtime_sec = ts->tv_sec; 42 tk->xtime_sec = ts->tv_sec;
@@ -243,17 +175,63 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
243 return nsec + arch_gettimeoffset(); 175 return nsec + arch_gettimeoffset();
244} 176}
245 177
178static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
179
180static void update_pvclock_gtod(struct timekeeper *tk)
181{
182 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk);
183}
184
185/**
186 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
187 *
188 * Must hold write on timekeeper.lock
189 */
190int pvclock_gtod_register_notifier(struct notifier_block *nb)
191{
192 struct timekeeper *tk = &timekeeper;
193 unsigned long flags;
194 int ret;
195
196 write_seqlock_irqsave(&tk->lock, flags);
197 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
198 /* update timekeeping data */
199 update_pvclock_gtod(tk);
200 write_sequnlock_irqrestore(&tk->lock, flags);
201
202 return ret;
203}
204EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
205
206/**
207 * pvclock_gtod_unregister_notifier - unregister a pvclock
208 * timedata update listener
209 *
210 * Must hold write on timekeeper.lock
211 */
212int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
213{
214 struct timekeeper *tk = &timekeeper;
215 unsigned long flags;
216 int ret;
217
218 write_seqlock_irqsave(&tk->lock, flags);
219 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
220 write_sequnlock_irqrestore(&tk->lock, flags);
221
222 return ret;
223}
224EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
225
246/* must hold write on timekeeper.lock */ 226/* must hold write on timekeeper.lock */
247static void timekeeping_update(struct timekeeper *tk, bool clearntp) 227static void timekeeping_update(struct timekeeper *tk, bool clearntp)
248{ 228{
249 struct timespec xt;
250
251 if (clearntp) { 229 if (clearntp) {
252 tk->ntp_error = 0; 230 tk->ntp_error = 0;
253 ntp_clear(); 231 ntp_clear();
254 } 232 }
255 xt = tk_xtime(tk); 233 update_vsyscall(tk);
256 update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); 234 update_pvclock_gtod(tk);
257} 235}
258 236
259/** 237/**
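
The pvclock_gtod chain added above gives listeners a callback with the timekeeper every time the timekeeping data is updated. A toy model of such a notifier chain, a singly linked list of callbacks invoked in order; the names are invented and this is not the kernel's raw_notifier implementation (locking is omitted):

/* Toy notifier chain: register callbacks, then call them all. */
#include <stdio.h>

struct notifier {
	int (*call)(void *data);
	struct notifier *next;
};

static struct notifier *chain;

static void chain_register(struct notifier *nb)
{
	nb->next = chain;
	chain = nb;
}

static void chain_call(void *data)
{
	for (struct notifier *nb = chain; nb; nb = nb->next)
		nb->call(data);
}

static int print_update(void *data)
{
	printf("timekeeping updated: %s\n", (const char *)data);
	return 0;
}

int main(void)
{
	struct notifier nb = { .call = print_update };

	chain_register(&nb);
	chain_call("tk state");   /* analogous to update_pvclock_gtod(tk) */
	return 0;
}
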
@@ -776,6 +754,7 @@ static void timekeeping_resume(void)
776 754
777 read_persistent_clock(&ts); 755 read_persistent_clock(&ts);
778 756
757 clockevents_resume();
779 clocksource_resume(); 758 clocksource_resume();
780 759
781 write_seqlock_irqsave(&tk->lock, flags); 760 write_seqlock_irqsave(&tk->lock, flags);
@@ -835,6 +814,7 @@ static int timekeeping_suspend(void)
835 814
836 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 815 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
837 clocksource_suspend(); 816 clocksource_suspend();
817 clockevents_suspend();
838 818
839 return 0; 819 return 0;
840} 820}
@@ -1111,7 +1091,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1111 accumulate_nsecs_to_secs(tk); 1091 accumulate_nsecs_to_secs(tk);
1112 1092
1113 /* Accumulate raw time */ 1093 /* Accumulate raw time */
1114 raw_nsecs = tk->raw_interval << shift; 1094 raw_nsecs = (u64)tk->raw_interval << shift;
1115 raw_nsecs += tk->raw_time.tv_nsec; 1095 raw_nsecs += tk->raw_time.tv_nsec;
1116 if (raw_nsecs >= NSEC_PER_SEC) { 1096 if (raw_nsecs >= NSEC_PER_SEC) {
1117 u64 raw_secs = raw_nsecs; 1097 u64 raw_secs = raw_nsecs;
@@ -1128,6 +1108,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1128 return offset; 1108 return offset;
1129} 1109}
1130 1110
1111#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
1112static inline void old_vsyscall_fixup(struct timekeeper *tk)
1113{
1114 s64 remainder;
1115
1116 /*
1117 * Store only full nanoseconds into xtime_nsec after rounding
1118 * it up and add the remainder to the error difference.
1119 * XXX - This is necessary to avoid small 1ns inconsistencies caused
1120 * by truncating the remainder in vsyscalls. However, it causes
1121 * additional work to be done in timekeeping_adjust(). Once
1122 * the vsyscall implementations are converted to use xtime_nsec
1123 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
1124 * users are removed, this can be killed.
1125 */
1126 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1127 tk->xtime_nsec -= remainder;
1128 tk->xtime_nsec += 1ULL << tk->shift;
1129 tk->ntp_error += remainder << tk->ntp_error_shift;
1130
1131}
1132#else
1133#define old_vsyscall_fixup(tk)
1134#endif
1135
1136
1137
1131/** 1138/**
1132 * update_wall_time - Uses the current clocksource to increment the wall time 1139 * update_wall_time - Uses the current clocksource to increment the wall time
1133 * 1140 *
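
old_vsyscall_fixup() rounds the shifted-nanosecond accumulator up to a whole nanosecond and adds the dropped fraction to ntp_error so it can be corrected later. The same arithmetic in isolation; the shift, ntp_error_shift and sample value are arbitrary numbers chosen for the demonstration:

/* The rounding step of old_vsyscall_fixup(), outside the kernel. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t shift = 10;                           /* example clocksource shift */
	uint64_t xtime_nsec = (123ULL << shift) + 37;  /* 123 ns plus a fraction */
	int64_t  ntp_error = 0;
	uint32_t ntp_error_shift = 2;                  /* example value */

	/* Bits below one full nanosecond */
	int64_t remainder = xtime_nsec & ((1ULL << shift) - 1);

	/* Round xtime_nsec up to the next whole nanosecond ... */
	xtime_nsec -= remainder;
	xtime_nsec += 1ULL << shift;
	/* ... and book the dropped fraction into ntp_error, as the hunk above does */
	ntp_error += remainder << ntp_error_shift;

	printf("xtime_nsec now %llu ns, ntp_error %lld\n",
	       (unsigned long long)(xtime_nsec >> shift),
	       (long long)ntp_error);
	return 0;
}
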
@@ -1139,7 +1146,6 @@ static void update_wall_time(void)
1139 cycle_t offset; 1146 cycle_t offset;
1140 int shift = 0, maxshift; 1147 int shift = 0, maxshift;
1141 unsigned long flags; 1148 unsigned long flags;
1142 s64 remainder;
1143 1149
1144 write_seqlock_irqsave(&tk->lock, flags); 1150 write_seqlock_irqsave(&tk->lock, flags);
1145 1151
@@ -1181,20 +1187,11 @@ static void update_wall_time(void)
1181 /* correct the clock when NTP error is too big */ 1187 /* correct the clock when NTP error is too big */
1182 timekeeping_adjust(tk, offset); 1188 timekeeping_adjust(tk, offset);
1183 1189
1184
1185 /* 1190 /*
1186 * Store only full nanoseconds into xtime_nsec after rounding 1191 * XXX This can be killed once everyone converts
1187 * it up and add the remainder to the error difference. 1192 * to the new update_vsyscall.
1188 * XXX - This is necessary to avoid small 1ns inconsistnecies caused 1193 */
1189 * by truncating the remainder in vsyscalls. However, it causes 1194 old_vsyscall_fixup(tk);
1190 * additional work to be done in timekeeping_adjust(). Once
1191 * the vsyscall implementations are converted to use xtime_nsec
1192 * (shifted nanoseconds), this can be killed.
1193 */
1194 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1195 tk->xtime_nsec -= remainder;
1196 tk->xtime_nsec += 1ULL << tk->shift;
1197 tk->ntp_error += remainder << tk->ntp_error_shift;
1198 1195
1199 /* 1196 /*
1200 * Finally, make sure that after the rounding 1197 * Finally, make sure that after the rounding
@@ -1346,9 +1343,7 @@ struct timespec get_monotonic_coarse(void)
1346} 1343}
1347 1344
1348/* 1345/*
1349 * The 64-bit jiffies value is not atomic - you MUST NOT read it 1346 * Must hold jiffies_lock
1350 * without sampling the sequence number in xtime_lock.
1351 * jiffies is defined in the linker script...
1352 */ 1347 */
1353void do_timer(unsigned long ticks) 1348void do_timer(unsigned long ticks)
1354{ 1349{
@@ -1436,7 +1431,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1436 */ 1431 */
1437void xtime_update(unsigned long ticks) 1432void xtime_update(unsigned long ticks)
1438{ 1433{
1439 write_seqlock(&xtime_lock); 1434 write_seqlock(&jiffies_lock);
1440 do_timer(ticks); 1435 do_timer(ticks);
1441 write_sequnlock(&xtime_lock); 1436 write_sequnlock(&jiffies_lock);
1442} 1437}
diff --git a/kernel/timer.c b/kernel/timer.c
index 8c5e7b908c68..367d00858482 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64);
63#define TVR_SIZE (1 << TVR_BITS) 63#define TVR_SIZE (1 << TVR_BITS)
64#define TVN_MASK (TVN_SIZE - 1) 64#define TVN_MASK (TVN_SIZE - 1)
65#define TVR_MASK (TVR_SIZE - 1) 65#define TVR_MASK (TVR_SIZE - 1)
66#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
66 67
67struct tvec { 68struct tvec {
68 struct list_head vec[TVN_SIZE]; 69 struct list_head vec[TVN_SIZE];
@@ -92,24 +93,25 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
92/* Functions below help us manage 'deferrable' flag */ 93/* Functions below help us manage 'deferrable' flag */
93static inline unsigned int tbase_get_deferrable(struct tvec_base *base) 94static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
94{ 95{
95 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); 96 return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
96} 97}
97 98
98static inline struct tvec_base *tbase_get_base(struct tvec_base *base) 99static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
99{ 100{
100 return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); 101 return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
101} 102}
102 103
103static inline void timer_set_deferrable(struct timer_list *timer) 104static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
104{ 105{
105 timer->base = TBASE_MAKE_DEFERRED(timer->base); 106 return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
106} 107}
107 108
108static inline void 109static inline void
109timer_set_base(struct timer_list *timer, struct tvec_base *new_base) 110timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
110{ 111{
111 timer->base = (struct tvec_base *)((unsigned long)(new_base) | 112 unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
112 tbase_get_deferrable(timer->base)); 113
114 timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
113} 115}
114 116
115static unsigned long round_jiffies_common(unsigned long j, int cpu, 117static unsigned long round_jiffies_common(unsigned long j, int cpu,
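
The timer base pointer now carries TIMER_DEFERRABLE and TIMER_IRQSAFE in its low bits, which only works because struct tvec_base is aligned enough that those bits are otherwise zero (the BUILD_BUG_ON added to init_timers() further down enforces exactly that). A freestanding sketch of the trick; the flag values and the struct are stand-ins, not the kernel's definitions:

/* Packing flag bits into the low bits of a sufficiently aligned pointer. */
#include <assert.h>
#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_DEFERRABLE 0x1u
#define FLAG_IRQSAFE    0x2u
#define FLAG_MASK       0x3u

struct base {
	alignas(4) int dummy;   /* alignment >= 4 keeps the two low bits free */
};

static struct base *get_base(uintptr_t tagged)
{
	return (struct base *)(tagged & ~(uintptr_t)FLAG_MASK);
}

static unsigned int get_flags(uintptr_t tagged)
{
	return (unsigned int)(tagged & FLAG_MASK);
}

int main(void)
{
	static struct base b;
	uintptr_t tagged = (uintptr_t)&b | FLAG_IRQSAFE;

	assert(((uintptr_t)&b & FLAG_MASK) == 0);   /* the BUILD_BUG_ON analogue */
	printf("base=%p flags=%#x\n", (void *)get_base(tagged), get_flags(tagged));
	return 0;
}
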
@@ -358,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
358 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); 360 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
359 } else { 361 } else {
360 int i; 362 int i;
361 /* If the timeout is larger than 0xffffffff on 64-bit 363 /* If the timeout is larger than MAX_TVAL (on 64-bit
362 * architectures then we use the maximum timeout: 364 * architectures or with CONFIG_BASE_SMALL=1) then we
365 * use the maximum timeout.
363 */ 366 */
364 if (idx > 0xffffffffUL) { 367 if (idx > MAX_TVAL) {
365 idx = 0xffffffffUL; 368 idx = MAX_TVAL;
366 expires = idx + base->timer_jiffies; 369 expires = idx + base->timer_jiffies;
367 } 370 }
368 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; 371 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
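
With the usual TVR_BITS=8 and TVN_BITS=6 (assumed defaults, not shown in this diff) the new MAX_TVAL evaluates to 2^32-1 jiffies, while CONFIG_BASE_SMALL (TVR_BITS=6, TVN_BITS=4) gives 2^22-1; the clamp in __internal_add_timer() now uses the wheel's real capacity instead of the hard-coded 0xffffffff. A quick check of those values:

/* Evaluate MAX_TVAL for the common TVR_BITS/TVN_BITS choices. */
#include <stdio.h>

static unsigned long long max_tval(int tvr_bits, int tvn_bits)
{
	return (1ULL << (tvr_bits + 4 * tvn_bits)) - 1;
}

int main(void)
{
	printf("default:    %llu\n", max_tval(8, 6));  /* 4294967295 */
	printf("BASE_SMALL: %llu\n", max_tval(6, 4));  /* 4194303 */
	return 0;
}
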
@@ -563,16 +566,14 @@ static inline void debug_timer_assert_init(struct timer_list *timer)
563 debug_object_assert_init(timer, &timer_debug_descr); 566 debug_object_assert_init(timer, &timer_debug_descr);
564} 567}
565 568
566static void __init_timer(struct timer_list *timer, 569static void do_init_timer(struct timer_list *timer, unsigned int flags,
567 const char *name, 570 const char *name, struct lock_class_key *key);
568 struct lock_class_key *key);
569 571
570void init_timer_on_stack_key(struct timer_list *timer, 572void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
571 const char *name, 573 const char *name, struct lock_class_key *key)
572 struct lock_class_key *key)
573{ 574{
574 debug_object_init_on_stack(timer, &timer_debug_descr); 575 debug_object_init_on_stack(timer, &timer_debug_descr);
575 __init_timer(timer, name, key); 576 do_init_timer(timer, flags, name, key);
576} 577}
577EXPORT_SYMBOL_GPL(init_timer_on_stack_key); 578EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
578 579
@@ -613,12 +614,13 @@ static inline void debug_assert_init(struct timer_list *timer)
613 debug_timer_assert_init(timer); 614 debug_timer_assert_init(timer);
614} 615}
615 616
616static void __init_timer(struct timer_list *timer, 617static void do_init_timer(struct timer_list *timer, unsigned int flags,
617 const char *name, 618 const char *name, struct lock_class_key *key)
618 struct lock_class_key *key)
619{ 619{
620 struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
621
620 timer->entry.next = NULL; 622 timer->entry.next = NULL;
621 timer->base = __raw_get_cpu_var(tvec_bases); 623 timer->base = (void *)((unsigned long)base | flags);
622 timer->slack = -1; 624 timer->slack = -1;
623#ifdef CONFIG_TIMER_STATS 625#ifdef CONFIG_TIMER_STATS
624 timer->start_site = NULL; 626 timer->start_site = NULL;
@@ -628,22 +630,10 @@ static void __init_timer(struct timer_list *timer,
628 lockdep_init_map(&timer->lockdep_map, name, key, 0); 630 lockdep_init_map(&timer->lockdep_map, name, key, 0);
629} 631}
630 632
631void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
632 const char *name,
633 struct lock_class_key *key,
634 void (*function)(unsigned long),
635 unsigned long data)
636{
637 timer->function = function;
638 timer->data = data;
639 init_timer_on_stack_key(timer, name, key);
640 timer_set_deferrable(timer);
641}
642EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
643
644/** 633/**
645 * init_timer_key - initialize a timer 634 * init_timer_key - initialize a timer
646 * @timer: the timer to be initialized 635 * @timer: the timer to be initialized
636 * @flags: timer flags
647 * @name: name of the timer 637 * @name: name of the timer
648 * @key: lockdep class key of the fake lock used for tracking timer 638 * @key: lockdep class key of the fake lock used for tracking timer
649 * sync lock dependencies 639 * sync lock dependencies
@@ -651,24 +641,14 @@ EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
651 * init_timer_key() must be done to a timer prior calling *any* of the 641 * init_timer_key() must be done to a timer prior calling *any* of the
652 * other timer functions. 642 * other timer functions.
653 */ 643 */
654void init_timer_key(struct timer_list *timer, 644void init_timer_key(struct timer_list *timer, unsigned int flags,
655 const char *name, 645 const char *name, struct lock_class_key *key)
656 struct lock_class_key *key)
657{ 646{
658 debug_init(timer); 647 debug_init(timer);
659 __init_timer(timer, name, key); 648 do_init_timer(timer, flags, name, key);
660} 649}
661EXPORT_SYMBOL(init_timer_key); 650EXPORT_SYMBOL(init_timer_key);
662 651
663void init_timer_deferrable_key(struct timer_list *timer,
664 const char *name,
665 struct lock_class_key *key)
666{
667 init_timer_key(timer, name, key);
668 timer_set_deferrable(timer);
669}
670EXPORT_SYMBOL(init_timer_deferrable_key);
671
672static inline void detach_timer(struct timer_list *timer, bool clear_pending) 652static inline void detach_timer(struct timer_list *timer, bool clear_pending)
673{ 653{
674 struct list_head *entry = &timer->entry; 654 struct list_head *entry = &timer->entry;
@@ -686,7 +666,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
686{ 666{
687 detach_timer(timer, true); 667 detach_timer(timer, true);
688 if (!tbase_get_deferrable(timer->base)) 668 if (!tbase_get_deferrable(timer->base))
689 timer->base->active_timers--; 669 base->active_timers--;
690} 670}
691 671
692static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, 672static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
@@ -697,7 +677,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
697 677
698 detach_timer(timer, clear_pending); 678 detach_timer(timer, clear_pending);
699 if (!tbase_get_deferrable(timer->base)) { 679 if (!tbase_get_deferrable(timer->base)) {
700 timer->base->active_timers--; 680 base->active_timers--;
701 if (timer->expires == base->next_timer) 681 if (timer->expires == base->next_timer)
702 base->next_timer = base->timer_jiffies; 682 base->next_timer = base->timer_jiffies;
703 } 683 }
@@ -1029,14 +1009,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
1029 * 1009 *
1030 * Synchronization rules: Callers must prevent restarting of the timer, 1010 * Synchronization rules: Callers must prevent restarting of the timer,
1031 * otherwise this function is meaningless. It must not be called from 1011 * otherwise this function is meaningless. It must not be called from
1032 * interrupt contexts. The caller must not hold locks which would prevent 1012 * interrupt contexts unless the timer is an irqsafe one. The caller must
1033 * completion of the timer's handler. The timer's handler must not call 1013 * not hold locks which would prevent completion of the timer's
1034 * add_timer_on(). Upon exit the timer is not queued and the handler is 1014 * handler. The timer's handler must not call add_timer_on(). Upon exit the
1035 * not running on any CPU. 1015 * timer is not queued and the handler is not running on any CPU.
1036 * 1016 *
1037 * Note: You must not hold locks that are held in interrupt context 1017 * Note: For !irqsafe timers, you must not hold locks that are held in
1038 * while calling this function. Even if the lock has nothing to do 1018 * interrupt context while calling this function. Even if the lock has
1039 * with the timer in question. Here's why: 1019 * nothing to do with the timer in question. Here's why:
1040 * 1020 *
1041 * CPU0 CPU1 1021 * CPU0 CPU1
1042 * ---- ---- 1022 * ---- ----
@@ -1073,7 +1053,7 @@ int del_timer_sync(struct timer_list *timer)
1073 * don't use it in hardirq context, because it 1053 * don't use it in hardirq context, because it
1074 * could lead to deadlock. 1054 * could lead to deadlock.
1075 */ 1055 */
1076 WARN_ON(in_irq()); 1056 WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
1077 for (;;) { 1057 for (;;) {
1078 int ret = try_to_del_timer_sync(timer); 1058 int ret = try_to_del_timer_sync(timer);
1079 if (ret >= 0) 1059 if (ret >= 0)
@@ -1180,19 +1160,27 @@ static inline void __run_timers(struct tvec_base *base)
1180 while (!list_empty(head)) { 1160 while (!list_empty(head)) {
1181 void (*fn)(unsigned long); 1161 void (*fn)(unsigned long);
1182 unsigned long data; 1162 unsigned long data;
1163 bool irqsafe;
1183 1164
1184 timer = list_first_entry(head, struct timer_list,entry); 1165 timer = list_first_entry(head, struct timer_list,entry);
1185 fn = timer->function; 1166 fn = timer->function;
1186 data = timer->data; 1167 data = timer->data;
1168 irqsafe = tbase_get_irqsafe(timer->base);
1187 1169
1188 timer_stats_account_timer(timer); 1170 timer_stats_account_timer(timer);
1189 1171
1190 base->running_timer = timer; 1172 base->running_timer = timer;
1191 detach_expired_timer(timer, base); 1173 detach_expired_timer(timer, base);
1192 1174
1193 spin_unlock_irq(&base->lock); 1175 if (irqsafe) {
1194 call_timer_fn(timer, fn, data); 1176 spin_unlock(&base->lock);
1195 spin_lock_irq(&base->lock); 1177 call_timer_fn(timer, fn, data);
1178 spin_lock(&base->lock);
1179 } else {
1180 spin_unlock_irq(&base->lock);
1181 call_timer_fn(timer, fn, data);
1182 spin_lock_irq(&base->lock);
1183 }
1196 } 1184 }
1197 } 1185 }
1198 base->running_timer = NULL; 1186 base->running_timer = NULL;
@@ -1791,9 +1779,13 @@ static struct notifier_block __cpuinitdata timers_nb = {
1791 1779
1792void __init init_timers(void) 1780void __init init_timers(void)
1793{ 1781{
1794 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1782 int err;
1795 (void *)(long)smp_processor_id()); 1783
1784 /* ensure there are enough low bits for flags in timer->base pointer */
1785 BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
1796 1786
1787 err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1788 (void *)(long)smp_processor_id());
1797 init_timer_stats(); 1789 init_timer_stats();
1798 1790
1799 BUG_ON(err != NOTIFY_OK); 1791 BUG_ON(err != NOTIFY_OK);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8c4c07071cc5..5d89335a485f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
49 help 49 help
50 See Documentation/trace/ftrace-design.txt 50 See Documentation/trace/ftrace-design.txt
51 51
52config HAVE_FENTRY
53 bool
54 help
55 Arch supports the gcc options -pg with -mfentry
56
52config HAVE_C_RECORDMCOUNT 57config HAVE_C_RECORDMCOUNT
53 bool 58 bool
54 help 59 help
@@ -57,8 +62,12 @@ config HAVE_C_RECORDMCOUNT
57config TRACER_MAX_TRACE 62config TRACER_MAX_TRACE
58 bool 63 bool
59 64
65config TRACE_CLOCK
66 bool
67
60config RING_BUFFER 68config RING_BUFFER
61 bool 69 bool
70 select TRACE_CLOCK
62 71
63config FTRACE_NMI_ENTER 72config FTRACE_NMI_ENTER
64 bool 73 bool
@@ -109,6 +118,8 @@ config TRACING
109 select NOP_TRACER 118 select NOP_TRACER
110 select BINARY_PRINTF 119 select BINARY_PRINTF
111 select EVENT_TRACING 120 select EVENT_TRACING
121 select TRACE_CLOCK
122 select IRQ_WORK
112 123
113config GENERIC_TRACER 124config GENERIC_TRACER
114 bool 125 bool
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index b831087c8200..d7e2068e4b71 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -5,10 +5,12 @@ ifdef CONFIG_FUNCTION_TRACER
5ORIG_CFLAGS := $(KBUILD_CFLAGS) 5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) 6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
7 7
8ifdef CONFIG_FTRACE_SELFTEST
8# selftest needs instrumentation 9# selftest needs instrumentation
9CFLAGS_trace_selftest_dynamic.o = -pg 10CFLAGS_trace_selftest_dynamic.o = -pg
10obj-y += trace_selftest_dynamic.o 11obj-y += trace_selftest_dynamic.o
11endif 12endif
13endif
12 14
13# If unlikely tracing is enabled, do not trace these files 15# If unlikely tracing is enabled, do not trace these files
14ifdef CONFIG_TRACING_BRANCHES 16ifdef CONFIG_TRACING_BRANCHES
@@ -17,11 +19,7 @@ endif
17 19
18CFLAGS_trace_events_filter.o := -I$(src) 20CFLAGS_trace_events_filter.o := -I$(src)
19 21
20# 22obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o
21# Make the trace clocks available generally: it's infrastructure
22# relied on by ptrace for example:
23#
24obj-y += trace_clock.o
25 23
26obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 24obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
27obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 25obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b4f20fba09fc..3ffe4c5ad3f3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -10,7 +10,7 @@
10 * Based on code in the latency_tracer, that is: 10 * Based on code in the latency_tracer, that is:
11 * 11 *
12 * Copyright (C) 2004-2006 Ingo Molnar 12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 William Lee Irwin III 13 * Copyright (C) 2004 Nadia Yvette Chambers
14 */ 14 */
15 15
16#include <linux/stop_machine.h> 16#include <linux/stop_machine.h>
@@ -64,12 +64,20 @@
64 64
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) 65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
66 66
67static struct ftrace_ops ftrace_list_end __read_mostly = {
68 .func = ftrace_stub,
69 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
70};
71
67/* ftrace_enabled is a method to turn ftrace on or off */ 72/* ftrace_enabled is a method to turn ftrace on or off */
68int ftrace_enabled __read_mostly; 73int ftrace_enabled __read_mostly;
69static int last_ftrace_enabled; 74static int last_ftrace_enabled;
70 75
71/* Quick disabling of function tracer. */ 76/* Quick disabling of function tracer. */
72int function_trace_stop; 77int function_trace_stop __read_mostly;
78
79/* Current function tracing op */
80struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
73 81
74/* List for set_ftrace_pid's pids. */ 82/* List for set_ftrace_pid's pids. */
75LIST_HEAD(ftrace_pids); 83LIST_HEAD(ftrace_pids);
@@ -86,22 +94,43 @@ static int ftrace_disabled __read_mostly;
86 94
87static DEFINE_MUTEX(ftrace_lock); 95static DEFINE_MUTEX(ftrace_lock);
88 96
89static struct ftrace_ops ftrace_list_end __read_mostly = {
90 .func = ftrace_stub,
91};
92
93static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 97static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
94static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; 98static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
95static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 99static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
96ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 100ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
97static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
98ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
99ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 101ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
100static struct ftrace_ops global_ops; 102static struct ftrace_ops global_ops;
101static struct ftrace_ops control_ops; 103static struct ftrace_ops control_ops;
102 104
103static void 105#if ARCH_SUPPORTS_FTRACE_OPS
104ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); 106static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
107 struct ftrace_ops *op, struct pt_regs *regs);
108#else
109/* See comment below, where ftrace_ops_list_func is defined */
110static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
112#endif
113
114/**
115 * ftrace_nr_registered_ops - return number of ops registered
116 *
117 * Returns the number of ftrace_ops registered and tracing functions
118 */
119int ftrace_nr_registered_ops(void)
120{
121 struct ftrace_ops *ops;
122 int cnt = 0;
123
124 mutex_lock(&ftrace_lock);
125
126 for (ops = ftrace_ops_list;
127 ops != &ftrace_list_end; ops = ops->next)
128 cnt++;
129
130 mutex_unlock(&ftrace_lock);
131
132 return cnt;
133}
105 134
106/* 135/*
107 * Traverse the ftrace_global_list, invoking all entries. The reason that we 136 * Traverse the ftrace_global_list, invoking all entries. The reason that we
@@ -112,29 +141,29 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
112 * 141 *
113 * Silly Alpha and silly pointer-speculation compiler optimizations! 142 * Silly Alpha and silly pointer-speculation compiler optimizations!
114 */ 143 */
115static void ftrace_global_list_func(unsigned long ip, 144static void
116 unsigned long parent_ip) 145ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
146 struct ftrace_ops *op, struct pt_regs *regs)
117{ 147{
118 struct ftrace_ops *op;
119
120 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) 148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
121 return; 149 return;
122 150
123 trace_recursion_set(TRACE_GLOBAL_BIT); 151 trace_recursion_set(TRACE_GLOBAL_BIT);
124 op = rcu_dereference_raw(ftrace_global_list); /*see above*/ 152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
125 while (op != &ftrace_list_end) { 153 while (op != &ftrace_list_end) {
126 op->func(ip, parent_ip); 154 op->func(ip, parent_ip, op, regs);
127 op = rcu_dereference_raw(op->next); /*see above*/ 155 op = rcu_dereference_raw(op->next); /*see above*/
128 }; 156 };
129 trace_recursion_clear(TRACE_GLOBAL_BIT); 157 trace_recursion_clear(TRACE_GLOBAL_BIT);
130} 158}
131 159
132static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) 160static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
161 struct ftrace_ops *op, struct pt_regs *regs)
133{ 162{
134 if (!test_tsk_trace_trace(current)) 163 if (!test_tsk_trace_trace(current))
135 return; 164 return;
136 165
137 ftrace_pid_function(ip, parent_ip); 166 ftrace_pid_function(ip, parent_ip, op, regs);
138} 167}
139 168
140static void set_ftrace_pid_function(ftrace_func_t func) 169static void set_ftrace_pid_function(ftrace_func_t func)
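
ftrace keeps its ops list terminated by the ftrace_list_end sentinel rather than NULL, so a walker such as ftrace_global_list_func() or the new ftrace_nr_registered_ops() always stops at a well-defined entry whose func is a stub. A userspace sketch of that sentinel-terminated list pattern; all names are invented and locking is omitted:

/* Sentinel-terminated ops list: the list never ends in NULL, it ends in
 * a stub entry, so a walker always finds a callable func. */
#include <stdio.h>

struct ops {
	void (*func)(unsigned long ip);
	struct ops *next;
};

static void stub(unsigned long ip) { (void)ip; }

static struct ops list_end = { .func = stub, .next = &list_end };
static struct ops *ops_list = &list_end;

static void register_op(struct ops *op)
{
	op->next = ops_list;
	ops_list = op;
}

static int nr_registered(void)
{
	int cnt = 0;

	for (struct ops *op = ops_list; op != &list_end; op = op->next)
		cnt++;
	return cnt;
}

static void trace(unsigned long ip) { printf("traced %#lx\n", ip); }

int main(void)
{
	struct ops my_op = { .func = trace };

	register_op(&my_op);
	printf("%d op(s) registered\n", nr_registered());
	for (struct ops *op = ops_list; op != &list_end; op = op->next)
		op->func(0x1234);
	return 0;
}
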
@@ -153,25 +182,9 @@ static void set_ftrace_pid_function(ftrace_func_t func)
153void clear_ftrace_function(void) 182void clear_ftrace_function(void)
154{ 183{
155 ftrace_trace_function = ftrace_stub; 184 ftrace_trace_function = ftrace_stub;
156 __ftrace_trace_function = ftrace_stub;
157 __ftrace_trace_function_delay = ftrace_stub;
158 ftrace_pid_function = ftrace_stub; 185 ftrace_pid_function = ftrace_stub;
159} 186}
160 187
161#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
162/*
163 * For those archs that do not test ftrace_trace_stop in their
164 * mcount call site, we need to do it from C.
165 */
166static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
167{
168 if (function_trace_stop)
169 return;
170
171 __ftrace_trace_function(ip, parent_ip);
172}
173#endif
174
175static void control_ops_disable_all(struct ftrace_ops *ops) 188static void control_ops_disable_all(struct ftrace_ops *ops)
176{ 189{
177 int cpu; 190 int cpu;
@@ -230,28 +243,27 @@ static void update_ftrace_function(void)
230 243
231 /* 244 /*
232 * If we are at the end of the list and this ops is 245 * If we are at the end of the list and this ops is
233 * not dynamic, then have the mcount trampoline call 246 * recursion safe and not dynamic and the arch supports passing ops,
234 * the function directly 247 * then have the mcount trampoline call the function directly.
235 */ 248 */
236 if (ftrace_ops_list == &ftrace_list_end || 249 if (ftrace_ops_list == &ftrace_list_end ||
237 (ftrace_ops_list->next == &ftrace_list_end && 250 (ftrace_ops_list->next == &ftrace_list_end &&
238 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) 251 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) &&
252 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
253 !FTRACE_FORCE_LIST_FUNC)) {
254 /* Set the ftrace_ops that the arch callback uses */
255 if (ftrace_ops_list == &global_ops)
256 function_trace_op = ftrace_global_list;
257 else
258 function_trace_op = ftrace_ops_list;
239 func = ftrace_ops_list->func; 259 func = ftrace_ops_list->func;
240 else 260 } else {
261 /* Just use the default ftrace_ops */
262 function_trace_op = &ftrace_list_end;
241 func = ftrace_ops_list_func; 263 func = ftrace_ops_list_func;
264 }
242 265
243#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
244 ftrace_trace_function = func; 266 ftrace_trace_function = func;
245#else
246#ifdef CONFIG_DYNAMIC_FTRACE
247 /* do not update till all functions have been modified */
248 __ftrace_trace_function_delay = func;
249#else
250 __ftrace_trace_function = func;
251#endif
252 ftrace_trace_function =
253 (func == ftrace_stub) ? func : ftrace_test_stop_func;
254#endif
255} 267}
256 268
257static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 269static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
@@ -325,6 +337,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
325 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) 337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
326 return -EINVAL; 338 return -EINVAL;
327 339
340#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS
341 /*
342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
344 * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant.
345 */
346 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS &&
347 !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED))
348 return -EINVAL;
349
350 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)
351 ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
352#endif
353
328 if (!core_kernel_data((unsigned long)ops)) 354 if (!core_kernel_data((unsigned long)ops))
329 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 355 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
330 356
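With this check in place, a tracer that wants pt_regs but still has to load on architectures that cannot save a full register set would set both SAVE_REGS flags. A minimal sketch of such a registration under the new four-argument callback prototype (all names here are hypothetical, not part of this patch):

#include <linux/ftrace.h>
#include <linux/module.h>

static void notrace my_trace_callback(unsigned long ip, unsigned long parent_ip,
                                      struct ftrace_ops *op, struct pt_regs *regs)
{
        /*
         * regs is only a full pt_regs on archs that define
         * ARCH_SUPPORTS_FTRACE_SAVE_REGS; elsewhere it may be NULL.
         */
}

static struct ftrace_ops my_trace_ops = {
        .func   = my_trace_callback,
        /*
         * SAVE_REGS alone is rejected with -EINVAL on an arch that cannot
         * save regs; adding SAVE_REGS_IF_SUPPORTED lets the same ops
         * register there and simply not receive registers.
         */
        .flags  = FTRACE_OPS_FL_SAVE_REGS |
                  FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED |
                  FTRACE_OPS_FL_RECURSION_SAFE,
};

static int __init my_trace_init(void)
{
        /* without a filter this hooks every traceable function */
        return register_ftrace_function(&my_trace_ops);
}

static void __exit my_trace_exit(void)
{
        unregister_ftrace_function(&my_trace_ops);
}

module_init(my_trace_init);
module_exit(my_trace_exit);
MODULE_LICENSE("GPL");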
@@ -773,7 +799,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
773} 799}
774 800
775static void 801static void
776function_profile_call(unsigned long ip, unsigned long parent_ip) 802function_profile_call(unsigned long ip, unsigned long parent_ip,
803 struct ftrace_ops *ops, struct pt_regs *regs)
777{ 804{
778 struct ftrace_profile_stat *stat; 805 struct ftrace_profile_stat *stat;
779 struct ftrace_profile *rec; 806 struct ftrace_profile *rec;
@@ -803,7 +830,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
803#ifdef CONFIG_FUNCTION_GRAPH_TRACER 830#ifdef CONFIG_FUNCTION_GRAPH_TRACER
804static int profile_graph_entry(struct ftrace_graph_ent *trace) 831static int profile_graph_entry(struct ftrace_graph_ent *trace)
805{ 832{
806 function_profile_call(trace->func, 0); 833 function_profile_call(trace->func, 0, NULL, NULL);
807 return 1; 834 return 1;
808} 835}
809 836
@@ -863,6 +890,7 @@ static void unregister_ftrace_profiler(void)
863#else 890#else
864static struct ftrace_ops ftrace_profile_ops __read_mostly = { 891static struct ftrace_ops ftrace_profile_ops __read_mostly = {
865 .func = function_profile_call, 892 .func = function_profile_call,
893 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
866}; 894};
867 895
868static int register_ftrace_profiler(void) 896static int register_ftrace_profiler(void)
@@ -1045,6 +1073,7 @@ static struct ftrace_ops global_ops = {
1045 .func = ftrace_stub, 1073 .func = ftrace_stub,
1046 .notrace_hash = EMPTY_HASH, 1074 .notrace_hash = EMPTY_HASH,
1047 .filter_hash = EMPTY_HASH, 1075 .filter_hash = EMPTY_HASH,
1076 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
1048}; 1077};
1049 1078
1050static DEFINE_MUTEX(ftrace_regex_lock); 1079static DEFINE_MUTEX(ftrace_regex_lock);
@@ -1525,6 +1554,12 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1525 rec->flags++; 1554 rec->flags++;
1526 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) 1555 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1527 return; 1556 return;
1557 /*
1558 * If any ops wants regs saved for this function
1559 * then all ops will get saved regs.
1560 */
1561 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
1562 rec->flags |= FTRACE_FL_REGS;
1528 } else { 1563 } else {
1529 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) 1564 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1530 return; 1565 return;
@@ -1616,18 +1651,59 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1616 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1651 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1617 flag = FTRACE_FL_ENABLED; 1652 flag = FTRACE_FL_ENABLED;
1618 1653
1654 /*
1655 * If enabling and the REGS flag does not match the REGS_EN, then
1656 * do not ignore this record. Set flags to fail the compare against
1657 * ENABLED.
1658 */
1659 if (flag &&
1660 (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
1661 flag |= FTRACE_FL_REGS;
1662
1619 /* If the state of this record hasn't changed, then do nothing */ 1663 /* If the state of this record hasn't changed, then do nothing */
1620 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1664 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1621 return FTRACE_UPDATE_IGNORE; 1665 return FTRACE_UPDATE_IGNORE;
1622 1666
1623 if (flag) { 1667 if (flag) {
1624 if (update) 1668 /* Save off if rec is being enabled (for return value) */
1669 flag ^= rec->flags & FTRACE_FL_ENABLED;
1670
1671 if (update) {
1625 rec->flags |= FTRACE_FL_ENABLED; 1672 rec->flags |= FTRACE_FL_ENABLED;
1626 return FTRACE_UPDATE_MAKE_CALL; 1673 if (flag & FTRACE_FL_REGS) {
1674 if (rec->flags & FTRACE_FL_REGS)
1675 rec->flags |= FTRACE_FL_REGS_EN;
1676 else
1677 rec->flags &= ~FTRACE_FL_REGS_EN;
1678 }
1679 }
1680
1681 /*
1682 * If this record is being updated from a nop, then
1683 * return UPDATE_MAKE_CALL.
1684 * Otherwise, if the EN flag is set, then return
1685 * UPDATE_MODIFY_CALL_REGS to tell the caller to convert
1686 * from the non-save regs, to a save regs function.
1687 * Otherwise,
1688 * return UPDATE_MODIFY_CALL to tell the caller to convert
1689 * from the save regs, to a non-save regs function.
1690 */
1691 if (flag & FTRACE_FL_ENABLED)
1692 return FTRACE_UPDATE_MAKE_CALL;
1693 else if (rec->flags & FTRACE_FL_REGS_EN)
1694 return FTRACE_UPDATE_MODIFY_CALL_REGS;
1695 else
1696 return FTRACE_UPDATE_MODIFY_CALL;
1627 } 1697 }
1628 1698
1629 if (update) 1699 if (update) {
1630 rec->flags &= ~FTRACE_FL_ENABLED; 1700 /* If there's no more users, clear all flags */
1701 if (!(rec->flags & ~FTRACE_FL_MASK))
1702 rec->flags = 0;
1703 else
1704 /* Just disable the record (keep REGS state) */
1705 rec->flags &= ~FTRACE_FL_ENABLED;
1706 }
1631 1707
1632 return FTRACE_UPDATE_MAKE_NOP; 1708 return FTRACE_UPDATE_MAKE_NOP;
1633} 1709}
@@ -1662,13 +1738,17 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1662static int 1738static int
1663__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1739__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1664{ 1740{
1741 unsigned long ftrace_old_addr;
1665 unsigned long ftrace_addr; 1742 unsigned long ftrace_addr;
1666 int ret; 1743 int ret;
1667 1744
1668 ftrace_addr = (unsigned long)FTRACE_ADDR;
1669
1670 ret = ftrace_update_record(rec, enable); 1745 ret = ftrace_update_record(rec, enable);
1671 1746
1747 if (rec->flags & FTRACE_FL_REGS)
1748 ftrace_addr = (unsigned long)FTRACE_REGS_ADDR;
1749 else
1750 ftrace_addr = (unsigned long)FTRACE_ADDR;
1751
1672 switch (ret) { 1752 switch (ret) {
1673 case FTRACE_UPDATE_IGNORE: 1753 case FTRACE_UPDATE_IGNORE:
1674 return 0; 1754 return 0;
@@ -1678,6 +1758,15 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1678 1758
1679 case FTRACE_UPDATE_MAKE_NOP: 1759 case FTRACE_UPDATE_MAKE_NOP:
1680 return ftrace_make_nop(NULL, rec, ftrace_addr); 1760 return ftrace_make_nop(NULL, rec, ftrace_addr);
1761
1762 case FTRACE_UPDATE_MODIFY_CALL_REGS:
1763 case FTRACE_UPDATE_MODIFY_CALL:
1764 if (rec->flags & FTRACE_FL_REGS)
1765 ftrace_old_addr = (unsigned long)FTRACE_ADDR;
1766 else
1767 ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
1768
1769 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
1681 } 1770 }
1682 1771
1683 return -1; /* unknown ftrace bug */ 1772
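Taken together, ftrace_check_record() and __ftrace_replace_code() above implement the following per-record transitions (a summary of the new logic, not additional code from the patch):

  nop -> traced:                          FTRACE_UPDATE_MAKE_CALL
                                          (destination FTRACE_ADDR, or FTRACE_REGS_ADDR if FTRACE_FL_REGS is set)
  traced without regs -> traced w/ regs:  FTRACE_UPDATE_MODIFY_CALL_REGS  (FTRACE_ADDR -> FTRACE_REGS_ADDR)
  traced with regs -> traced w/o regs:    FTRACE_UPDATE_MODIFY_CALL       (FTRACE_REGS_ADDR -> FTRACE_ADDR)
  traced -> not traced:                   FTRACE_UPDATE_MAKE_NOP
  no change:                              FTRACE_UPDATE_IGNORE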
@@ -1882,16 +1971,6 @@ static void ftrace_run_update_code(int command)
1882 */ 1971 */
1883 arch_ftrace_update_code(command); 1972 arch_ftrace_update_code(command);
1884 1973
1885#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1886 /*
1887 * For archs that call ftrace_test_stop_func(), we must
1888 * wait till after we update all the function callers
1889 * before we update the callback. This keeps different
1890 * ops that record different functions from corrupting
1891 * each other.
1892 */
1893 __ftrace_trace_function = __ftrace_trace_function_delay;
1894#endif
1895 function_trace_stop--; 1974 function_trace_stop--;
1896 1975
1897 ret = ftrace_arch_code_modify_post_process(); 1976 ret = ftrace_arch_code_modify_post_process();
@@ -2358,7 +2437,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
2358{ 2437{
2359 iter->pos = 0; 2438 iter->pos = 0;
2360 iter->func_pos = 0; 2439 iter->func_pos = 0;
2361 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); 2440 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
2362} 2441}
2363 2442
2364static void *t_start(struct seq_file *m, loff_t *pos) 2443static void *t_start(struct seq_file *m, loff_t *pos)
@@ -2441,8 +2520,9 @@ static int t_show(struct seq_file *m, void *v)
2441 2520
2442 seq_printf(m, "%ps", (void *)rec->ip); 2521 seq_printf(m, "%ps", (void *)rec->ip);
2443 if (iter->flags & FTRACE_ITER_ENABLED) 2522 if (iter->flags & FTRACE_ITER_ENABLED)
2444 seq_printf(m, " (%ld)", 2523 seq_printf(m, " (%ld)%s",
2445 rec->flags & ~FTRACE_FL_MASK); 2524 rec->flags & ~FTRACE_FL_MASK,
2525 rec->flags & FTRACE_FL_REGS ? " R" : "");
2446 seq_printf(m, "\n"); 2526 seq_printf(m, "\n");
2447 2527
2448 return 0; 2528 return 0;
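With the extra "%s" above, the enabled_functions listing now tags call sites that were patched to the register-saving trampoline; hypothetical output (function names and counts are made up):

  schedule (1)
  do_fork (2) R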
@@ -2595,12 +2675,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2595} 2675}
2596 2676
2597loff_t 2677loff_t
2598ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 2678ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
2599{ 2679{
2600 loff_t ret; 2680 loff_t ret;
2601 2681
2602 if (file->f_mode & FMODE_READ) 2682 if (file->f_mode & FMODE_READ)
2603 ret = seq_lseek(file, offset, origin); 2683 ret = seq_lseek(file, offset, whence);
2604 else 2684 else
2605 file->f_pos = ret = 1; 2685 file->f_pos = ret = 1;
2606 2686
@@ -2788,10 +2868,10 @@ static int __init ftrace_mod_cmd_init(void)
2788{ 2868{
2789 return register_ftrace_command(&ftrace_mod_cmd); 2869 return register_ftrace_command(&ftrace_mod_cmd);
2790} 2870}
2791device_initcall(ftrace_mod_cmd_init); 2871core_initcall(ftrace_mod_cmd_init);
2792 2872
2793static void 2873static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2794function_trace_probe_call(unsigned long ip, unsigned long parent_ip) 2874 struct ftrace_ops *op, struct pt_regs *pt_regs)
2795{ 2875{
2796 struct ftrace_func_probe *entry; 2876 struct ftrace_func_probe *entry;
2797 struct hlist_head *hhd; 2877 struct hlist_head *hhd;
@@ -3162,8 +3242,27 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
3162} 3242}
3163 3243
3164static int 3244static int
3165ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, 3245ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3166 int reset, int enable) 3246{
3247 struct ftrace_func_entry *entry;
3248
3249 if (!ftrace_location(ip))
3250 return -EINVAL;
3251
3252 if (remove) {
3253 entry = ftrace_lookup_ip(hash, ip);
3254 if (!entry)
3255 return -ENOENT;
3256 free_hash_entry(hash, entry);
3257 return 0;
3258 }
3259
3260 return add_hash_entry(hash, ip);
3261}
3262
3263static int
3264ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3265 unsigned long ip, int remove, int reset, int enable)
3167{ 3266{
3168 struct ftrace_hash **orig_hash; 3267 struct ftrace_hash **orig_hash;
3169 struct ftrace_hash *hash; 3268 struct ftrace_hash *hash;
@@ -3192,6 +3291,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3192 ret = -EINVAL; 3291 ret = -EINVAL;
3193 goto out_regex_unlock; 3292 goto out_regex_unlock;
3194 } 3293 }
3294 if (ip) {
3295 ret = ftrace_match_addr(hash, ip, remove);
3296 if (ret < 0)
3297 goto out_regex_unlock;
3298 }
3195 3299
3196 mutex_lock(&ftrace_lock); 3300 mutex_lock(&ftrace_lock);
3197 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3301 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
@@ -3208,6 +3312,37 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3208 return ret; 3312 return ret;
3209} 3313}
3210 3314
3315static int
3316ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
3317 int reset, int enable)
3318{
3319 return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable);
3320}
3321
3322/**
3323 * ftrace_set_filter_ip - set a function to filter on in ftrace by address
3324 * @ops - the ops to set the filter with
3325 * @ip - the address to add to or remove from the filter.
3326 * @remove - non zero to remove the ip from the filter
3327 * @reset - non zero to reset all filters before applying this filter.
3328 *
3329 * Filters denote which functions should be enabled when tracing is enabled.
3330 * If @ip is NULL, it fails to update the filter.
3331 */
3332int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
3333 int remove, int reset)
3334{
3335 return ftrace_set_addr(ops, ip, remove, reset, 1);
3336}
3337EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
3338
3339static int
3340ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3341 int reset, int enable)
3342{
3343 return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
3344}
3345
3211/** 3346/**
3212 * ftrace_set_filter - set a function to filter on in ftrace 3347 * ftrace_set_filter - set a function to filter on in ftrace
3213 * @ops - the ops to set the filter with 3348 * @ops - the ops to set the filter with
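A sketch of how the new interface might be used to trace exactly one function by address; the callback, ops, and symbol name are illustrative, and kallsyms_lookup_name() is assumed to be available (CONFIG_KALLSYMS):

#include <linux/ftrace.h>
#include <linux/kallsyms.h>
#include <linux/errno.h>

static void notrace my_filter_callback(unsigned long ip, unsigned long parent_ip,
                                       struct ftrace_ops *op, struct pt_regs *regs)
{
        /* hit only for the single address installed below */
}

static struct ftrace_ops my_filter_ops = {
        .func   = my_filter_callback,
        .flags  = FTRACE_OPS_FL_RECURSION_SAFE,
};

static int trace_one_function(void)
{
        unsigned long addr = kallsyms_lookup_name("do_fork");
        int ret;

        if (!addr)
                return -ENOENT;

        /* remove == 0, reset == 1: start from an empty filter, then add addr */
        ret = ftrace_set_filter_ip(&my_filter_ops, addr, 0, 1);
        if (ret)
                return ret;

        return register_ftrace_function(&my_filter_ops);
}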
@@ -3912,6 +4047,7 @@ void __init ftrace_init(void)
3912 4047
3913static struct ftrace_ops global_ops = { 4048static struct ftrace_ops global_ops = {
3914 .func = ftrace_stub, 4049 .func = ftrace_stub,
4050 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
3915}; 4051};
3916 4052
3917static int __init ftrace_nodyn_init(void) 4053static int __init ftrace_nodyn_init(void)
@@ -3919,7 +4055,7 @@ static int __init ftrace_nodyn_init(void)
3919 ftrace_enabled = 1; 4055 ftrace_enabled = 1;
3920 return 0; 4056 return 0;
3921} 4057}
3922device_initcall(ftrace_nodyn_init); 4058core_initcall(ftrace_nodyn_init);
3923 4059
3924static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4060static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
3925static inline void ftrace_startup_enable(int command) { } 4061static inline void ftrace_startup_enable(int command) { }
@@ -3942,10 +4078,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3942#endif /* CONFIG_DYNAMIC_FTRACE */ 4078#endif /* CONFIG_DYNAMIC_FTRACE */
3943 4079
3944static void 4080static void
3945ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) 4081ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4082 struct ftrace_ops *op, struct pt_regs *regs)
3946{ 4083{
3947 struct ftrace_ops *op;
3948
3949 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) 4084 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
3950 return; 4085 return;
3951 4086
@@ -3959,7 +4094,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
3959 while (op != &ftrace_list_end) { 4094 while (op != &ftrace_list_end) {
3960 if (!ftrace_function_local_disabled(op) && 4095 if (!ftrace_function_local_disabled(op) &&
3961 ftrace_ops_test(op, ip)) 4096 ftrace_ops_test(op, ip))
3962 op->func(ip, parent_ip); 4097 op->func(ip, parent_ip, op, regs);
3963 4098
3964 op = rcu_dereference_raw(op->next); 4099 op = rcu_dereference_raw(op->next);
3965 }; 4100 };
@@ -3969,13 +4104,18 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
3969 4104
3970static struct ftrace_ops control_ops = { 4105static struct ftrace_ops control_ops = {
3971 .func = ftrace_ops_control_func, 4106 .func = ftrace_ops_control_func,
4107 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
3972}; 4108};
3973 4109
3974static void 4110static inline void
3975ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) 4111__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4112 struct ftrace_ops *ignored, struct pt_regs *regs)
3976{ 4113{
3977 struct ftrace_ops *op; 4114 struct ftrace_ops *op;
3978 4115
4116 if (function_trace_stop)
4117 return;
4118
3979 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) 4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
3980 return; 4120 return;
3981 4121
@@ -3988,13 +4128,39 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3988 op = rcu_dereference_raw(ftrace_ops_list); 4128 op = rcu_dereference_raw(ftrace_ops_list);
3989 while (op != &ftrace_list_end) { 4129 while (op != &ftrace_list_end) {
3990 if (ftrace_ops_test(op, ip)) 4130 if (ftrace_ops_test(op, ip))
3991 op->func(ip, parent_ip); 4131 op->func(ip, parent_ip, op, regs);
3992 op = rcu_dereference_raw(op->next); 4132 op = rcu_dereference_raw(op->next);
3993 }; 4133 };
3994 preempt_enable_notrace(); 4134 preempt_enable_notrace();
3995 trace_recursion_clear(TRACE_INTERNAL_BIT); 4135 trace_recursion_clear(TRACE_INTERNAL_BIT);
3996} 4136}
3997 4137
4138/*
4139 * Some archs only support passing ip and parent_ip. Even though
4140 * the list function ignores the op parameter, we do not want any
4141 * C side effects, where a function is called without the caller
4142 * sending a third parameter.
 4143 * Archs are expected to add support for regs and ftrace_ops at the same time:
 4144 * if they support ftrace_ops, it is assumed they support regs.
 4145 * If callbacks want to use regs, they must either check for regs
 4146 * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS.
 4147 * Note, ARCH_SUPPORTS_FTRACE_SAVE_REGS expects a full regs to be saved.
 4148 * An architecture can pass partial regs with ftrace_ops and still
 4149 * set ARCH_SUPPORTS_FTRACE_OPS.
4150 */
4151#if ARCH_SUPPORTS_FTRACE_OPS
4152static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4153 struct ftrace_ops *op, struct pt_regs *regs)
4154{
4155 __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
4156}
4157#else
4158static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
4159{
4160 __ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
4161}
4162#endif
4163
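The defensive pattern the comment above asks of callbacks, in isolation (hypothetical code): a callback that wants registers either depends on ARCH_SUPPORTS_FTRACE_SAVE_REGS at build time or tolerates a missing pt_regs at run time.

#include <linux/ftrace.h>
#include <linux/ptrace.h>
#include <linux/printk.h>

static void notrace regs_aware_callback(unsigned long ip, unsigned long parent_ip,
                                        struct ftrace_ops *op, struct pt_regs *regs)
{
        /* without ARCH_SUPPORTS_FTRACE_SAVE_REGS, regs may be NULL or partial */
        if (!regs)
                return;

        pr_debug("pc=%lx\n", instruction_pointer(regs));
}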
3998static void clear_ftrace_swapper(void) 4164static void clear_ftrace_swapper(void)
3999{ 4165{
4000 struct task_struct *p; 4166 struct task_struct *p;
@@ -4215,7 +4381,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
4215 if (strlen(tmp) == 0) 4381 if (strlen(tmp) == 0)
4216 return 1; 4382 return 1;
4217 4383
4218 ret = strict_strtol(tmp, 10, &val); 4384 ret = kstrtol(tmp, 10, &val);
4219 if (ret < 0) 4385 if (ret < 0)
4220 return ret; 4386 return ret;
4221 4387
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 49491fa7daa2..ce8514feedcd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -460,9 +460,10 @@ struct ring_buffer_per_cpu {
460 unsigned long lost_events; 460 unsigned long lost_events;
461 unsigned long last_overrun; 461 unsigned long last_overrun;
462 local_t entries_bytes; 462 local_t entries_bytes;
463 local_t commit_overrun;
464 local_t overrun;
465 local_t entries; 463 local_t entries;
464 local_t overrun;
465 local_t commit_overrun;
466 local_t dropped_events;
466 local_t committing; 467 local_t committing;
467 local_t commits; 468 local_t commits;
468 unsigned long read; 469 unsigned long read;
@@ -1396,6 +1397,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
1396 struct list_head *head_page_with_bit; 1397 struct list_head *head_page_with_bit;
1397 1398
1398 head_page = &rb_set_head_page(cpu_buffer)->list; 1399 head_page = &rb_set_head_page(cpu_buffer)->list;
1400 if (!head_page)
1401 break;
1399 prev_page = head_page->prev; 1402 prev_page = head_page->prev;
1400 1403
1401 first_page = pages->next; 1404 first_page = pages->next;
@@ -1567,6 +1570,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1567 1570
1568 put_online_cpus(); 1571 put_online_cpus();
1569 } else { 1572 } else {
1573 /* Make sure this CPU has been initialized */
1574 if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
1575 goto out;
1576
1570 cpu_buffer = buffer->buffers[cpu_id]; 1577 cpu_buffer = buffer->buffers[cpu_id];
1571 1578
1572 if (nr_pages == cpu_buffer->nr_pages) 1579 if (nr_pages == cpu_buffer->nr_pages)
@@ -1816,7 +1823,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1816} 1823}
1817 1824
1818/** 1825/**
1819 * ring_buffer_update_event - update event type and data 1826 * rb_update_event - update event type and data
1820 * @event: the event to update 1827
1821 * @type: the type of event 1828 * @type: the type of event
1822 * @length: the size of the event field in the ring buffer 1829 * @length: the size of the event field in the ring buffer
@@ -2151,8 +2158,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2151 * If we are not in overwrite mode, 2158 * If we are not in overwrite mode,
2152 * this is easy, just stop here. 2159 * this is easy, just stop here.
2153 */ 2160 */
2154 if (!(buffer->flags & RB_FL_OVERWRITE)) 2161 if (!(buffer->flags & RB_FL_OVERWRITE)) {
2162 local_inc(&cpu_buffer->dropped_events);
2155 goto out_reset; 2163 goto out_reset;
2164 }
2156 2165
2157 ret = rb_handle_head_page(cpu_buffer, 2166 ret = rb_handle_head_page(cpu_buffer,
2158 tail_page, 2167 tail_page,
@@ -2716,8 +2725,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
2716 * and not the length of the event which would hold the header. 2725 * and not the length of the event which would hold the header.
2717 */ 2726 */
2718int ring_buffer_write(struct ring_buffer *buffer, 2727int ring_buffer_write(struct ring_buffer *buffer,
2719 unsigned long length, 2728 unsigned long length,
2720 void *data) 2729 void *data)
2721{ 2730{
2722 struct ring_buffer_per_cpu *cpu_buffer; 2731 struct ring_buffer_per_cpu *cpu_buffer;
2723 struct ring_buffer_event *event; 2732 struct ring_buffer_event *event;
@@ -2816,7 +2825,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
2816 * to the buffer after this will fail and return NULL. 2825 * to the buffer after this will fail and return NULL.
2817 * 2826 *
2818 * This is different than ring_buffer_record_disable() as 2827 * This is different than ring_buffer_record_disable() as
2819 * it works like an on/off switch, where as the disable() verison 2828 * it works like an on/off switch, where as the disable() version
2820 * must be paired with a enable(). 2829 * must be paired with a enable().
2821 */ 2830 */
2822void ring_buffer_record_off(struct ring_buffer *buffer) 2831void ring_buffer_record_off(struct ring_buffer *buffer)
@@ -2839,7 +2848,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off);
2839 * ring_buffer_record_off(). 2848 * ring_buffer_record_off().
2840 * 2849 *
2841 * This is different than ring_buffer_record_enable() as 2850 * This is different than ring_buffer_record_enable() as
2842 * it works like an on/off switch, where as the enable() verison 2851 * it works like an on/off switch, where as the enable() version
2843 * must be paired with a disable(). 2852 * must be paired with a disable().
2844 */ 2853 */
2845void ring_buffer_record_on(struct ring_buffer *buffer) 2854void ring_buffer_record_on(struct ring_buffer *buffer)
@@ -2925,12 +2934,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2925 * @buffer: The ring buffer 2934 * @buffer: The ring buffer
2926 * @cpu: The per CPU buffer to read from. 2935 * @cpu: The per CPU buffer to read from.
2927 */ 2936 */
2928unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) 2937u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2929{ 2938{
2930 unsigned long flags; 2939 unsigned long flags;
2931 struct ring_buffer_per_cpu *cpu_buffer; 2940 struct ring_buffer_per_cpu *cpu_buffer;
2932 struct buffer_page *bpage; 2941 struct buffer_page *bpage;
2933 unsigned long ret; 2942 u64 ret = 0;
2934 2943
2935 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2944 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2936 return 0; 2945 return 0;
@@ -2945,7 +2954,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2945 bpage = cpu_buffer->reader_page; 2954 bpage = cpu_buffer->reader_page;
2946 else 2955 else
2947 bpage = rb_set_head_page(cpu_buffer); 2956 bpage = rb_set_head_page(cpu_buffer);
2948 ret = bpage->page->time_stamp; 2957 if (bpage)
2958 ret = bpage->page->time_stamp;
2949 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2959 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2950 2960
2951 return ret; 2961 return ret;
@@ -2991,7 +3001,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2991EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 3001EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2992 3002
2993/** 3003/**
2994 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer 3004 * ring_buffer_overrun_cpu - get the number of overruns caused by the ring
3005 * buffer wrapping around (only if RB_FL_OVERWRITE is on).
2995 * @buffer: The ring buffer 3006 * @buffer: The ring buffer
2996 * @cpu: The per CPU buffer to get the number of overruns from 3007 * @cpu: The per CPU buffer to get the number of overruns from
2997 */ 3008 */
@@ -3011,7 +3022,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
3011EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 3022EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
3012 3023
3013/** 3024/**
3014 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 3025 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by
3026 * commits failing due to the buffer wrapping around while there are uncommitted
3027 * events, such as during an interrupt storm.
3015 * @buffer: The ring buffer 3028 * @buffer: The ring buffer
3016 * @cpu: The per CPU buffer to get the number of overruns from 3029 * @cpu: The per CPU buffer to get the number of overruns from
3017 */ 3030 */
@@ -3032,6 +3045,28 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
3032EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); 3045EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
3033 3046
3034/** 3047/**
3048 * ring_buffer_dropped_events_cpu - get the number of dropped events caused by
3049 * the ring buffer filling up (only if RB_FL_OVERWRITE is off).
3050 * @buffer: The ring buffer
3051 * @cpu: The per CPU buffer to get the number of dropped events from
3052 */
3053unsigned long
3054ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
3055{
3056 struct ring_buffer_per_cpu *cpu_buffer;
3057 unsigned long ret;
3058
3059 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3060 return 0;
3061
3062 cpu_buffer = buffer->buffers[cpu];
3063 ret = local_read(&cpu_buffer->dropped_events);
3064
3065 return ret;
3066}
3067EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
3068
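A consumer of the new counter might total the non-overwrite drops across CPUs like this (a sketch; the helper name is made up):

#include <linux/ring_buffer.h>
#include <linux/cpumask.h>

static unsigned long total_dropped_events(struct ring_buffer *buffer)
{
        unsigned long dropped = 0;
        int cpu;

        for_each_online_cpu(cpu)
                dropped += ring_buffer_dropped_events_cpu(buffer, cpu);

        return dropped;
}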
3069/**
3035 * ring_buffer_entries - get the number of entries in a buffer 3070 * ring_buffer_entries - get the number of entries in a buffer
3036 * @buffer: The ring buffer 3071 * @buffer: The ring buffer
3037 * 3072 *
@@ -3256,6 +3291,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3256 * Splice the empty reader page into the list around the head. 3291 * Splice the empty reader page into the list around the head.
3257 */ 3292 */
3258 reader = rb_set_head_page(cpu_buffer); 3293 reader = rb_set_head_page(cpu_buffer);
3294 if (!reader)
3295 goto out;
3259 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); 3296 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
3260 cpu_buffer->reader_page->list.prev = reader->list.prev; 3297 cpu_buffer->reader_page->list.prev = reader->list.prev;
3261 3298
@@ -3774,12 +3811,17 @@ void
3774ring_buffer_read_finish(struct ring_buffer_iter *iter) 3811ring_buffer_read_finish(struct ring_buffer_iter *iter)
3775{ 3812{
3776 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3813 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3814 unsigned long flags;
3777 3815
3778 /* 3816 /*
3779 * Ring buffer is disabled from recording, here's a good place 3817 * Ring buffer is disabled from recording, here's a good place
3780 * to check the integrity of the ring buffer. 3818 * to check the integrity of the ring buffer.
3819 * Must prevent readers from trying to read, as the check
3820 * clears the HEAD page and readers require it.
3781 */ 3821 */
3822 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3782 rb_check_pages(cpu_buffer); 3823 rb_check_pages(cpu_buffer);
3824 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3783 3825
3784 atomic_dec(&cpu_buffer->record_disabled); 3826 atomic_dec(&cpu_buffer->record_disabled);
3785 atomic_dec(&cpu_buffer->buffer->resize_disabled); 3827 atomic_dec(&cpu_buffer->buffer->resize_disabled);
@@ -3860,9 +3902,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3860 local_set(&cpu_buffer->reader_page->page->commit, 0); 3902 local_set(&cpu_buffer->reader_page->page->commit, 0);
3861 cpu_buffer->reader_page->read = 0; 3903 cpu_buffer->reader_page->read = 0;
3862 3904
3863 local_set(&cpu_buffer->commit_overrun, 0);
3864 local_set(&cpu_buffer->entries_bytes, 0); 3905 local_set(&cpu_buffer->entries_bytes, 0);
3865 local_set(&cpu_buffer->overrun, 0); 3906 local_set(&cpu_buffer->overrun, 0);
3907 local_set(&cpu_buffer->commit_overrun, 0);
3908 local_set(&cpu_buffer->dropped_events, 0);
3866 local_set(&cpu_buffer->entries, 0); 3909 local_set(&cpu_buffer->entries, 0);
3867 local_set(&cpu_buffer->committing, 0); 3910 local_set(&cpu_buffer->committing, 0);
3868 local_set(&cpu_buffer->commits, 0); 3911 local_set(&cpu_buffer->commits, 0);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c38c81496ce..e5125677efa0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * Based on code from the latency_tracer, that is: 10 * Based on code from the latency_tracer, that is:
11 * Copyright (C) 2004-2006 Ingo Molnar 11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 Nadia Yvette Chambers
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <generated/utsrelease.h> 15#include <generated/utsrelease.h>
@@ -19,6 +19,7 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/hardirq.h> 25#include <linux/hardirq.h>
@@ -78,6 +79,21 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
78} 79}
79 80
80/* 81/*
82 * To prevent the comm cache from being overwritten when no
83 * tracing is active, only save the comm when a trace event
84 * occurred.
85 */
86static DEFINE_PER_CPU(bool, trace_cmdline_save);
87
88/*
89 * When a reader is waiting for data, then this variable is
90 * set to true.
91 */
92static bool trace_wakeup_needed;
93
94static struct irq_work trace_work_wakeup;
95
96/*
81 * Kill all tracing for good (never come back). 97 * Kill all tracing for good (never come back).
82 * It is initialized to 1 but will turn to zero if the initialization 98 * It is initialized to 1 but will turn to zero if the initialization
83 * of the tracer is successful. But that is the only place that sets 99 * of the tracer is successful. But that is the only place that sets
@@ -139,6 +155,18 @@ static int __init set_ftrace_dump_on_oops(char *str)
139} 155}
140__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 156__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
141 157
158
159static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
160static char *trace_boot_options __initdata;
161
162static int __init set_trace_boot_options(char *str)
163{
164 strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
165 trace_boot_options = trace_boot_options_buf;
166 return 0;
167}
168__setup("trace_options=", set_trace_boot_options);
169
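This hook makes the trace options settable before user space is up; for example, a kernel command line entry such as the one below (option names taken from the trace_options table further down; the rest of this series applies the stored string, comma separated, during boot):

  trace_options=nooverwrite,markers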
142unsigned long long ns2usecs(cycle_t nsec) 170unsigned long long ns2usecs(cycle_t nsec)
143{ 171{
144 nsec += 500; 172 nsec += 500;
@@ -198,20 +226,9 @@ static struct trace_array max_tr;
198 226
199static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); 227static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
200 228
201/* tracer_enabled is used to toggle activation of a tracer */
202static int tracer_enabled = 1;
203
204/**
205 * tracing_is_enabled - return tracer_enabled status
206 *
207 * This function is used by other tracers to know the status
208 * of the tracer_enabled flag. Tracers may use this function
209 * to know if it should enable their features when starting
210 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
211 */
212int tracing_is_enabled(void) 229int tracing_is_enabled(void)
213{ 230{
214 return tracer_enabled; 231 return tracing_is_on();
215} 232}
216 233
217/* 234/*
@@ -328,17 +345,23 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
328unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 345unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
329 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 346 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
330 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 347 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
331 TRACE_ITER_IRQ_INFO; 348 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS;
332 349
333static int trace_stop_count; 350static int trace_stop_count;
334static DEFINE_RAW_SPINLOCK(tracing_start_lock); 351static DEFINE_RAW_SPINLOCK(tracing_start_lock);
335 352
336static void wakeup_work_handler(struct work_struct *work) 353/**
354 * trace_wake_up - wake up tasks waiting for trace input
355 *
356 * Runs from irq_work context to wake up any task that is blocked on the
357 * trace_wait queue. This is used with trace_poll for tasks polling the
358 * trace.
359 */
360static void trace_wake_up(struct irq_work *work)
337{ 361{
338 wake_up(&trace_wait); 362 wake_up_all(&trace_wait);
339}
340 363
341static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); 364}
342 365
343/** 366/**
344 * tracing_on - enable tracing buffers 367 * tracing_on - enable tracing buffers
@@ -393,22 +416,6 @@ int tracing_is_on(void)
393} 416}
394EXPORT_SYMBOL_GPL(tracing_is_on); 417EXPORT_SYMBOL_GPL(tracing_is_on);
395 418
396/**
397 * trace_wake_up - wake up tasks waiting for trace input
398 *
399 * Schedules a delayed work to wake up any task that is blocked on the
400 * trace_wait queue. These is used with trace_poll for tasks polling the
401 * trace.
402 */
403void trace_wake_up(void)
404{
405 const unsigned long delay = msecs_to_jiffies(2);
406
407 if (trace_flags & TRACE_ITER_BLOCK)
408 return;
409 schedule_delayed_work(&wakeup_work, delay);
410}
411
412static int __init set_buf_size(char *str) 419static int __init set_buf_size(char *str)
413{ 420{
414 unsigned long buf_size; 421 unsigned long buf_size;
@@ -426,15 +433,15 @@ __setup("trace_buf_size=", set_buf_size);
426 433
427static int __init set_tracing_thresh(char *str) 434static int __init set_tracing_thresh(char *str)
428{ 435{
429 unsigned long threshhold; 436 unsigned long threshold;
430 int ret; 437 int ret;
431 438
432 if (!str) 439 if (!str)
433 return 0; 440 return 0;
434 ret = strict_strtoul(str, 0, &threshhold); 441 ret = kstrtoul(str, 0, &threshold);
435 if (ret < 0) 442 if (ret < 0)
436 return 0; 443 return 0;
437 tracing_thresh = threshhold * 1000; 444 tracing_thresh = threshold * 1000;
438 return 1; 445 return 1;
439} 446}
440__setup("tracing_thresh=", set_tracing_thresh); 447__setup("tracing_thresh=", set_tracing_thresh);
@@ -470,16 +477,19 @@ static const char *trace_options[] = {
470 "overwrite", 477 "overwrite",
471 "disable_on_free", 478 "disable_on_free",
472 "irq-info", 479 "irq-info",
480 "markers",
473 NULL 481 NULL
474}; 482};
475 483
476static struct { 484static struct {
477 u64 (*func)(void); 485 u64 (*func)(void);
478 const char *name; 486 const char *name;
487 int in_ns; /* is this clock in nanoseconds? */
479} trace_clocks[] = { 488} trace_clocks[] = {
480 { trace_clock_local, "local" }, 489 { trace_clock_local, "local", 1 },
481 { trace_clock_global, "global" }, 490 { trace_clock_global, "global", 1 },
482 { trace_clock_counter, "counter" }, 491 { trace_clock_counter, "counter", 0 },
492 ARCH_TRACE_CLOCKS
483}; 493};
484 494
485int trace_clock_id; 495int trace_clock_id;
@@ -756,6 +766,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
756} 766}
757#endif /* CONFIG_TRACER_MAX_TRACE */ 767#endif /* CONFIG_TRACER_MAX_TRACE */
758 768
769static void default_wait_pipe(struct trace_iterator *iter)
770{
771 DEFINE_WAIT(wait);
772
773 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
774
775 /*
776 * The events can happen in critical sections where
777 * checking a work queue can cause deadlocks.
778 * After adding a task to the queue, this flag is set
779 * only to notify events to try to wake up the queue
780 * using irq_work.
781 *
782 * We don't clear it even if the buffer is no longer
783 * empty. The flag only causes the next event to run
784 * irq_work to do the work queue wake up. The worst
785 * that can happen if we race with !trace_empty() is that
786 * an event will cause an irq_work to try to wake up
787 * an empty queue.
788 *
789 * There's no reason to protect this flag either, as
790 * the work queue and irq_work logic will do the necessary
791 * synchronization for the wake ups. The only thing
792 * that is necessary is that the wake up happens after
793 * a task has been queued. It's OK for spurious wake ups.
794 */
795 trace_wakeup_needed = true;
796
797 if (trace_empty(iter))
798 schedule();
799
800 finish_wait(&trace_wait, &wait);
801}
802
759/** 803/**
760 * register_tracer - register a tracer with the ftrace system. 804 * register_tracer - register a tracer with the ftrace system.
761 * @type - the plugin for the tracer 805 * @type - the plugin for the tracer
@@ -874,32 +918,6 @@ int register_tracer(struct tracer *type)
874 return ret; 918 return ret;
875} 919}
876 920
877void unregister_tracer(struct tracer *type)
878{
879 struct tracer **t;
880
881 mutex_lock(&trace_types_lock);
882 for (t = &trace_types; *t; t = &(*t)->next) {
883 if (*t == type)
884 goto found;
885 }
886 pr_info("Tracer %s not registered\n", type->name);
887 goto out;
888
889 found:
890 *t = (*t)->next;
891
892 if (type == current_trace && tracer_enabled) {
893 tracer_enabled = 0;
894 tracing_stop();
895 if (current_trace->stop)
896 current_trace->stop(&global_trace);
897 current_trace = &nop_trace;
898 }
899out:
900 mutex_unlock(&trace_types_lock);
901}
902
903void tracing_reset(struct trace_array *tr, int cpu) 921void tracing_reset(struct trace_array *tr, int cpu)
904{ 922{
905 struct ring_buffer *buffer = tr->buffer; 923 struct ring_buffer *buffer = tr->buffer;
@@ -1130,10 +1148,14 @@ void trace_find_cmdline(int pid, char comm[])
1130 1148
1131void tracing_record_cmdline(struct task_struct *tsk) 1149void tracing_record_cmdline(struct task_struct *tsk)
1132{ 1150{
1133 if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || 1151 if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
1134 !tracing_is_on()) 1152 return;
1153
1154 if (!__this_cpu_read(trace_cmdline_save))
1135 return; 1155 return;
1136 1156
1157 __this_cpu_write(trace_cmdline_save, false);
1158
1137 trace_save_cmdline(tsk); 1159 trace_save_cmdline(tsk);
1138} 1160}
1139 1161
@@ -1177,27 +1199,36 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
1177 return event; 1199 return event;
1178} 1200}
1179 1201
1202void
1203__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
1204{
1205 __this_cpu_write(trace_cmdline_save, true);
1206 if (trace_wakeup_needed) {
1207 trace_wakeup_needed = false;
1208 /* irq_work_queue() supplies its own memory barriers */
1209 irq_work_queue(&trace_work_wakeup);
1210 }
1211 ring_buffer_unlock_commit(buffer, event);
1212}
1213
1180static inline void 1214static inline void
1181__trace_buffer_unlock_commit(struct ring_buffer *buffer, 1215__trace_buffer_unlock_commit(struct ring_buffer *buffer,
1182 struct ring_buffer_event *event, 1216 struct ring_buffer_event *event,
1183 unsigned long flags, int pc, 1217 unsigned long flags, int pc)
1184 int wake)
1185{ 1218{
1186 ring_buffer_unlock_commit(buffer, event); 1219 __buffer_unlock_commit(buffer, event);
1187 1220
1188 ftrace_trace_stack(buffer, flags, 6, pc); 1221 ftrace_trace_stack(buffer, flags, 6, pc);
1189 ftrace_trace_userstack(buffer, flags, pc); 1222 ftrace_trace_userstack(buffer, flags, pc);
1190
1191 if (wake)
1192 trace_wake_up();
1193} 1223}
1194 1224
1195void trace_buffer_unlock_commit(struct ring_buffer *buffer, 1225void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1196 struct ring_buffer_event *event, 1226 struct ring_buffer_event *event,
1197 unsigned long flags, int pc) 1227 unsigned long flags, int pc)
1198{ 1228{
1199 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); 1229 __trace_buffer_unlock_commit(buffer, event, flags, pc);
1200} 1230}
1231EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1201 1232
1202struct ring_buffer_event * 1233struct ring_buffer_event *
1203trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1234trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
@@ -1214,29 +1245,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
1214 struct ring_buffer_event *event, 1245 struct ring_buffer_event *event,
1215 unsigned long flags, int pc) 1246 unsigned long flags, int pc)
1216{ 1247{
1217 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); 1248 __trace_buffer_unlock_commit(buffer, event, flags, pc);
1218} 1249}
1219EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 1250EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
1220 1251
1221void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, 1252void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1222 struct ring_buffer_event *event, 1253 struct ring_buffer_event *event,
1223 unsigned long flags, int pc) 1254 unsigned long flags, int pc,
1224{ 1255 struct pt_regs *regs)
1225 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
1226}
1227EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1228
1229void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1230 struct ring_buffer_event *event,
1231 unsigned long flags, int pc,
1232 struct pt_regs *regs)
1233{ 1256{
1234 ring_buffer_unlock_commit(buffer, event); 1257 __buffer_unlock_commit(buffer, event);
1235 1258
1236 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); 1259 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1237 ftrace_trace_userstack(buffer, flags, pc); 1260 ftrace_trace_userstack(buffer, flags, pc);
1238} 1261}
1239EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); 1262EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
1240 1263
1241void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1264void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1242 struct ring_buffer_event *event) 1265 struct ring_buffer_event *event)
@@ -1268,7 +1291,7 @@ trace_function(struct trace_array *tr,
1268 entry->parent_ip = parent_ip; 1291 entry->parent_ip = parent_ip;
1269 1292
1270 if (!filter_check_discard(call, entry, buffer, event)) 1293 if (!filter_check_discard(call, entry, buffer, event))
1271 ring_buffer_unlock_commit(buffer, event); 1294 __buffer_unlock_commit(buffer, event);
1272} 1295}
1273 1296
1274void 1297void
@@ -1361,7 +1384,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1361 entry->size = trace.nr_entries; 1384 entry->size = trace.nr_entries;
1362 1385
1363 if (!filter_check_discard(call, entry, buffer, event)) 1386 if (!filter_check_discard(call, entry, buffer, event))
1364 ring_buffer_unlock_commit(buffer, event); 1387 __buffer_unlock_commit(buffer, event);
1365 1388
1366 out: 1389 out:
1367 /* Again, don't let gcc optimize things here */ 1390 /* Again, don't let gcc optimize things here */
@@ -1457,7 +1480,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1457 1480
1458 save_stack_trace_user(&trace); 1481 save_stack_trace_user(&trace);
1459 if (!filter_check_discard(call, entry, buffer, event)) 1482 if (!filter_check_discard(call, entry, buffer, event))
1460 ring_buffer_unlock_commit(buffer, event); 1483 __buffer_unlock_commit(buffer, event);
1461 1484
1462 out_drop_count: 1485 out_drop_count:
1463 __this_cpu_dec(user_stack_count); 1486 __this_cpu_dec(user_stack_count);
@@ -1558,10 +1581,10 @@ static int alloc_percpu_trace_buffer(void)
1558 return -ENOMEM; 1581 return -ENOMEM;
1559} 1582}
1560 1583
1584static int buffers_allocated;
1585
1561void trace_printk_init_buffers(void) 1586void trace_printk_init_buffers(void)
1562{ 1587{
1563 static int buffers_allocated;
1564
1565 if (buffers_allocated) 1588 if (buffers_allocated)
1566 return; 1589 return;
1567 1590
@@ -1570,7 +1593,38 @@ void trace_printk_init_buffers(void)
1570 1593
1571 pr_info("ftrace: Allocated trace_printk buffers\n"); 1594 pr_info("ftrace: Allocated trace_printk buffers\n");
1572 1595
1596 /* Expand the buffers to set size */
1597 tracing_update_buffers();
1598
1573 buffers_allocated = 1; 1599 buffers_allocated = 1;
1600
1601 /*
1602 * trace_printk_init_buffers() can be called by modules.
1603 * If that happens, then we need to start cmdline recording
1604 * directly here. If the global_trace.buffer is already
1605 * allocated here, then this was called by module code.
1606 */
1607 if (global_trace.buffer)
1608 tracing_start_cmdline_record();
1609}
1610
1611void trace_printk_start_comm(void)
1612{
1613 /* Start tracing comms if trace printk is set */
1614 if (!buffers_allocated)
1615 return;
1616 tracing_start_cmdline_record();
1617}
1618
1619static void trace_printk_start_stop_comm(int enabled)
1620{
1621 if (!buffers_allocated)
1622 return;
1623
1624 if (enabled)
1625 tracing_start_cmdline_record();
1626 else
1627 tracing_stop_cmdline_record();
1574} 1628}
1575 1629
1576/** 1630/**
@@ -1621,7 +1675,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1621 1675
1622 memcpy(entry->buf, tbuffer, sizeof(u32) * len); 1676 memcpy(entry->buf, tbuffer, sizeof(u32) * len);
1623 if (!filter_check_discard(call, entry, buffer, event)) { 1677 if (!filter_check_discard(call, entry, buffer, event)) {
1624 ring_buffer_unlock_commit(buffer, event); 1678 __buffer_unlock_commit(buffer, event);
1625 ftrace_trace_stack(buffer, flags, 6, pc); 1679 ftrace_trace_stack(buffer, flags, 6, pc);
1626 } 1680 }
1627 1681
@@ -1692,7 +1746,7 @@ int trace_array_vprintk(struct trace_array *tr,
1692 memcpy(&entry->buf, tbuffer, len); 1746 memcpy(&entry->buf, tbuffer, len);
1693 entry->buf[len] = '\0'; 1747 entry->buf[len] = '\0';
1694 if (!filter_check_discard(call, entry, buffer, event)) { 1748 if (!filter_check_discard(call, entry, buffer, event)) {
1695 ring_buffer_unlock_commit(buffer, event); 1749 __buffer_unlock_commit(buffer, event);
1696 ftrace_trace_stack(buffer, flags, 6, pc); 1750 ftrace_trace_stack(buffer, flags, 6, pc);
1697 } 1751 }
1698 out: 1752 out:
@@ -2060,7 +2114,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2060 seq_puts(m, "# -----------------\n"); 2114 seq_puts(m, "# -----------------\n");
2061 seq_printf(m, "# | task: %.16s-%d " 2115 seq_printf(m, "# | task: %.16s-%d "
2062 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", 2116 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
2063 data->comm, data->pid, data->uid, data->nice, 2117 data->comm, data->pid,
2118 from_kuid_munged(seq_user_ns(m), data->uid), data->nice,
2064 data->policy, data->rt_priority); 2119 data->policy, data->rt_priority);
2065 seq_puts(m, "# -----------------\n"); 2120 seq_puts(m, "# -----------------\n");
2066 2121
@@ -2424,6 +2479,10 @@ __tracing_open(struct inode *inode, struct file *file)
2424 if (ring_buffer_overruns(iter->tr->buffer)) 2479 if (ring_buffer_overruns(iter->tr->buffer))
2425 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2480 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2426 2481
2482 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
2483 if (trace_clocks[trace_clock_id].in_ns)
2484 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2485
2427 /* stop the trace while dumping */ 2486 /* stop the trace while dumping */
2428 tracing_stop(); 2487 tracing_stop();
2429 2488
@@ -2792,26 +2851,19 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2792 2851
2793 if (mask == TRACE_ITER_OVERWRITE) 2852 if (mask == TRACE_ITER_OVERWRITE)
2794 ring_buffer_change_overwrite(global_trace.buffer, enabled); 2853 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2854
2855 if (mask == TRACE_ITER_PRINTK)
2856 trace_printk_start_stop_comm(enabled);
2795} 2857}
2796 2858
2797static ssize_t 2859static int trace_set_options(char *option)
2798tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2799 size_t cnt, loff_t *ppos)
2800{ 2860{
2801 char buf[64];
2802 char *cmp; 2861 char *cmp;
2803 int neg = 0; 2862 int neg = 0;
2804 int ret; 2863 int ret = 0;
2805 int i; 2864 int i;
2806 2865
2807 if (cnt >= sizeof(buf)) 2866 cmp = strstrip(option);
2808 return -EINVAL;
2809
2810 if (copy_from_user(&buf, ubuf, cnt))
2811 return -EFAULT;
2812
2813 buf[cnt] = 0;
2814 cmp = strstrip(buf);
2815 2867
2816 if (strncmp(cmp, "no", 2) == 0) { 2868 if (strncmp(cmp, "no", 2) == 0) {
2817 neg = 1; 2869 neg = 1;
@@ -2830,10 +2882,25 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2830 mutex_lock(&trace_types_lock); 2882 mutex_lock(&trace_types_lock);
2831 ret = set_tracer_option(current_trace, cmp, neg); 2883 ret = set_tracer_option(current_trace, cmp, neg);
2832 mutex_unlock(&trace_types_lock); 2884 mutex_unlock(&trace_types_lock);
2833 if (ret)
2834 return ret;
2835 } 2885 }
2836 2886
2887 return ret;
2888}
2889
2890static ssize_t
2891tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2892 size_t cnt, loff_t *ppos)
2893{
2894 char buf[64];
2895
2896 if (cnt >= sizeof(buf))
2897 return -EINVAL;
2898
2899 if (copy_from_user(&buf, ubuf, cnt))
2900 return -EFAULT;
2901
2902 trace_set_options(buf);
2903
2837 *ppos += cnt; 2904 *ppos += cnt;
2838 2905
2839 return cnt; 2906 return cnt;
@@ -2938,56 +3005,6 @@ static const struct file_operations tracing_saved_cmdlines_fops = {
2938}; 3005};
2939 3006
2940static ssize_t 3007static ssize_t
2941tracing_ctrl_read(struct file *filp, char __user *ubuf,
2942 size_t cnt, loff_t *ppos)
2943{
2944 char buf[64];
2945 int r;
2946
2947 r = sprintf(buf, "%u\n", tracer_enabled);
2948 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2949}
2950
2951static ssize_t
2952tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2953 size_t cnt, loff_t *ppos)
2954{
2955 struct trace_array *tr = filp->private_data;
2956 unsigned long val;
2957 int ret;
2958
2959 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2960 if (ret)
2961 return ret;
2962
2963 val = !!val;
2964
2965 mutex_lock(&trace_types_lock);
2966 if (tracer_enabled ^ val) {
2967
2968 /* Only need to warn if this is used to change the state */
2969 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2970
2971 if (val) {
2972 tracer_enabled = 1;
2973 if (current_trace->start)
2974 current_trace->start(tr);
2975 tracing_start();
2976 } else {
2977 tracer_enabled = 0;
2978 tracing_stop();
2979 if (current_trace->stop)
2980 current_trace->stop(tr);
2981 }
2982 }
2983 mutex_unlock(&trace_types_lock);
2984
2985 *ppos += cnt;
2986
2987 return cnt;
2988}
2989
2990static ssize_t
2991tracing_set_trace_read(struct file *filp, char __user *ubuf, 3008tracing_set_trace_read(struct file *filp, char __user *ubuf,
2992 size_t cnt, loff_t *ppos) 3009 size_t cnt, loff_t *ppos)
2993{ 3010{
@@ -3017,6 +3034,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val)
3017 tr->data[cpu]->entries = val; 3034 tr->data[cpu]->entries = val;
3018} 3035}
3019 3036
3037/* resize @tr's buffer to the size of @size_tr's entries */
3038static int resize_buffer_duplicate_size(struct trace_array *tr,
3039 struct trace_array *size_tr, int cpu_id)
3040{
3041 int cpu, ret = 0;
3042
3043 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3044 for_each_tracing_cpu(cpu) {
3045 ret = ring_buffer_resize(tr->buffer,
3046 size_tr->data[cpu]->entries, cpu);
3047 if (ret < 0)
3048 break;
3049 tr->data[cpu]->entries = size_tr->data[cpu]->entries;
3050 }
3051 } else {
3052 ret = ring_buffer_resize(tr->buffer,
3053 size_tr->data[cpu_id]->entries, cpu_id);
3054 if (ret == 0)
3055 tr->data[cpu_id]->entries =
3056 size_tr->data[cpu_id]->entries;
3057 }
3058
3059 return ret;
3060}
3061
3020static int __tracing_resize_ring_buffer(unsigned long size, int cpu) 3062static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3021{ 3063{
3022 int ret; 3064 int ret;
@@ -3028,6 +3070,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3028 */ 3070 */
3029 ring_buffer_expanded = 1; 3071 ring_buffer_expanded = 1;
3030 3072
3073 /* May be called before buffers are initialized */
3074 if (!global_trace.buffer)
3075 return 0;
3076
3031 ret = ring_buffer_resize(global_trace.buffer, size, cpu); 3077 ret = ring_buffer_resize(global_trace.buffer, size, cpu);
3032 if (ret < 0) 3078 if (ret < 0)
3033 return ret; 3079 return ret;
@@ -3037,23 +3083,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3037 3083
3038 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 3084 ret = ring_buffer_resize(max_tr.buffer, size, cpu);
3039 if (ret < 0) { 3085 if (ret < 0) {
3040 int r = 0; 3086 int r = resize_buffer_duplicate_size(&global_trace,
3041 3087 &global_trace, cpu);
3042 if (cpu == RING_BUFFER_ALL_CPUS) {
3043 int i;
3044 for_each_tracing_cpu(i) {
3045 r = ring_buffer_resize(global_trace.buffer,
3046 global_trace.data[i]->entries,
3047 i);
3048 if (r < 0)
3049 break;
3050 }
3051 } else {
3052 r = ring_buffer_resize(global_trace.buffer,
3053 global_trace.data[cpu]->entries,
3054 cpu);
3055 }
3056
3057 if (r < 0) { 3088 if (r < 0) {
3058 /* 3089 /*
3059 * AARGH! We are left with different 3090 * AARGH! We are left with different
@@ -3191,17 +3222,11 @@ static int tracing_set_tracer(const char *buf)
3191 3222
3192 topts = create_trace_option_files(t); 3223 topts = create_trace_option_files(t);
3193 if (t->use_max_tr) { 3224 if (t->use_max_tr) {
3194 int cpu;
3195 /* we need to make per cpu buffer sizes equivalent */ 3225 /* we need to make per cpu buffer sizes equivalent */
3196 for_each_tracing_cpu(cpu) { 3226 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3197 ret = ring_buffer_resize(max_tr.buffer, 3227 RING_BUFFER_ALL_CPUS);
3198 global_trace.data[cpu]->entries, 3228 if (ret < 0)
3199 cpu); 3229 goto out;
3200 if (ret < 0)
3201 goto out;
3202 max_tr.data[cpu]->entries =
3203 global_trace.data[cpu]->entries;
3204 }
3205 } 3230 }
3206 3231
3207 if (t->init) { 3232 if (t->init) {
@@ -3323,6 +3348,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3323 if (trace_flags & TRACE_ITER_LATENCY_FMT) 3348 if (trace_flags & TRACE_ITER_LATENCY_FMT)
3324 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3349 iter->iter_flags |= TRACE_FILE_LAT_FMT;
3325 3350
3351 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
3352 if (trace_clocks[trace_clock_id].in_ns)
3353 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3354
3326 iter->cpu_file = cpu_file; 3355 iter->cpu_file = cpu_file;
3327 iter->tr = &global_trace; 3356 iter->tr = &global_trace;
3328 mutex_init(&iter->mutex); 3357 mutex_init(&iter->mutex);
@@ -3383,19 +3412,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3383 } 3412 }
3384} 3413}
3385 3414
3386
3387void default_wait_pipe(struct trace_iterator *iter)
3388{
3389 DEFINE_WAIT(wait);
3390
3391 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
3392
3393 if (trace_empty(iter))
3394 schedule();
3395
3396 finish_wait(&trace_wait, &wait);
3397}
3398
3399/* 3415/*
3400 * This is a make-shift waitqueue. 3416 * This is a make-shift waitqueue.
3401 * A tracer might use this callback on some rare cases: 3417 * A tracer might use this callback on some rare cases:
@@ -3436,7 +3452,7 @@ static int tracing_wait_pipe(struct file *filp)
3436 return -EINTR; 3452 return -EINTR;
3437 3453
3438 /* 3454 /*
3439 * We block until we read something and tracing is disabled. 3455 * We block until we read something and tracing is enabled.
3440 * We still block if tracing is disabled, but we have never 3456 * We still block if tracing is disabled, but we have never
3441 * read anything. This allows a user to cat this file, and 3457 * read anything. This allows a user to cat this file, and
3442 * then enable tracing. But after we have read something, 3458 * then enable tracing. But after we have read something,
@@ -3444,7 +3460,7 @@ static int tracing_wait_pipe(struct file *filp)
3444 * 3460 *
3445 * iter->pos will be 0 if we haven't read anything. 3461 * iter->pos will be 0 if we haven't read anything.
3446 */ 3462 */
3447 if (!tracer_enabled && iter->pos) 3463 if (tracing_is_enabled() && iter->pos)
3448 break; 3464 break;
3449 } 3465 }
3450 3466
@@ -3886,6 +3902,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3886 if (tracing_disabled) 3902 if (tracing_disabled)
3887 return -EINVAL; 3903 return -EINVAL;
3888 3904
3905 if (!(trace_flags & TRACE_ITER_MARKERS))
3906 return -EINVAL;
3907
3889 if (cnt > TRACE_BUF_SIZE) 3908 if (cnt > TRACE_BUF_SIZE)
3890 cnt = TRACE_BUF_SIZE; 3909 cnt = TRACE_BUF_SIZE;
3891 3910
@@ -3950,7 +3969,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3950 } else 3969 } else
3951 entry->buf[cnt] = '\0'; 3970 entry->buf[cnt] = '\0';
3952 3971
3953 ring_buffer_unlock_commit(buffer, event); 3972 __buffer_unlock_commit(buffer, event);
3954 3973
3955 written = cnt; 3974 written = cnt;
3956 3975
@@ -4011,6 +4030,14 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4011 if (max_tr.buffer) 4030 if (max_tr.buffer)
4012 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 4031 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
4013 4032
4033 /*
4034 * New clock may not be consistent with the previous clock.
4035 * Reset the buffer so that it doesn't have incomparable timestamps.
4036 */
4037 tracing_reset_online_cpus(&global_trace);
4038 if (max_tr.buffer)
4039 tracing_reset_online_cpus(&max_tr);
4040
4014 mutex_unlock(&trace_types_lock); 4041 mutex_unlock(&trace_types_lock);
4015 4042
4016 *fpos += cnt; 4043 *fpos += cnt;
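
The hunk above resets both ring buffers right after a clock switch, because timestamps taken with the old clock are not comparable with the new one. A small userspace sketch of the same policy, with stand-in names for the clock table and the sample buffer:

/*
 * Sketch: switching clocks discards buffered samples, since values
 * recorded under the previous clock cannot be compared with new ones.
 * Names and values are invented for the demo.
 */
#include <stdio.h>
#include <string.h>

struct clock_desc {
        const char *name;
        int in_ns;              /* does this clock count nanoseconds? */
};

static const struct clock_desc clocks[] = {
        { "local",   1 },
        { "global",  1 },
        { "counter", 0 },       /* plain event counter, not time */
};

static unsigned long long samples[8];
static int nr_samples;
static int cur_clock;

static void reset_samples(void)
{
        nr_samples = 0;
        memset(samples, 0, sizeof(samples));
}

static int set_clock(const char *name)
{
        for (unsigned i = 0; i < sizeof(clocks) / sizeof(clocks[0]); i++) {
                if (strcmp(clocks[i].name, name) == 0) {
                        cur_clock = i;
                        reset_samples();   /* old timestamps are meaningless */
                        return 0;
                }
        }
        return -1;
}

int main(void)
{
        samples[nr_samples++] = 123456789ULL;
        set_clock("counter");
        printf("clock=%s in_ns=%d samples=%d\n",
               clocks[cur_clock].name, clocks[cur_clock].in_ns, nr_samples);
        return 0;
}
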
@@ -4032,13 +4059,6 @@ static const struct file_operations tracing_max_lat_fops = {
4032 .llseek = generic_file_llseek, 4059 .llseek = generic_file_llseek,
4033}; 4060};
4034 4061
4035static const struct file_operations tracing_ctrl_fops = {
4036 .open = tracing_open_generic,
4037 .read = tracing_ctrl_read,
4038 .write = tracing_ctrl_write,
4039 .llseek = generic_file_llseek,
4040};
4041
4042static const struct file_operations set_tracer_fops = { 4062static const struct file_operations set_tracer_fops = {
4043 .open = tracing_open_generic, 4063 .open = tracing_open_generic,
4044 .read = tracing_set_trace_read, 4064 .read = tracing_set_trace_read,
@@ -4195,12 +4215,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
4195 buf->private = 0; 4215 buf->private = 0;
4196} 4216}
4197 4217
4198static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
4199 struct pipe_buffer *buf)
4200{
4201 return 1;
4202}
4203
4204static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, 4218static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
4205 struct pipe_buffer *buf) 4219 struct pipe_buffer *buf)
4206{ 4220{
@@ -4216,7 +4230,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
4216 .unmap = generic_pipe_buf_unmap, 4230 .unmap = generic_pipe_buf_unmap,
4217 .confirm = generic_pipe_buf_confirm, 4231 .confirm = generic_pipe_buf_confirm,
4218 .release = buffer_pipe_buf_release, 4232 .release = buffer_pipe_buf_release,
4219 .steal = buffer_pipe_buf_steal, 4233 .steal = generic_pipe_buf_steal,
4220 .get = buffer_pipe_buf_get, 4234 .get = buffer_pipe_buf_get,
4221}; 4235};
4222 4236
@@ -4261,13 +4275,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4261 return -ENOMEM; 4275 return -ENOMEM;
4262 4276
4263 if (*ppos & (PAGE_SIZE - 1)) { 4277 if (*ppos & (PAGE_SIZE - 1)) {
4264 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
4265 ret = -EINVAL; 4278 ret = -EINVAL;
4266 goto out; 4279 goto out;
4267 } 4280 }
4268 4281
4269 if (len & (PAGE_SIZE - 1)) { 4282 if (len & (PAGE_SIZE - 1)) {
4270 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
4271 if (len < PAGE_SIZE) { 4283 if (len < PAGE_SIZE) {
4272 ret = -EINVAL; 4284 ret = -EINVAL;
4273 goto out; 4285 goto out;
@@ -4378,13 +4390,27 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4378 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); 4390 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4379 trace_seq_printf(s, "bytes: %ld\n", cnt); 4391 trace_seq_printf(s, "bytes: %ld\n", cnt);
4380 4392
4381 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); 4393 if (trace_clocks[trace_clock_id].in_ns) {
4382 usec_rem = do_div(t, USEC_PER_SEC); 4394 /* local or global for trace_clock */
4383 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); 4395 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4396 usec_rem = do_div(t, USEC_PER_SEC);
4397 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4398 t, usec_rem);
4399
4400 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4401 usec_rem = do_div(t, USEC_PER_SEC);
4402 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4403 } else {
4404 /* counter or tsc mode for trace_clock */
4405 trace_seq_printf(s, "oldest event ts: %llu\n",
4406 ring_buffer_oldest_event_ts(tr->buffer, cpu));
4407
4408 trace_seq_printf(s, "now ts: %llu\n",
4409 ring_buffer_time_stamp(tr->buffer, cpu));
4410 }
4384 4411
4385 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); 4412 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
4386 usec_rem = do_div(t, USEC_PER_SEC); 4413 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4387 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4388 4414
4389 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4415 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4390 4416
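
The per-CPU stats now print "oldest event ts" and "now ts" as seconds.microseconds only when the selected clock counts nanoseconds, and as raw values otherwise. A userspace analog of the nanosecond branch, mirroring ns2usecs() followed by do_div(..., USEC_PER_SEC):

/*
 * Userspace analog of the "oldest event ts" formatting: convert a
 * nanosecond timestamp to microseconds, then split off the seconds
 * and the microsecond remainder.  Only valid for an in_ns clock.
 */
#include <stdio.h>
#include <inttypes.h>

#define NSEC_PER_USEC 1000ULL
#define USEC_PER_SEC  1000000ULL

int main(void)
{
        uint64_t ts_ns = 1234567890123ULL;      /* some event timestamp */
        uint64_t usecs = ts_ns / NSEC_PER_USEC; /* ns2usecs() */
        uint64_t usec_rem = usecs % USEC_PER_SEC;
        uint64_t secs = usecs / USEC_PER_SEC;   /* do_div() keeps the quotient */

        printf("oldest event ts: %5" PRIu64 ".%06" PRIu64 "\n",
               secs, usec_rem);
        return 0;
}
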
@@ -4816,9 +4842,6 @@ static __init int tracer_init_debugfs(void)
4816 4842
4817 d_tracer = tracing_init_dentry(); 4843 d_tracer = tracing_init_dentry();
4818 4844
4819 trace_create_file("tracing_enabled", 0644, d_tracer,
4820 &global_trace, &tracing_ctrl_fops);
4821
4822 trace_create_file("trace_options", 0644, d_tracer, 4845 trace_create_file("trace_options", 0644, d_tracer,
4823 NULL, &tracing_iter_fops); 4846 NULL, &tracing_iter_fops);
4824 4847
@@ -5090,6 +5113,7 @@ __init static int tracer_alloc_buffers(void)
5090 5113
5091 /* Only allocate trace_printk buffers if a trace_printk exists */ 5114 /* Only allocate trace_printk buffers if a trace_printk exists */
5092 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) 5115 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
5116 /* Must be called before global_trace.buffer is allocated */
5093 trace_printk_init_buffers(); 5117 trace_printk_init_buffers();
5094 5118
5095 /* To save memory, keep the ring buffer size to its minimum */ 5119 /* To save memory, keep the ring buffer size to its minimum */
@@ -5137,6 +5161,7 @@ __init static int tracer_alloc_buffers(void)
5137#endif 5161#endif
5138 5162
5139 trace_init_cmdlines(); 5163 trace_init_cmdlines();
5164 init_irq_work(&trace_work_wakeup, trace_wake_up);
5140 5165
5141 register_tracer(&nop_trace); 5166 register_tracer(&nop_trace);
5142 current_trace = &nop_trace; 5167 current_trace = &nop_trace;
@@ -5148,6 +5173,13 @@ __init static int tracer_alloc_buffers(void)
5148 5173
5149 register_die_notifier(&trace_die_notifier); 5174 register_die_notifier(&trace_die_notifier);
5150 5175
5176 while (trace_boot_options) {
5177 char *option;
5178
5179 option = strsep(&trace_boot_options, ",");
5180 trace_set_options(option);
5181 }
5182
5151 return 0; 5183 return 0;
5152 5184
5153out_free_cpumask: 5185out_free_cpumask:
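
The added loop walks trace_boot_options with strsep(), handing each comma-separated token to trace_set_options(). The same pattern in plain userspace C (the buffer contents and the option handler are made up for the demo):

/* strsep() loop over a comma-separated option string. */
#define _DEFAULT_SOURCE         /* for strsep() in glibc */
#include <stdio.h>
#include <string.h>

static void apply_option(const char *opt)
{
        printf("enabling trace option: %s\n", opt);
}

int main(void)
{
        char boot_options[] = "trace_printk,irq-info,noblock";
        char *buf = boot_options;
        char *option;

        while ((option = strsep(&buf, ",")) != NULL) {
                if (*option)            /* skip empty tokens like ",," */
                        apply_option(option);
        }
        return 0;
}
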
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 55e1f7f0db12..c75d7988902c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -147,7 +147,7 @@ struct trace_array_cpu {
147 unsigned long skipped_entries; 147 unsigned long skipped_entries;
148 cycle_t preempt_timestamp; 148 cycle_t preempt_timestamp;
149 pid_t pid; 149 pid_t pid;
150 uid_t uid; 150 kuid_t uid;
151 char comm[TASK_COMM_LEN]; 151 char comm[TASK_COMM_LEN];
152}; 152};
153 153
@@ -285,8 +285,8 @@ struct tracer {
285 int (*set_flag)(u32 old_flags, u32 bit, int set); 285 int (*set_flag)(u32 old_flags, u32 bit, int set);
286 struct tracer *next; 286 struct tracer *next;
287 struct tracer_flags *flags; 287 struct tracer_flags *flags;
288 int print_max; 288 bool print_max;
289 int use_max_tr; 289 bool use_max_tr;
290}; 290};
291 291
292 292
@@ -327,7 +327,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
327 327
328int tracer_init(struct tracer *t, struct trace_array *tr); 328int tracer_init(struct tracer *t, struct trace_array *tr);
329int tracing_is_enabled(void); 329int tracing_is_enabled(void);
330void trace_wake_up(void);
331void tracing_reset(struct trace_array *tr, int cpu); 330void tracing_reset(struct trace_array *tr, int cpu);
332void tracing_reset_online_cpus(struct trace_array *tr); 331void tracing_reset_online_cpus(struct trace_array *tr);
333void tracing_reset_current(int cpu); 332void tracing_reset_current(int cpu);
@@ -349,9 +348,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
349 unsigned long len, 348 unsigned long len,
350 unsigned long flags, 349 unsigned long flags,
351 int pc); 350 int pc);
352void trace_buffer_unlock_commit(struct ring_buffer *buffer,
353 struct ring_buffer_event *event,
354 unsigned long flags, int pc);
355 351
356struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 352struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
357 struct trace_array_cpu *data); 353 struct trace_array_cpu *data);
@@ -359,6 +355,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
359struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 355struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
360 int *ent_cpu, u64 *ent_ts); 356 int *ent_cpu, u64 *ent_ts);
361 357
358void __buffer_unlock_commit(struct ring_buffer *buffer,
359 struct ring_buffer_event *event);
360
362int trace_empty(struct trace_iterator *iter); 361int trace_empty(struct trace_iterator *iter);
363 362
364void *trace_find_next_entry_inc(struct trace_iterator *iter); 363void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -367,7 +366,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
367 366
368void tracing_iter_reset(struct trace_iterator *iter, int cpu); 367void tracing_iter_reset(struct trace_iterator *iter, int cpu);
369 368
370void default_wait_pipe(struct trace_iterator *iter);
371void poll_wait_pipe(struct trace_iterator *iter); 369void poll_wait_pipe(struct trace_iterator *iter);
372 370
373void ftrace(struct trace_array *tr, 371void ftrace(struct trace_array *tr,
@@ -407,12 +405,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr);
407void tracing_stop_sched_switch_record(void); 405void tracing_stop_sched_switch_record(void);
408void tracing_start_sched_switch_record(void); 406void tracing_start_sched_switch_record(void);
409int register_tracer(struct tracer *type); 407int register_tracer(struct tracer *type);
410void unregister_tracer(struct tracer *type);
411int is_tracing_stopped(void); 408int is_tracing_stopped(void);
412enum trace_file_type {
413 TRACE_FILE_LAT_FMT = 1,
414 TRACE_FILE_ANNOTATE = 2,
415};
416 409
417extern cpumask_var_t __read_mostly tracing_buffer_mask; 410extern cpumask_var_t __read_mostly tracing_buffer_mask;
418 411
@@ -472,11 +465,11 @@ extern void trace_find_cmdline(int pid, char comm[]);
472 465
473#ifdef CONFIG_DYNAMIC_FTRACE 466#ifdef CONFIG_DYNAMIC_FTRACE
474extern unsigned long ftrace_update_tot_cnt; 467extern unsigned long ftrace_update_tot_cnt;
468#endif
475#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 469#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
476extern int DYN_FTRACE_TEST_NAME(void); 470extern int DYN_FTRACE_TEST_NAME(void);
477#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 471#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
478extern int DYN_FTRACE_TEST_NAME2(void); 472extern int DYN_FTRACE_TEST_NAME2(void);
479#endif
480 473
481extern int ring_buffer_expanded; 474extern int ring_buffer_expanded;
482extern bool tracing_selftest_disabled; 475extern bool tracing_selftest_disabled;
@@ -680,6 +673,7 @@ enum trace_iterator_flags {
680 TRACE_ITER_OVERWRITE = 0x200000, 673 TRACE_ITER_OVERWRITE = 0x200000,
681 TRACE_ITER_STOP_ON_FREE = 0x400000, 674 TRACE_ITER_STOP_ON_FREE = 0x400000,
682 TRACE_ITER_IRQ_INFO = 0x800000, 675 TRACE_ITER_IRQ_INFO = 0x800000,
676 TRACE_ITER_MARKERS = 0x1000000,
683}; 677};
684 678
685/* 679/*
@@ -840,6 +834,7 @@ extern const char *__start___trace_bprintk_fmt[];
840extern const char *__stop___trace_bprintk_fmt[]; 834extern const char *__stop___trace_bprintk_fmt[];
841 835
842void trace_printk_init_buffers(void); 836void trace_printk_init_buffers(void);
837void trace_printk_start_comm(void);
843 838
844#undef FTRACE_ENTRY 839#undef FTRACE_ENTRY
845#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 840#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
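
TRACE_ITER_MARKERS extends the trace_iterator_flags bitmask, and the tracing_mark_write() hunk earlier rejects writes while the bit is clear. A minimal illustration of that gate (the flag values follow the enum above; the rest is scaffolding for the example):

#include <stdio.h>
#include <errno.h>

enum trace_iterator_flags {
        TRACE_ITER_IRQ_INFO = 0x800000,
        TRACE_ITER_MARKERS  = 0x1000000,
};

static unsigned long trace_flags = TRACE_ITER_MARKERS;

static int mark_write(const char *msg)
{
        if (!(trace_flags & TRACE_ITER_MARKERS))
                return -EINVAL;         /* markers disabled: reject the write */
        printf("marker: %s\n", msg);
        return 0;
}

int main(void)
{
        mark_write("first");                    /* accepted */
        trace_flags &= ~TRACE_ITER_MARKERS;     /* like echo 0 > options/markers */
        printf("second -> %d\n", mark_write("second"));  /* -EINVAL */
        return 0;
}
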
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 8d3538b4ea5f..95e96842ed29 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
77 entry->correct = val == expect; 77 entry->correct = val == expect;
78 78
79 if (!filter_check_discard(call, entry, buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
80 ring_buffer_unlock_commit(buffer, event); 80 __buffer_unlock_commit(buffer, event);
81 81
82 out: 82 out:
83 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
@@ -199,7 +199,7 @@ __init static int init_branch_tracer(void)
199 } 199 }
200 return register_tracer(&branch_trace); 200 return register_tracer(&branch_trace);
201} 201}
202device_initcall(init_branch_tracer); 202core_initcall(init_branch_tracer);
203 203
204#else 204#else
205static inline 205static inline
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 8a6d2ee2086c..84b1e045faba 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -258,7 +258,8 @@ EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
258 258
259#ifdef CONFIG_FUNCTION_TRACER 259#ifdef CONFIG_FUNCTION_TRACER
260static void 260static void
261perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) 261perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
262 struct ftrace_ops *ops, struct pt_regs *pt_regs)
262{ 263{
263 struct ftrace_entry *entry; 264 struct ftrace_entry *entry;
264 struct hlist_head *head; 265 struct hlist_head *head;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 29111da1d100..880073d0b946 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -491,19 +491,6 @@ static void t_stop(struct seq_file *m, void *p)
491 mutex_unlock(&event_mutex); 491 mutex_unlock(&event_mutex);
492} 492}
493 493
494static int
495ftrace_event_seq_open(struct inode *inode, struct file *file)
496{
497 const struct seq_operations *seq_ops;
498
499 if ((file->f_mode & FMODE_WRITE) &&
500 (file->f_flags & O_TRUNC))
501 ftrace_clear_events();
502
503 seq_ops = inode->i_private;
504 return seq_open(file, seq_ops);
505}
506
507static ssize_t 494static ssize_t
508event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 495event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
509 loff_t *ppos) 496 loff_t *ppos)
@@ -980,6 +967,9 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
980 return r; 967 return r;
981} 968}
982 969
970static int ftrace_event_avail_open(struct inode *inode, struct file *file);
971static int ftrace_event_set_open(struct inode *inode, struct file *file);
972
983static const struct seq_operations show_event_seq_ops = { 973static const struct seq_operations show_event_seq_ops = {
984 .start = t_start, 974 .start = t_start,
985 .next = t_next, 975 .next = t_next,
@@ -995,14 +985,14 @@ static const struct seq_operations show_set_event_seq_ops = {
995}; 985};
996 986
997static const struct file_operations ftrace_avail_fops = { 987static const struct file_operations ftrace_avail_fops = {
998 .open = ftrace_event_seq_open, 988 .open = ftrace_event_avail_open,
999 .read = seq_read, 989 .read = seq_read,
1000 .llseek = seq_lseek, 990 .llseek = seq_lseek,
1001 .release = seq_release, 991 .release = seq_release,
1002}; 992};
1003 993
1004static const struct file_operations ftrace_set_event_fops = { 994static const struct file_operations ftrace_set_event_fops = {
1005 .open = ftrace_event_seq_open, 995 .open = ftrace_event_set_open,
1006 .read = seq_read, 996 .read = seq_read,
1007 .write = ftrace_event_write, 997 .write = ftrace_event_write,
1008 .llseek = seq_lseek, 998 .llseek = seq_lseek,
@@ -1078,6 +1068,26 @@ static struct dentry *event_trace_events_dir(void)
1078 return d_events; 1068 return d_events;
1079} 1069}
1080 1070
1071static int
1072ftrace_event_avail_open(struct inode *inode, struct file *file)
1073{
1074 const struct seq_operations *seq_ops = &show_event_seq_ops;
1075
1076 return seq_open(file, seq_ops);
1077}
1078
1079static int
1080ftrace_event_set_open(struct inode *inode, struct file *file)
1081{
1082 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1083
1084 if ((file->f_mode & FMODE_WRITE) &&
1085 (file->f_flags & O_TRUNC))
1086 ftrace_clear_events();
1087
1088 return seq_open(file, seq_ops);
1089}
1090
1081static struct dentry * 1091static struct dentry *
1082event_subsystem_dir(const char *name, struct dentry *d_events) 1092event_subsystem_dir(const char *name, struct dentry *d_events)
1083{ 1093{
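
Splitting the shared ftrace_event_seq_open() into ftrace_event_avail_open() and ftrace_event_set_open() means only the writable set_event path clears the enabled events on O_TRUNC, and each file names its seq_operations directly instead of fishing them out of inode->i_private. A rough userspace analog of that split (the flag values and the clear step are stand-ins):

#include <stdio.h>

#define FMODE_WRITE 0x2         /* stand-in values */
#define O_TRUNC     0x200

static void clear_events(void)
{
        printf("  (all enabled events cleared)\n");
}

static int avail_open(int f_mode, int f_flags)
{
        (void)f_mode;
        (void)f_flags;          /* read-only view: never clears anything */
        printf("available_events opened\n");
        return 0;
}

static int set_open(int f_mode, int f_flags)
{
        if ((f_mode & FMODE_WRITE) && (f_flags & O_TRUNC))
                clear_events();
        printf("set_event opened\n");
        return 0;
}

int main(void)
{
        avail_open(0, O_TRUNC);             /* no side effects */
        set_open(FMODE_WRITE, O_TRUNC);     /* truncating write clears */
        return 0;
}
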
@@ -1199,6 +1209,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1199 return 0; 1209 return 0;
1200} 1210}
1201 1211
1212static void event_remove(struct ftrace_event_call *call)
1213{
1214 ftrace_event_enable_disable(call, 0);
1215 if (call->event.funcs)
1216 __unregister_ftrace_event(&call->event);
1217 list_del(&call->list);
1218}
1219
1220static int event_init(struct ftrace_event_call *call)
1221{
1222 int ret = 0;
1223
1224 if (WARN_ON(!call->name))
1225 return -EINVAL;
1226
1227 if (call->class->raw_init) {
1228 ret = call->class->raw_init(call);
1229 if (ret < 0 && ret != -ENOSYS)
1230 pr_warn("Could not initialize trace events/%s\n",
1231 call->name);
1232 }
1233
1234 return ret;
1235}
1236
1202static int 1237static int
1203__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, 1238__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1204 const struct file_operations *id, 1239 const struct file_operations *id,
@@ -1209,19 +1244,9 @@ __trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1209 struct dentry *d_events; 1244 struct dentry *d_events;
1210 int ret; 1245 int ret;
1211 1246
1212 /* The linker may leave blanks */ 1247 ret = event_init(call);
1213 if (!call->name) 1248 if (ret < 0)
1214 return -EINVAL; 1249 return ret;
1215
1216 if (call->class->raw_init) {
1217 ret = call->class->raw_init(call);
1218 if (ret < 0) {
1219 if (ret != -ENOSYS)
1220 pr_warning("Could not initialize trace events/%s\n",
1221 call->name);
1222 return ret;
1223 }
1224 }
1225 1250
1226 d_events = event_trace_events_dir(); 1251 d_events = event_trace_events_dir();
1227 if (!d_events) 1252 if (!d_events)
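
event_init() centralizes the old open-coded check: a missing name is an error, raw_init() returning -ENOSYS is treated as "nothing to do", and only other failures are reported. The same pattern in isolation, with stand-in structures:

#include <stdio.h>
#include <errno.h>

struct fake_event {
        const char *name;
        int (*raw_init)(struct fake_event *ev);
};

static int init_ok(struct fake_event *ev)    { (void)ev; return 0; }
static int init_unsup(struct fake_event *ev) { (void)ev; return -ENOSYS; }
static int init_fail(struct fake_event *ev)  { (void)ev; return -ENOMEM; }

static int event_init(struct fake_event *ev)
{
        int ret = 0;

        if (!ev->name)
                return -EINVAL;         /* the linker may leave blanks */

        if (ev->raw_init) {
                ret = ev->raw_init(ev);
                if (ret < 0 && ret != -ENOSYS)
                        fprintf(stderr,
                                "Could not initialize trace events/%s\n",
                                ev->name);
        }
        return ret;
}

int main(void)
{
        struct fake_event evs[] = {
                { "sched_switch", init_ok },
                { "legacy_event", init_unsup },
                { "broken_event", init_fail },
        };

        for (unsigned i = 0; i < sizeof(evs) / sizeof(evs[0]); i++)
                printf("%s -> %d\n", evs[i].name, event_init(&evs[i]));
        return 0;
}
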
@@ -1272,13 +1297,10 @@ static void remove_subsystem_dir(const char *name)
1272 */ 1297 */
1273static void __trace_remove_event_call(struct ftrace_event_call *call) 1298static void __trace_remove_event_call(struct ftrace_event_call *call)
1274{ 1299{
1275 ftrace_event_enable_disable(call, 0); 1300 event_remove(call);
1276 if (call->event.funcs)
1277 __unregister_ftrace_event(&call->event);
1278 debugfs_remove_recursive(call->dir);
1279 list_del(&call->list);
1280 trace_destroy_fields(call); 1301 trace_destroy_fields(call);
1281 destroy_preds(call); 1302 destroy_preds(call);
1303 debugfs_remove_recursive(call->dir);
1282 remove_subsystem_dir(call->class->system); 1304 remove_subsystem_dir(call->class->system);
1283} 1305}
1284 1306
@@ -1450,30 +1472,59 @@ static __init int setup_trace_event(char *str)
1450} 1472}
1451__setup("trace_event=", setup_trace_event); 1473__setup("trace_event=", setup_trace_event);
1452 1474
1475static __init int event_trace_enable(void)
1476{
1477 struct ftrace_event_call **iter, *call;
1478 char *buf = bootup_event_buf;
1479 char *token;
1480 int ret;
1481
1482 for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
1483
1484 call = *iter;
1485 ret = event_init(call);
1486 if (!ret)
1487 list_add(&call->list, &ftrace_events);
1488 }
1489
1490 while (true) {
1491 token = strsep(&buf, ",");
1492
1493 if (!token)
1494 break;
1495 if (!*token)
1496 continue;
1497
1498 ret = ftrace_set_clr_event(token, 1);
1499 if (ret)
1500 pr_warn("Failed to enable trace event: %s\n", token);
1501 }
1502
1503 trace_printk_start_comm();
1504
1505 return 0;
1506}
1507
1453static __init int event_trace_init(void) 1508static __init int event_trace_init(void)
1454{ 1509{
1455 struct ftrace_event_call **call; 1510 struct ftrace_event_call *call;
1456 struct dentry *d_tracer; 1511 struct dentry *d_tracer;
1457 struct dentry *entry; 1512 struct dentry *entry;
1458 struct dentry *d_events; 1513 struct dentry *d_events;
1459 int ret; 1514 int ret;
1460 char *buf = bootup_event_buf;
1461 char *token;
1462 1515
1463 d_tracer = tracing_init_dentry(); 1516 d_tracer = tracing_init_dentry();
1464 if (!d_tracer) 1517 if (!d_tracer)
1465 return 0; 1518 return 0;
1466 1519
1467 entry = debugfs_create_file("available_events", 0444, d_tracer, 1520 entry = debugfs_create_file("available_events", 0444, d_tracer,
1468 (void *)&show_event_seq_ops, 1521 NULL, &ftrace_avail_fops);
1469 &ftrace_avail_fops);
1470 if (!entry) 1522 if (!entry)
1471 pr_warning("Could not create debugfs " 1523 pr_warning("Could not create debugfs "
1472 "'available_events' entry\n"); 1524 "'available_events' entry\n");
1473 1525
1474 entry = debugfs_create_file("set_event", 0644, d_tracer, 1526 entry = debugfs_create_file("set_event", 0644, d_tracer,
1475 (void *)&show_set_event_seq_ops, 1527 NULL, &ftrace_set_event_fops);
1476 &ftrace_set_event_fops);
1477 if (!entry) 1528 if (!entry)
1478 pr_warning("Could not create debugfs " 1529 pr_warning("Could not create debugfs "
1479 "'set_event' entry\n"); 1530 "'set_event' entry\n");
@@ -1497,24 +1548,19 @@ static __init int event_trace_init(void)
1497 if (trace_define_common_fields()) 1548 if (trace_define_common_fields())
1498 pr_warning("tracing: Failed to allocate common fields"); 1549 pr_warning("tracing: Failed to allocate common fields");
1499 1550
1500 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1551 /*
1501 __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, 1552 * Early initialization already enabled ftrace event.
1553 * Now it's only necessary to create the event directory.
1554 */
1555 list_for_each_entry(call, &ftrace_events, list) {
1556
1557 ret = event_create_dir(call, d_events,
1558 &ftrace_event_id_fops,
1502 &ftrace_enable_fops, 1559 &ftrace_enable_fops,
1503 &ftrace_event_filter_fops, 1560 &ftrace_event_filter_fops,
1504 &ftrace_event_format_fops); 1561 &ftrace_event_format_fops);
1505 } 1562 if (ret < 0)
1506 1563 event_remove(call);
1507 while (true) {
1508 token = strsep(&buf, ",");
1509
1510 if (!token)
1511 break;
1512 if (!*token)
1513 continue;
1514
1515 ret = ftrace_set_clr_event(token, 1);
1516 if (ret)
1517 pr_warning("Failed to enable trace event: %s\n", token);
1518 } 1564 }
1519 1565
1520 ret = register_module_notifier(&trace_module_nb); 1566 ret = register_module_notifier(&trace_module_nb);
@@ -1523,6 +1569,7 @@ static __init int event_trace_init(void)
1523 1569
1524 return 0; 1570 return 0;
1525} 1571}
1572core_initcall(event_trace_enable);
1526fs_initcall(event_trace_init); 1573fs_initcall(event_trace_init);
1527 1574
1528#ifdef CONFIG_FTRACE_STARTUP_TEST 1575#ifdef CONFIG_FTRACE_STARTUP_TEST
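
event_trace_enable() runs at core_initcall time and walks the linker-built table of event calls between __start_ftrace_events and __stop_ftrace_events, leaving event_trace_init() (fs_initcall) to create only the debugfs entries. A GCC/Clang-on-ELF sketch of iterating such a linker section, with invented section and type names:

#include <stdio.h>

struct fake_event {
        const char *name;
};

/* Register an event by dropping a pointer to it into a named section,
 * roughly how the ftrace event macros populate _ftrace_events. */
#define DEFINE_FAKE_EVENT(ident)                                        \
        static struct fake_event ident = { .name = #ident };           \
        static struct fake_event *const __ptr_##ident                  \
        __attribute__((used, section("fake_events"))) = &ident

DEFINE_FAKE_EVENT(sched_switch);
DEFINE_FAKE_EVENT(irq_handler_entry);

/* For an ELF section whose name is a valid C identifier, the linker
 * provides __start_<name> and __stop_<name> symbols automatically. */
extern struct fake_event *const __start_fake_events[];
extern struct fake_event *const __stop_fake_events[];

int main(void)
{
        for (struct fake_event *const *iter = __start_fake_events;
             iter < __stop_fake_events; iter++)
                printf("found event: %s\n", (*iter)->name);
        return 0;
}

The kernel's real walk is for_each_event() over struct ftrace_event_call pointers; the mechanism (start/stop symbols bracketing a named section) is the same.
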
@@ -1646,9 +1693,11 @@ static __init void event_trace_self_tests(void)
1646 event_test_stuff(); 1693 event_test_stuff();
1647 1694
1648 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); 1695 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
1649 if (WARN_ON_ONCE(ret)) 1696 if (WARN_ON_ONCE(ret)) {
1650 pr_warning("error disabling system %s\n", 1697 pr_warning("error disabling system %s\n",
1651 system->name); 1698 system->name);
1699 continue;
1700 }
1652 1701
1653 pr_cont("OK\n"); 1702 pr_cont("OK\n");
1654 } 1703 }
@@ -1681,7 +1730,8 @@ static __init void event_trace_self_tests(void)
1681static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); 1730static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1682 1731
1683static void 1732static void
1684function_test_events_call(unsigned long ip, unsigned long parent_ip) 1733function_test_events_call(unsigned long ip, unsigned long parent_ip,
1734 struct ftrace_ops *op, struct pt_regs *pt_regs)
1685{ 1735{
1686 struct ring_buffer_event *event; 1736 struct ring_buffer_event *event;
1687 struct ring_buffer *buffer; 1737 struct ring_buffer *buffer;
@@ -1710,7 +1760,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1710 entry->ip = ip; 1760 entry->ip = ip;
1711 entry->parent_ip = parent_ip; 1761 entry->parent_ip = parent_ip;
1712 1762
1713 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); 1763 trace_buffer_unlock_commit(buffer, event, flags, pc);
1714 1764
1715 out: 1765 out:
1716 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1766 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
@@ -1720,6 +1770,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1720static struct ftrace_ops trace_ops __initdata = 1770static struct ftrace_ops trace_ops __initdata =
1721{ 1771{
1722 .func = function_test_events_call, 1772 .func = function_test_events_call,
1773 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
1723}; 1774};
1724 1775
1725static __init void event_trace_self_test_with_function(void) 1776static __init void event_trace_self_test_with_function(void)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 431dba8b7542..e5b0ca8b8d4d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1000,9 +1000,9 @@ static int init_pred(struct filter_parse_state *ps,
1000 } 1000 }
1001 } else { 1001 } else {
1002 if (field->is_signed) 1002 if (field->is_signed)
1003 ret = strict_strtoll(pred->regex.pattern, 0, &val); 1003 ret = kstrtoll(pred->regex.pattern, 0, &val);
1004 else 1004 else
1005 ret = strict_strtoull(pred->regex.pattern, 0, &val); 1005 ret = kstrtoull(pred->regex.pattern, 0, &val);
1006 if (ret) { 1006 if (ret) {
1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
1008 return -EINVAL; 1008 return -EINVAL;
@@ -2002,7 +2002,7 @@ static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter,
2002static int __ftrace_function_set_filter(int filter, char *buf, int len, 2002static int __ftrace_function_set_filter(int filter, char *buf, int len,
2003 struct function_filter_data *data) 2003 struct function_filter_data *data)
2004{ 2004{
2005 int i, re_cnt, ret; 2005 int i, re_cnt, ret = -EINVAL;
2006 int *reset; 2006 int *reset;
2007 char **re; 2007 char **re;
2008 2008
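
The filter code now uses kstrtoll()/kstrtoull(), which fail on trailing junk instead of silently stopping at the first non-digit. A userspace approximation of that strictness built on strtoull():

/*
 * Userspace analog of kstrtoull(): the whole string must be a number
 * (this sketch ignores the trailing-newline allowance), and range
 * errors are reported rather than saturating silently.
 */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static int strict_parse_u64(const char *s, unsigned int base,
                            unsigned long long *res)
{
        char *end;

        errno = 0;
        *res = strtoull(s, &end, base);
        if (errno == ERANGE)
                return -ERANGE;
        if (end == s || *end != '\0')
                return -EINVAL;         /* empty string or trailing junk */
        return 0;
}

int main(void)
{
        unsigned long long val;
        int ret;

        printf("\"42\"   -> %d\n", strict_parse_u64("42", 0, &val));
        printf("\"42xy\" -> %d\n", strict_parse_u64("42xy", 0, &val));
        ret = strict_parse_u64("0x10", 0, &val);
        printf("\"0x10\" -> %d (val=%llu)\n", ret, val);
        return 0;
}
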
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index a426f410c060..8e3ad8082ab7 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -7,13 +7,12 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/ring_buffer.h> 12#include <linux/ring_buffer.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/pstore.h>
17#include <linux/fs.h> 16#include <linux/fs.h>
18 17
19#include "trace.h" 18#include "trace.h"
@@ -49,7 +48,8 @@ static void function_trace_start(struct trace_array *tr)
49} 48}
50 49
51static void 50static void
52function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) 51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
52 struct ftrace_ops *op, struct pt_regs *pt_regs)
53{ 53{
54 struct trace_array *tr = func_trace; 54 struct trace_array *tr = func_trace;
55 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
@@ -75,16 +75,17 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
75 preempt_enable_notrace(); 75 preempt_enable_notrace();
76} 76}
77 77
78/* Our two options */ 78/* Our option */
79enum { 79enum {
80 TRACE_FUNC_OPT_STACK = 0x1, 80 TRACE_FUNC_OPT_STACK = 0x1,
81 TRACE_FUNC_OPT_PSTORE = 0x2,
82}; 81};
83 82
84static struct tracer_flags func_flags; 83static struct tracer_flags func_flags;
85 84
86static void 85static void
87function_trace_call(unsigned long ip, unsigned long parent_ip) 86function_trace_call(unsigned long ip, unsigned long parent_ip,
87 struct ftrace_ops *op, struct pt_regs *pt_regs)
88
88{ 89{
89 struct trace_array *tr = func_trace; 90 struct trace_array *tr = func_trace;
90 struct trace_array_cpu *data; 91 struct trace_array_cpu *data;
@@ -106,12 +107,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
106 disabled = atomic_inc_return(&data->disabled); 107 disabled = atomic_inc_return(&data->disabled);
107 108
108 if (likely(disabled == 1)) { 109 if (likely(disabled == 1)) {
109 /*
110 * So far tracing doesn't support multiple buffers, so
111 * we make an explicit call for now.
112 */
113 if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
114 pstore_ftrace_call(ip, parent_ip);
115 pc = preempt_count(); 110 pc = preempt_count();
116 trace_function(tr, ip, parent_ip, flags, pc); 111 trace_function(tr, ip, parent_ip, flags, pc);
117 } 112 }
@@ -121,7 +116,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
121} 116}
122 117
123static void 118static void
124function_stack_trace_call(unsigned long ip, unsigned long parent_ip) 119function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
120 struct ftrace_ops *op, struct pt_regs *pt_regs)
125{ 121{
126 struct trace_array *tr = func_trace; 122 struct trace_array *tr = func_trace;
127 struct trace_array_cpu *data; 123 struct trace_array_cpu *data;
@@ -164,22 +160,19 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
164static struct ftrace_ops trace_ops __read_mostly = 160static struct ftrace_ops trace_ops __read_mostly =
165{ 161{
166 .func = function_trace_call, 162 .func = function_trace_call,
167 .flags = FTRACE_OPS_FL_GLOBAL, 163 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
168}; 164};
169 165
170static struct ftrace_ops trace_stack_ops __read_mostly = 166static struct ftrace_ops trace_stack_ops __read_mostly =
171{ 167{
172 .func = function_stack_trace_call, 168 .func = function_stack_trace_call,
173 .flags = FTRACE_OPS_FL_GLOBAL, 169 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
174}; 170};
175 171
176static struct tracer_opt func_opts[] = { 172static struct tracer_opt func_opts[] = {
177#ifdef CONFIG_STACKTRACE 173#ifdef CONFIG_STACKTRACE
178 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 174 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
179#endif 175#endif
180#ifdef CONFIG_PSTORE_FTRACE
181 { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
182#endif
183 { } /* Always set a last empty entry */ 176 { } /* Always set a last empty entry */
184}; 177};
185 178
@@ -232,8 +225,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
232 } 225 }
233 226
234 break; 227 break;
235 case TRACE_FUNC_OPT_PSTORE:
236 break;
237 default: 228 default:
238 return -EINVAL; 229 return -EINVAL;
239 } 230 }
@@ -375,7 +366,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
375 * We use the callback data field (which is a pointer) 366 * We use the callback data field (which is a pointer)
376 * as our counter. 367 * as our counter.
377 */ 368 */
378 ret = strict_strtoul(number, 0, (unsigned long *)&count); 369 ret = kstrtoul(number, 0, (unsigned long *)&count);
379 if (ret) 370 if (ret)
380 return ret; 371 return ret;
381 372
@@ -420,5 +411,4 @@ static __init int init_function_trace(void)
420 init_func_cmd_traceon(); 411 init_func_cmd_traceon();
421 return register_tracer(&function_trace); 412 return register_tracer(&function_trace);
422} 413}
423device_initcall(init_function_trace); 414core_initcall(init_function_trace);
424
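
Throughout these hunks the ftrace callback prototype gains two parameters: the ftrace_ops that registered it and a pt_regs pointer (NULL unless register saving was requested). A stand-in sketch of the widened calling convention; the struct layouts and flag value below are placeholders, not the kernel's:

#include <stdio.h>

struct pt_regs;                         /* opaque in this sketch */

struct ftrace_ops {
        void (*func)(unsigned long ip, unsigned long parent_ip,
                     struct ftrace_ops *op, struct pt_regs *regs);
        unsigned long flags;
};

#define FTRACE_OPS_FL_RECURSION_SAFE 0x1        /* stand-in value */

static void my_trace_call(unsigned long ip, unsigned long parent_ip,
                          struct ftrace_ops *op, struct pt_regs *regs)
{
        (void)regs;                     /* NULL unless regs were requested */
        printf("traced ip=%#lx caller=%#lx via ops flags=%#lx\n",
               ip, parent_ip, op->flags);
}

static struct ftrace_ops my_ops = {
        .func  = my_trace_call,
        .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};

int main(void)
{
        /* stand-in for the mcount/fentry hook invoking the callback */
        my_ops.func(0x1000, 0x2000, &my_ops, NULL);
        return 0;
}
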
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ce27c8ba8d31..4edb4b74eb7e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -143,7 +143,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
143 return; 143 return;
144 } 144 }
145 145
146#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST 146#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY)
147 /* 147 /*
148 * The arch may choose to record the frame pointer used 148 * The arch may choose to record the frame pointer used
149 * and check it here to make sure that it is what we expect it 149 * and check it here to make sure that it is what we expect it
@@ -154,6 +154,9 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
154 * 154 *
155 * Currently, x86_32 with optimize for size (-Os) makes the latest 155 * Currently, x86_32 with optimize for size (-Os) makes the latest
156 * gcc do the above. 156 * gcc do the above.
157 *
158 * Note, -mfentry does not use frame pointers, and this test
159 * is not needed if CC_USING_FENTRY is set.
157 */ 160 */
158 if (unlikely(current->ret_stack[index].fp != frame_pointer)) { 161 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
159 ftrace_graph_stop(); 162 ftrace_graph_stop();
@@ -220,7 +223,7 @@ int __trace_graph_entry(struct trace_array *tr,
220 entry = ring_buffer_event_data(event); 223 entry = ring_buffer_event_data(event);
221 entry->graph_ent = *trace; 224 entry->graph_ent = *trace;
222 if (!filter_current_check_discard(buffer, call, entry, event)) 225 if (!filter_current_check_discard(buffer, call, entry, event))
223 ring_buffer_unlock_commit(buffer, event); 226 __buffer_unlock_commit(buffer, event);
224 227
225 return 1; 228 return 1;
226} 229}
@@ -324,7 +327,7 @@ void __trace_graph_return(struct trace_array *tr,
324 entry = ring_buffer_event_data(event); 327 entry = ring_buffer_event_data(event);
325 entry->ret = *trace; 328 entry->ret = *trace;
326 if (!filter_current_check_discard(buffer, call, entry, event)) 329 if (!filter_current_check_discard(buffer, call, entry, event))
327 ring_buffer_unlock_commit(buffer, event); 330 __buffer_unlock_commit(buffer, event);
328} 331}
329 332
330void trace_graph_return(struct ftrace_graph_ret *trace) 333void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -1471,4 +1474,4 @@ static __init int init_graph_trace(void)
1471 return register_tracer(&graph_trace); 1474 return register_tracer(&graph_trace);
1472} 1475}
1473 1476
1474device_initcall(init_graph_trace); 1477core_initcall(init_graph_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 99d20e920368..713a2cac4881 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -7,7 +7,7 @@
7 * From code in the latency_tracer, that is: 7 * From code in the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -136,7 +136,8 @@ static int func_prolog_dec(struct trace_array *tr,
136 * irqsoff uses its own tracer function to keep the overhead down: 136 * irqsoff uses its own tracer function to keep the overhead down:
137 */ 137 */
138static void 138static void
139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) 139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
140 struct ftrace_ops *op, struct pt_regs *pt_regs)
140{ 141{
141 struct trace_array *tr = irqsoff_trace; 142 struct trace_array *tr = irqsoff_trace;
142 struct trace_array_cpu *data; 143 struct trace_array_cpu *data;
@@ -153,7 +154,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
153static struct ftrace_ops trace_ops __read_mostly = 154static struct ftrace_ops trace_ops __read_mostly =
154{ 155{
155 .func = irqsoff_tracer_call, 156 .func = irqsoff_tracer_call,
156 .flags = FTRACE_OPS_FL_GLOBAL, 157 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
157}; 158};
158#endif /* CONFIG_FUNCTION_TRACER */ 159#endif /* CONFIG_FUNCTION_TRACER */
159 160
@@ -603,7 +604,7 @@ static struct tracer irqsoff_tracer __read_mostly =
603 .reset = irqsoff_tracer_reset, 604 .reset = irqsoff_tracer_reset,
604 .start = irqsoff_tracer_start, 605 .start = irqsoff_tracer_start,
605 .stop = irqsoff_tracer_stop, 606 .stop = irqsoff_tracer_stop,
606 .print_max = 1, 607 .print_max = true,
607 .print_header = irqsoff_print_header, 608 .print_header = irqsoff_print_header,
608 .print_line = irqsoff_print_line, 609 .print_line = irqsoff_print_line,
609 .flags = &tracer_flags, 610 .flags = &tracer_flags,
@@ -613,7 +614,7 @@ static struct tracer irqsoff_tracer __read_mostly =
613#endif 614#endif
614 .open = irqsoff_trace_open, 615 .open = irqsoff_trace_open,
615 .close = irqsoff_trace_close, 616 .close = irqsoff_trace_close,
616 .use_max_tr = 1, 617 .use_max_tr = true,
617}; 618};
618# define register_irqsoff(trace) register_tracer(&trace) 619# define register_irqsoff(trace) register_tracer(&trace)
619#else 620#else
@@ -636,7 +637,7 @@ static struct tracer preemptoff_tracer __read_mostly =
636 .reset = irqsoff_tracer_reset, 637 .reset = irqsoff_tracer_reset,
637 .start = irqsoff_tracer_start, 638 .start = irqsoff_tracer_start,
638 .stop = irqsoff_tracer_stop, 639 .stop = irqsoff_tracer_stop,
639 .print_max = 1, 640 .print_max = true,
640 .print_header = irqsoff_print_header, 641 .print_header = irqsoff_print_header,
641 .print_line = irqsoff_print_line, 642 .print_line = irqsoff_print_line,
642 .flags = &tracer_flags, 643 .flags = &tracer_flags,
@@ -646,7 +647,7 @@ static struct tracer preemptoff_tracer __read_mostly =
646#endif 647#endif
647 .open = irqsoff_trace_open, 648 .open = irqsoff_trace_open,
648 .close = irqsoff_trace_close, 649 .close = irqsoff_trace_close,
649 .use_max_tr = 1, 650 .use_max_tr = true,
650}; 651};
651# define register_preemptoff(trace) register_tracer(&trace) 652# define register_preemptoff(trace) register_tracer(&trace)
652#else 653#else
@@ -671,7 +672,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
671 .reset = irqsoff_tracer_reset, 672 .reset = irqsoff_tracer_reset,
672 .start = irqsoff_tracer_start, 673 .start = irqsoff_tracer_start,
673 .stop = irqsoff_tracer_stop, 674 .stop = irqsoff_tracer_stop,
674 .print_max = 1, 675 .print_max = true,
675 .print_header = irqsoff_print_header, 676 .print_header = irqsoff_print_header,
676 .print_line = irqsoff_print_line, 677 .print_line = irqsoff_print_line,
677 .flags = &tracer_flags, 678 .flags = &tracer_flags,
@@ -681,7 +682,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
681#endif 682#endif
682 .open = irqsoff_trace_open, 683 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close, 684 .close = irqsoff_trace_close,
684 .use_max_tr = 1, 685 .use_max_tr = true,
685}; 686};
686 687
687# define register_preemptirqsoff(trace) register_tracer(&trace) 688# define register_preemptirqsoff(trace) register_tracer(&trace)
@@ -697,4 +698,4 @@ __init static int init_irqsoff_tracer(void)
697 698
698 return 0; 699 return 0;
699} 700}
700device_initcall(init_irqsoff_tracer); 701core_initcall(init_irqsoff_tracer);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1a2117043bb1..1865d5f76538 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -444,7 +444,7 @@ static int create_trace_probe(int argc, char **argv)
444 return -EINVAL; 444 return -EINVAL;
445 } 445 }
446 /* an address specified */ 446 /* an address specified */
447 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); 447 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
448 if (ret) { 448 if (ret) {
449 pr_info("Failed to parse address.\n"); 449 pr_info("Failed to parse address.\n");
450 return ret; 450 return ret;
@@ -751,8 +751,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
752 752
753 if (!filter_current_check_discard(buffer, call, entry, event)) 753 if (!filter_current_check_discard(buffer, call, entry, event))
754 trace_nowake_buffer_unlock_commit_regs(buffer, event, 754 trace_buffer_unlock_commit_regs(buffer, event,
755 irq_flags, pc, regs); 755 irq_flags, pc, regs);
756} 756}
757 757
758/* Kretprobe handler */ 758/* Kretprobe handler */
@@ -784,8 +784,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
785 785
786 if (!filter_current_check_discard(buffer, call, entry, event)) 786 if (!filter_current_check_discard(buffer, call, entry, event))
787 trace_nowake_buffer_unlock_commit_regs(buffer, event, 787 trace_buffer_unlock_commit_regs(buffer, event,
788 irq_flags, pc, regs); 788 irq_flags, pc, regs);
789} 789}
790 790
791/* Event entry printers */ 791/* Event entry printers */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 123b189c732c..194d79602dc7 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -610,24 +610,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
610 return trace_print_lat_fmt(s, entry); 610 return trace_print_lat_fmt(s, entry);
611} 611}
612 612
613static unsigned long preempt_mark_thresh = 100; 613static unsigned long preempt_mark_thresh_us = 100;
614 614
615static int 615static int
616lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, 616lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
617 unsigned long rel_usecs)
618{ 617{
619 return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, 618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
620 rel_usecs > preempt_mark_thresh ? '!' : 619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
621 rel_usecs > 1 ? '+' : ' '); 620 unsigned long long abs_ts = iter->ts - iter->tr->time_start;
621 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq;
623
624 if (in_ns) {
625 abs_ts = ns2usecs(abs_ts);
626 rel_ts = ns2usecs(rel_ts);
627 }
628
629 if (verbose && in_ns) {
630 unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC);
631 unsigned long abs_msec = (unsigned long)abs_ts;
632 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
633 unsigned long rel_msec = (unsigned long)rel_ts;
634
635 return trace_seq_printf(
636 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
637 ns2usecs(iter->ts),
638 abs_msec, abs_usec,
639 rel_msec, rel_usec);
640 } else if (verbose && !in_ns) {
641 return trace_seq_printf(
642 s, "[%016llx] %lld (+%lld): ",
643 iter->ts, abs_ts, rel_ts);
644 } else if (!verbose && in_ns) {
645 return trace_seq_printf(
646 s, " %4lldus%c: ",
647 abs_ts,
648 rel_ts > preempt_mark_thresh_us ? '!' :
649 rel_ts > 1 ? '+' : ' ');
650 } else { /* !verbose && !in_ns */
651 return trace_seq_printf(s, " %4lld: ", abs_ts);
652 }
622} 653}
623 654
624int trace_print_context(struct trace_iterator *iter) 655int trace_print_context(struct trace_iterator *iter)
625{ 656{
626 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
627 struct trace_entry *entry = iter->ent; 658 struct trace_entry *entry = iter->ent;
628 unsigned long long t = ns2usecs(iter->ts); 659 unsigned long long t;
629 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 660 unsigned long secs, usec_rem;
630 unsigned long secs = (unsigned long)t;
631 char comm[TASK_COMM_LEN]; 661 char comm[TASK_COMM_LEN];
632 int ret; 662 int ret;
633 663
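
In the common non-verbose, nanosecond-clock case the new lat_print_timestamp() keeps the old output: microseconds, tagged '!' past the 100us mark threshold and '+' past 1us of delay to the next event. A userspace rendering of just that branch, with made-up values:

#include <stdio.h>
#include <inttypes.h>

#define NSEC_PER_USEC          1000ULL
#define PREEMPT_MARK_THRESH_US 100ULL

static void print_lat(uint64_t abs_ns, uint64_t rel_ns)
{
        uint64_t abs_us = abs_ns / NSEC_PER_USEC;
        uint64_t rel_us = rel_ns / NSEC_PER_USEC;

        printf(" %4" PRIu64 "us%c: event\n", abs_us,
               rel_us > PREEMPT_MARK_THRESH_US ? '!' :
               rel_us > 1 ? '+' : ' ');
}

int main(void)
{
        print_lat(12000, 500);          /*   12us : short gap        */
        print_lat(13000, 5000);         /*   13us+: noticeable gap   */
        print_lat(20000, 250000);       /*   20us!: over threshold   */
        return 0;
}
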
@@ -644,8 +674,13 @@ int trace_print_context(struct trace_iterator *iter)
644 return 0; 674 return 0;
645 } 675 }
646 676
647 return trace_seq_printf(s, " %5lu.%06lu: ", 677 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
648 secs, usec_rem); 678 t = ns2usecs(iter->ts);
679 usec_rem = do_div(t, USEC_PER_SEC);
680 secs = (unsigned long)t;
681 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
682 } else
683 return trace_seq_printf(s, " %12llu: ", iter->ts);
649} 684}
650 685
651int trace_print_lat_context(struct trace_iterator *iter) 686int trace_print_lat_context(struct trace_iterator *iter)
@@ -659,36 +694,29 @@ int trace_print_lat_context(struct trace_iterator *iter)
659 *next_entry = trace_find_next_entry(iter, NULL, 694 *next_entry = trace_find_next_entry(iter, NULL,
660 &next_ts); 695 &next_ts);
661 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); 696 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
662 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
663 unsigned long rel_usecs;
664 697
665 /* Restore the original ent_size */ 698 /* Restore the original ent_size */
666 iter->ent_size = ent_size; 699 iter->ent_size = ent_size;
667 700
668 if (!next_entry) 701 if (!next_entry)
669 next_ts = iter->ts; 702 next_ts = iter->ts;
670 rel_usecs = ns2usecs(next_ts - iter->ts);
671 703
672 if (verbose) { 704 if (verbose) {
673 char comm[TASK_COMM_LEN]; 705 char comm[TASK_COMM_LEN];
674 706
675 trace_find_cmdline(entry->pid, comm); 707 trace_find_cmdline(entry->pid, comm);
676 708
677 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" 709 ret = trace_seq_printf(
678 " %ld.%03ldms (+%ld.%03ldms): ", comm, 710 s, "%16s %5d %3d %d %08x %08lx ",
679 entry->pid, iter->cpu, entry->flags, 711 comm, entry->pid, iter->cpu, entry->flags,
680 entry->preempt_count, iter->idx, 712 entry->preempt_count, iter->idx);
681 ns2usecs(iter->ts),
682 abs_usecs / USEC_PER_MSEC,
683 abs_usecs % USEC_PER_MSEC,
684 rel_usecs / USEC_PER_MSEC,
685 rel_usecs % USEC_PER_MSEC);
686 } else { 713 } else {
687 ret = lat_print_generic(s, entry, iter->cpu); 714 ret = lat_print_generic(s, entry, iter->cpu);
688 if (ret)
689 ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
690 } 715 }
691 716
717 if (ret)
718 ret = lat_print_timestamp(iter, next_ts);
719
692 return ret; 720 return ret;
693} 721}
694 722
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index daa9980153af..412e959709b4 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -441,7 +441,7 @@ static const struct fetch_type *find_fetch_type(const char *type)
441 goto fail; 441 goto fail;
442 442
443 type++; 443 type++;
444 if (strict_strtoul(type, 0, &bs)) 444 if (kstrtoul(type, 0, &bs))
445 goto fail; 445 goto fail;
446 446
447 switch (bs) { 447 switch (bs) {
@@ -501,8 +501,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
501 501
502 tmp = strchr(symbol, '+'); 502 tmp = strchr(symbol, '+');
503 if (tmp) { 503 if (tmp) {
504 /* skip sign because strict_strtol doesn't accept '+' */ 504 /* skip sign because kstrtoul doesn't accept '+' */
505 ret = strict_strtoul(tmp + 1, 0, offset); 505 ret = kstrtoul(tmp + 1, 0, offset);
506 if (ret) 506 if (ret)
507 return ret; 507 return ret;
508 508
@@ -533,7 +533,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
533 else 533 else
534 ret = -EINVAL; 534 ret = -EINVAL;
535 } else if (isdigit(arg[5])) { 535 } else if (isdigit(arg[5])) {
536 ret = strict_strtoul(arg + 5, 10, &param); 536 ret = kstrtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK) 537 if (ret || param > PARAM_MAX_STACK)
538 ret = -EINVAL; 538 ret = -EINVAL;
539 else { 539 else {
@@ -579,7 +579,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
579 579
580 case '@': /* memory or symbol */ 580 case '@': /* memory or symbol */
581 if (isdigit(arg[1])) { 581 if (isdigit(arg[1])) {
582 ret = strict_strtoul(arg + 1, 0, &param); 582 ret = kstrtoul(arg + 1, 0, &param);
583 if (ret) 583 if (ret)
584 break; 584 break;
585 585
@@ -597,14 +597,14 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
597 break; 597 break;
598 598
599 case '+': /* deref memory */ 599 case '+': /* deref memory */
600 arg++; /* Skip '+', because strict_strtol() rejects it. */ 600 arg++; /* Skip '+', because kstrtol() rejects it. */
601 case '-': 601 case '-':
602 tmp = strchr(arg, '('); 602 tmp = strchr(arg, '(');
603 if (!tmp) 603 if (!tmp)
604 break; 604 break;
605 605
606 *tmp = '\0'; 606 *tmp = '\0';
607 ret = strict_strtol(arg, 0, &offset); 607 ret = kstrtol(arg, 0, &offset);
608 608
609 if (ret) 609 if (ret)
610 break; 610 break;
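
traceprobe_split_symbol_offset() cuts a "symbol+offset" spec at the '+' and skips the sign itself, since kstrtoul() rejects a leading '+'. A userspace analog with strtoul():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

static int split_symbol_offset(char *symbol, unsigned long *offset)
{
        char *tmp, *end;

        *offset = 0;
        tmp = strchr(symbol, '+');
        if (tmp) {
                errno = 0;
                *offset = strtoul(tmp + 1, &end, 0);    /* skip the '+' */
                if (errno || end == tmp + 1 || *end != '\0')
                        return -EINVAL;
                *tmp = '\0';                            /* cut the symbol */
        }
        return 0;
}

int main(void)
{
        char spec[] = "vfs_read+0x10";
        unsigned long offset;

        if (!split_symbol_offset(spec, &offset))
                printf("symbol=%s offset=%#lx\n", spec, offset);
        return 0;
}
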
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 7e62c0a18456..3374c792ccd8 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -102,9 +102,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
102 entry->next_cpu = task_cpu(wakee); 102 entry->next_cpu = task_cpu(wakee);
103 103
104 if (!filter_check_discard(call, entry, buffer, event)) 104 if (!filter_check_discard(call, entry, buffer, event))
105 ring_buffer_unlock_commit(buffer, event); 105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106 ftrace_trace_stack(tr->buffer, flags, 6, pc);
107 ftrace_trace_userstack(tr->buffer, flags, pc);
108} 106}
109 107
110static void 108static void
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ff791ea48b57..9fe45fcefca0 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
@@ -108,7 +108,8 @@ out_enable:
108 * wakeup uses its own tracer function to keep the overhead down: 108 * wakeup uses its own tracer function to keep the overhead down:
109 */ 109 */
110static void 110static void
111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) 111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
112 struct ftrace_ops *op, struct pt_regs *pt_regs)
112{ 113{
113 struct trace_array *tr = wakeup_trace; 114 struct trace_array *tr = wakeup_trace;
114 struct trace_array_cpu *data; 115 struct trace_array_cpu *data;
@@ -129,7 +130,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
129static struct ftrace_ops trace_ops __read_mostly = 130static struct ftrace_ops trace_ops __read_mostly =
130{ 131{
131 .func = wakeup_tracer_call, 132 .func = wakeup_tracer_call,
132 .flags = FTRACE_OPS_FL_GLOBAL, 133 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
133}; 134};
134#endif /* CONFIG_FUNCTION_TRACER */ 135#endif /* CONFIG_FUNCTION_TRACER */
135 136
@@ -588,7 +589,7 @@ static struct tracer wakeup_tracer __read_mostly =
588 .reset = wakeup_tracer_reset, 589 .reset = wakeup_tracer_reset,
589 .start = wakeup_tracer_start, 590 .start = wakeup_tracer_start,
590 .stop = wakeup_tracer_stop, 591 .stop = wakeup_tracer_stop,
591 .print_max = 1, 592 .print_max = true,
592 .print_header = wakeup_print_header, 593 .print_header = wakeup_print_header,
593 .print_line = wakeup_print_line, 594 .print_line = wakeup_print_line,
594 .flags = &tracer_flags, 595 .flags = &tracer_flags,
@@ -598,7 +599,7 @@ static struct tracer wakeup_tracer __read_mostly =
598#endif 599#endif
599 .open = wakeup_trace_open, 600 .open = wakeup_trace_open,
600 .close = wakeup_trace_close, 601 .close = wakeup_trace_close,
601 .use_max_tr = 1, 602 .use_max_tr = true,
602}; 603};
603 604
604static struct tracer wakeup_rt_tracer __read_mostly = 605static struct tracer wakeup_rt_tracer __read_mostly =
@@ -609,7 +610,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
609 .start = wakeup_tracer_start, 610 .start = wakeup_tracer_start,
610 .stop = wakeup_tracer_stop, 611 .stop = wakeup_tracer_stop,
611 .wait_pipe = poll_wait_pipe, 612 .wait_pipe = poll_wait_pipe,
612 .print_max = 1, 613 .print_max = true,
613 .print_header = wakeup_print_header, 614 .print_header = wakeup_print_header,
614 .print_line = wakeup_print_line, 615 .print_line = wakeup_print_line,
615 .flags = &tracer_flags, 616 .flags = &tracer_flags,
@@ -619,7 +620,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
619#endif 620#endif
620 .open = wakeup_trace_open, 621 .open = wakeup_trace_open,
621 .close = wakeup_trace_close, 622 .close = wakeup_trace_close,
622 .use_max_tr = 1, 623 .use_max_tr = true,
623}; 624};
624 625
625__init static int init_wakeup_tracer(void) 626__init static int init_wakeup_tracer(void)
@@ -636,4 +637,4 @@ __init static int init_wakeup_tracer(void)
636 637
637 return 0; 638 return 0;
638} 639}
639device_initcall(init_wakeup_tracer); 640core_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 288541f977fb..47623169a815 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -103,54 +103,67 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
103 103
104static int trace_selftest_test_probe1_cnt; 104static int trace_selftest_test_probe1_cnt;
105static void trace_selftest_test_probe1_func(unsigned long ip, 105static void trace_selftest_test_probe1_func(unsigned long ip,
106 unsigned long pip) 106 unsigned long pip,
107 struct ftrace_ops *op,
108 struct pt_regs *pt_regs)
107{ 109{
108 trace_selftest_test_probe1_cnt++; 110 trace_selftest_test_probe1_cnt++;
109} 111}
110 112
111static int trace_selftest_test_probe2_cnt; 113static int trace_selftest_test_probe2_cnt;
112static void trace_selftest_test_probe2_func(unsigned long ip, 114static void trace_selftest_test_probe2_func(unsigned long ip,
113 unsigned long pip) 115 unsigned long pip,
116 struct ftrace_ops *op,
117 struct pt_regs *pt_regs)
114{ 118{
115 trace_selftest_test_probe2_cnt++; 119 trace_selftest_test_probe2_cnt++;
116} 120}
117 121
118static int trace_selftest_test_probe3_cnt; 122static int trace_selftest_test_probe3_cnt;
119static void trace_selftest_test_probe3_func(unsigned long ip, 123static void trace_selftest_test_probe3_func(unsigned long ip,
120 unsigned long pip) 124 unsigned long pip,
125 struct ftrace_ops *op,
126 struct pt_regs *pt_regs)
121{ 127{
122 trace_selftest_test_probe3_cnt++; 128 trace_selftest_test_probe3_cnt++;
123} 129}
124 130
125static int trace_selftest_test_global_cnt; 131static int trace_selftest_test_global_cnt;
126static void trace_selftest_test_global_func(unsigned long ip, 132static void trace_selftest_test_global_func(unsigned long ip,
127 unsigned long pip) 133 unsigned long pip,
134 struct ftrace_ops *op,
135 struct pt_regs *pt_regs)
128{ 136{
129 trace_selftest_test_global_cnt++; 137 trace_selftest_test_global_cnt++;
130} 138}
131 139
132static int trace_selftest_test_dyn_cnt; 140static int trace_selftest_test_dyn_cnt;
133static void trace_selftest_test_dyn_func(unsigned long ip, 141static void trace_selftest_test_dyn_func(unsigned long ip,
134 unsigned long pip) 142 unsigned long pip,
143 struct ftrace_ops *op,
144 struct pt_regs *pt_regs)
135{ 145{
136 trace_selftest_test_dyn_cnt++; 146 trace_selftest_test_dyn_cnt++;
137} 147}
138 148
139static struct ftrace_ops test_probe1 = { 149static struct ftrace_ops test_probe1 = {
140 .func = trace_selftest_test_probe1_func, 150 .func = trace_selftest_test_probe1_func,
151 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
141}; 152};
142 153
143static struct ftrace_ops test_probe2 = { 154static struct ftrace_ops test_probe2 = {
144 .func = trace_selftest_test_probe2_func, 155 .func = trace_selftest_test_probe2_func,
156 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
145}; 157};
146 158
147static struct ftrace_ops test_probe3 = { 159static struct ftrace_ops test_probe3 = {
148 .func = trace_selftest_test_probe3_func, 160 .func = trace_selftest_test_probe3_func,
161 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
149}; 162};
150 163
151static struct ftrace_ops test_global = { 164static struct ftrace_ops test_global = {
152 .func = trace_selftest_test_global_func, 165 .func = trace_selftest_test_global_func,
153 .flags = FTRACE_OPS_FL_GLOBAL, 166 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
154}; 167};
155 168
156static void print_counts(void) 169static void print_counts(void)
@@ -307,7 +320,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
307 int (*func)(void)) 320 int (*func)(void))
308{ 321{
309 int save_ftrace_enabled = ftrace_enabled; 322 int save_ftrace_enabled = ftrace_enabled;
310 int save_tracer_enabled = tracer_enabled;
311 unsigned long count; 323 unsigned long count;
312 char *func_name; 324 char *func_name;
313 int ret; 325 int ret;
@@ -318,7 +330,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
318 330
319 /* enable tracing, and record the filter function */ 331 /* enable tracing, and record the filter function */
320 ftrace_enabled = 1; 332 ftrace_enabled = 1;
321 tracer_enabled = 1;
322 333
323 /* passed in by parameter to fool gcc from optimizing */ 334 /* passed in by parameter to fool gcc from optimizing */
324 func(); 335 func();
@@ -382,7 +393,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
382 393
383 out: 394 out:
384 ftrace_enabled = save_ftrace_enabled; 395 ftrace_enabled = save_ftrace_enabled;
385 tracer_enabled = save_tracer_enabled;
386 396
387 /* Enable tracing on all functions again */ 397 /* Enable tracing on all functions again */
388 ftrace_set_global_filter(NULL, 0, 1); 398 ftrace_set_global_filter(NULL, 0, 1);
@@ -393,10 +403,247 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
393 403
394 return ret; 404 return ret;
395} 405}
406
407static int trace_selftest_recursion_cnt;
408static void trace_selftest_test_recursion_func(unsigned long ip,
409 unsigned long pip,
410 struct ftrace_ops *op,
411 struct pt_regs *pt_regs)
412{
413 /*
414 * This function is registered without the recursion safe flag.
415 * The ftrace infrastructure should provide the recursion
416 * protection. If not, this will crash the kernel!
417 */
418 trace_selftest_recursion_cnt++;
419 DYN_FTRACE_TEST_NAME();
420}
421
422static void trace_selftest_test_recursion_safe_func(unsigned long ip,
423 unsigned long pip,
424 struct ftrace_ops *op,
425 struct pt_regs *pt_regs)
426{
427 /*
428 * We said we would provide our own recursion. By calling
429 * this function again, we should recurse back into this function
430 * and count again. But this only happens if the arch supports
 431	 * all ftrace features and nothing else is using the function
432 * tracing utility.
433 */
434 if (trace_selftest_recursion_cnt++)
435 return;
436 DYN_FTRACE_TEST_NAME();
437}
438
439static struct ftrace_ops test_rec_probe = {
440 .func = trace_selftest_test_recursion_func,
441};
442
443static struct ftrace_ops test_recsafe_probe = {
444 .func = trace_selftest_test_recursion_safe_func,
445 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
446};
447
448static int
449trace_selftest_function_recursion(void)
450{
451 int save_ftrace_enabled = ftrace_enabled;
452 char *func_name;
453 int len;
454 int ret;
455 int cnt;
456
457 /* The previous test PASSED */
458 pr_cont("PASSED\n");
459 pr_info("Testing ftrace recursion: ");
460
461
462 /* enable tracing, and record the filter function */
463 ftrace_enabled = 1;
464
465 /* Handle PPC64 '.' name */
466 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
467 len = strlen(func_name);
468
469 ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1);
470 if (ret) {
471 pr_cont("*Could not set filter* ");
472 goto out;
473 }
474
475 ret = register_ftrace_function(&test_rec_probe);
476 if (ret) {
477 pr_cont("*could not register callback* ");
478 goto out;
479 }
480
481 DYN_FTRACE_TEST_NAME();
482
483 unregister_ftrace_function(&test_rec_probe);
484
485 ret = -1;
486 if (trace_selftest_recursion_cnt != 1) {
487 pr_cont("*callback not called once (%d)* ",
488 trace_selftest_recursion_cnt);
489 goto out;
490 }
491
492 trace_selftest_recursion_cnt = 1;
493
494 pr_cont("PASSED\n");
495 pr_info("Testing ftrace recursion safe: ");
496
497 ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1);
498 if (ret) {
499 pr_cont("*Could not set filter* ");
500 goto out;
501 }
502
503 ret = register_ftrace_function(&test_recsafe_probe);
504 if (ret) {
505 pr_cont("*could not register callback* ");
506 goto out;
507 }
508
509 DYN_FTRACE_TEST_NAME();
510
511 unregister_ftrace_function(&test_recsafe_probe);
512
513 /*
514 * If arch supports all ftrace features, and no other task
515 * was on the list, we should be fine.
516 */
517 if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
518 cnt = 2; /* Should have recursed */
519 else
520 cnt = 1;
521
522 ret = -1;
523 if (trace_selftest_recursion_cnt != cnt) {
524 pr_cont("*callback not called expected %d times (%d)* ",
525 cnt, trace_selftest_recursion_cnt);
526 goto out;
527 }
528
529 ret = 0;
530out:
531 ftrace_enabled = save_ftrace_enabled;
532
533 return ret;
534}
396#else 535#else
397# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) 536# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
537# define trace_selftest_function_recursion() ({ 0; })
398#endif /* CONFIG_DYNAMIC_FTRACE */ 538#endif /* CONFIG_DYNAMIC_FTRACE */
399 539
540static enum {
541 TRACE_SELFTEST_REGS_START,
542 TRACE_SELFTEST_REGS_FOUND,
543 TRACE_SELFTEST_REGS_NOT_FOUND,
544} trace_selftest_regs_stat;
545
546static void trace_selftest_test_regs_func(unsigned long ip,
547 unsigned long pip,
548 struct ftrace_ops *op,
549 struct pt_regs *pt_regs)
550{
551 if (pt_regs)
552 trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND;
553 else
554 trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND;
555}
556
557static struct ftrace_ops test_regs_probe = {
558 .func = trace_selftest_test_regs_func,
559 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS,
560};
561
562static int
563trace_selftest_function_regs(void)
564{
565 int save_ftrace_enabled = ftrace_enabled;
566 char *func_name;
567 int len;
568 int ret;
569 int supported = 0;
570
571#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
572 supported = 1;
573#endif
574
575 /* The previous test PASSED */
576 pr_cont("PASSED\n");
577 pr_info("Testing ftrace regs%s: ",
578 !supported ? "(no arch support)" : "");
579
580 /* enable tracing, and record the filter function */
581 ftrace_enabled = 1;
582
583 /* Handle PPC64 '.' name */
584 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
585 len = strlen(func_name);
586
587 ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1);
588 /*
589 * If DYNAMIC_FTRACE is not set, then we just trace all functions.
590 * This test really doesn't care.
591 */
592 if (ret && ret != -ENODEV) {
593 pr_cont("*Could not set filter* ");
594 goto out;
595 }
596
597 ret = register_ftrace_function(&test_regs_probe);
598 /*
599 * Now if the arch does not support passing regs, then this should
600 * have failed.
601 */
602 if (!supported) {
603 if (!ret) {
604 pr_cont("*registered save-regs without arch support* ");
605 goto out;
606 }
607 test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED;
608 ret = register_ftrace_function(&test_regs_probe);
609 }
610 if (ret) {
611 pr_cont("*could not register callback* ");
612 goto out;
613 }
614
615
616 DYN_FTRACE_TEST_NAME();
617
618 unregister_ftrace_function(&test_regs_probe);
619
620 ret = -1;
621
622 switch (trace_selftest_regs_stat) {
623 case TRACE_SELFTEST_REGS_START:
624 pr_cont("*callback never called* ");
625 goto out;
626
627 case TRACE_SELFTEST_REGS_FOUND:
628 if (supported)
629 break;
630 pr_cont("*callback received regs without arch support* ");
631 goto out;
632
633 case TRACE_SELFTEST_REGS_NOT_FOUND:
634 if (!supported)
635 break;
636 pr_cont("*callback received NULL regs* ");
637 goto out;
638 }
639
640 ret = 0;
641out:
642 ftrace_enabled = save_ftrace_enabled;
643
644 return ret;
645}
646
400/* 647/*
401 * Simple verification test of ftrace function tracer. 648 * Simple verification test of ftrace function tracer.
402 * Enable ftrace, sleep 1/10 second, and then read the trace 649 * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -406,7 +653,6 @@ int
406trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 653trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
407{ 654{
408 int save_ftrace_enabled = ftrace_enabled; 655 int save_ftrace_enabled = ftrace_enabled;
409 int save_tracer_enabled = tracer_enabled;
410 unsigned long count; 656 unsigned long count;
411 int ret; 657 int ret;
412 658
@@ -415,7 +661,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
415 661
416 /* start the tracing */ 662 /* start the tracing */
417 ftrace_enabled = 1; 663 ftrace_enabled = 1;
418 tracer_enabled = 1;
419 664
420 ret = tracer_init(trace, tr); 665 ret = tracer_init(trace, tr);
421 if (ret) { 666 if (ret) {
@@ -442,10 +687,16 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
442 687
443 ret = trace_selftest_startup_dynamic_tracing(trace, tr, 688 ret = trace_selftest_startup_dynamic_tracing(trace, tr,
444 DYN_FTRACE_TEST_NAME); 689 DYN_FTRACE_TEST_NAME);
690 if (ret)
691 goto out;
692
693 ret = trace_selftest_function_recursion();
694 if (ret)
695 goto out;
445 696
697 ret = trace_selftest_function_regs();
446 out: 698 out:
447 ftrace_enabled = save_ftrace_enabled; 699 ftrace_enabled = save_ftrace_enabled;
448 tracer_enabled = save_tracer_enabled;
449 700
450 /* kill ftrace totally if we failed */ 701 /* kill ftrace totally if we failed */
451 if (ret) 702 if (ret)
@@ -778,6 +1029,8 @@ static int trace_wakeup_test_thread(void *data)
778 set_current_state(TASK_INTERRUPTIBLE); 1029 set_current_state(TASK_INTERRUPTIBLE);
779 schedule(); 1030 schedule();
780 1031
1032 complete(x);
1033
781 /* we are awake, now wait to disappear */ 1034 /* we are awake, now wait to disappear */
782 while (!kthread_should_stop()) { 1035 while (!kthread_should_stop()) {
783 /* 1036 /*
@@ -821,29 +1074,27 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
821 /* reset the max latency */ 1074 /* reset the max latency */
822 tracing_max_latency = 0; 1075 tracing_max_latency = 0;
823 1076
824 /* sleep to let the RT thread sleep too */ 1077 while (p->on_rq) {
825 msleep(100); 1078 /*
1079 * Sleep to make sure the RT thread is asleep too.
1080 * On virtual machines we can't rely on timings,
1081 * but we want to make sure this test still works.
1082 */
1083 msleep(100);
1084 }
826 1085
827 /* 1086 init_completion(&isrt);
828 * Yes this is slightly racy. It is possible that for some
829 * strange reason that the RT thread we created, did not
830 * call schedule for 100ms after doing the completion,
831 * and we do a wakeup on a task that already is awake.
832 * But that is extremely unlikely, and the worst thing that
833 * happens in such a case, is that we disable tracing.
834 * Honestly, if this race does happen something is horrible
835 * wrong with the system.
836 */
837 1087
838 wake_up_process(p); 1088 wake_up_process(p);
839 1089
840 /* give a little time to let the thread wake up */ 1090 /* Wait for the task to wake up */
841 msleep(100); 1091 wait_for_completion(&isrt);
842 1092
843 /* stop the tracing. */ 1093 /* stop the tracing. */
844 tracing_stop(); 1094 tracing_stop();
845 /* check both trace buffers */ 1095 /* check both trace buffers */
846 ret = trace_test_buffer(tr, NULL); 1096 ret = trace_test_buffer(tr, NULL);
1097 printk("ret = %d\n", ret);
847 if (!ret) 1098 if (!ret)
848 ret = trace_test_buffer(&max_tr, &count); 1099 ret = trace_test_buffer(&max_tr, &count);
849 1100
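
For reference, a minimal sketch (not from this patch; names are hypothetical) of an ftrace callback using the new four-argument prototype the selftests above were converted to, with FTRACE_OPS_FL_RECURSION_SAFE declaring that the callback handles its own recursion:

#include <linux/ftrace.h>

static void notrace my_trace_func(unsigned long ip, unsigned long parent_ip,
                                  struct ftrace_ops *op, struct pt_regs *regs)
{
        /* called on every traced function entry; must be re-entrant safe */
}

static struct ftrace_ops my_ops = {
        .func  = my_trace_func,
        .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};

static int __init my_tracer_init(void)
{
        return register_ftrace_function(&my_ops);
}
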
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index d4545f49242e..42ca822fc701 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -33,7 +33,6 @@ static unsigned long max_stack_size;
33static arch_spinlock_t max_stack_lock = 33static arch_spinlock_t max_stack_lock =
34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
35 35
36static int stack_trace_disabled __read_mostly;
37static DEFINE_PER_CPU(int, trace_active); 36static DEFINE_PER_CPU(int, trace_active);
38static DEFINE_MUTEX(stack_sysctl_mutex); 37static DEFINE_MUTEX(stack_sysctl_mutex);
39 38
@@ -111,13 +110,11 @@ static inline void check_stack(void)
111} 110}
112 111
113static void 112static void
114stack_trace_call(unsigned long ip, unsigned long parent_ip) 113stack_trace_call(unsigned long ip, unsigned long parent_ip,
114 struct ftrace_ops *op, struct pt_regs *pt_regs)
115{ 115{
116 int cpu; 116 int cpu;
117 117
118 if (unlikely(!ftrace_enabled || stack_trace_disabled))
119 return;
120
121 preempt_disable_notrace(); 118 preempt_disable_notrace();
122 119
123 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
@@ -136,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
136static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
137{ 134{
138 .func = stack_trace_call, 135 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
139}; 137};
140 138
141static ssize_t 139static ssize_t
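
The open-coded ftrace_enabled/stack_trace_disabled check could be dropped because the callback already guards itself with a per-CPU counter under preempt_disable_notrace(). A rough sketch of that pattern (names are illustrative, not the tracer's own):

static DEFINE_PER_CPU(int, my_active);

static void notrace my_callback(unsigned long ip, unsigned long parent_ip,
                                struct ftrace_ops *op, struct pt_regs *pt_regs)
{
        int cpu;

        preempt_disable_notrace();

        cpu = raw_smp_processor_id();
        if (per_cpu(my_active, cpu)++ != 0)
                goto out;               /* already running on this CPU */

        /* ... do the real work here ... */

 out:
        per_cpu(my_active, cpu)--;
        preempt_enable_notrace();
}
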
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 6b245f64c8dd..7609dd6714c2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -21,9 +21,6 @@ static int syscall_enter_register(struct ftrace_event_call *event,
21static int syscall_exit_register(struct ftrace_event_call *event, 21static int syscall_exit_register(struct ftrace_event_call *event,
22 enum trace_reg type, void *data); 22 enum trace_reg type, void *data);
23 23
24static int syscall_enter_define_fields(struct ftrace_event_call *call);
25static int syscall_exit_define_fields(struct ftrace_event_call *call);
26
27static struct list_head * 24static struct list_head *
28syscall_get_enter_fields(struct ftrace_event_call *call) 25syscall_get_enter_fields(struct ftrace_event_call *call)
29{ 26{
@@ -32,30 +29,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
32 return &entry->enter_fields; 29 return &entry->enter_fields;
33} 30}
34 31
35struct trace_event_functions enter_syscall_print_funcs = {
36 .trace = print_syscall_enter,
37};
38
39struct trace_event_functions exit_syscall_print_funcs = {
40 .trace = print_syscall_exit,
41};
42
43struct ftrace_event_class event_class_syscall_enter = {
44 .system = "syscalls",
45 .reg = syscall_enter_register,
46 .define_fields = syscall_enter_define_fields,
47 .get_fields = syscall_get_enter_fields,
48 .raw_init = init_syscall_trace,
49};
50
51struct ftrace_event_class event_class_syscall_exit = {
52 .system = "syscalls",
53 .reg = syscall_exit_register,
54 .define_fields = syscall_exit_define_fields,
55 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
56 .raw_init = init_syscall_trace,
57};
58
59extern struct syscall_metadata *__start_syscalls_metadata[]; 32extern struct syscall_metadata *__start_syscalls_metadata[];
60extern struct syscall_metadata *__stop_syscalls_metadata[]; 33extern struct syscall_metadata *__stop_syscalls_metadata[];
61 34
@@ -432,7 +405,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
432 mutex_unlock(&syscall_trace_lock); 405 mutex_unlock(&syscall_trace_lock);
433} 406}
434 407
435int init_syscall_trace(struct ftrace_event_call *call) 408static int init_syscall_trace(struct ftrace_event_call *call)
436{ 409{
437 int id; 410 int id;
438 int num; 411 int num;
@@ -457,6 +430,30 @@ int init_syscall_trace(struct ftrace_event_call *call)
457 return id; 430 return id;
458} 431}
459 432
433struct trace_event_functions enter_syscall_print_funcs = {
434 .trace = print_syscall_enter,
435};
436
437struct trace_event_functions exit_syscall_print_funcs = {
438 .trace = print_syscall_exit,
439};
440
441struct ftrace_event_class event_class_syscall_enter = {
442 .system = "syscalls",
443 .reg = syscall_enter_register,
444 .define_fields = syscall_enter_define_fields,
445 .get_fields = syscall_get_enter_fields,
446 .raw_init = init_syscall_trace,
447};
448
449struct ftrace_event_class event_class_syscall_exit = {
450 .system = "syscalls",
451 .reg = syscall_exit_register,
452 .define_fields = syscall_exit_define_fields,
453 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
454 .raw_init = init_syscall_trace,
455};
456
460unsigned long __init __weak arch_syscall_addr(int nr) 457unsigned long __init __weak arch_syscall_addr(int nr)
461{ 458{
462 return (unsigned long)sys_call_table[nr]; 459 return (unsigned long)sys_call_table[nr];
@@ -487,7 +484,7 @@ int __init init_ftrace_syscalls(void)
487 484
488 return 0; 485 return 0;
489} 486}
490core_initcall(init_ftrace_syscalls); 487early_initcall(init_ftrace_syscalls);
491 488
492#ifdef CONFIG_PERF_EVENTS 489#ifdef CONFIG_PERF_EVENTS
493 490
@@ -537,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
537 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 534 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
538} 535}
539 536
540int perf_sysenter_enable(struct ftrace_event_call *call) 537static int perf_sysenter_enable(struct ftrace_event_call *call)
541{ 538{
542 int ret = 0; 539 int ret = 0;
543 int num; 540 int num;
@@ -558,7 +555,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
558 return ret; 555 return ret;
559} 556}
560 557
561void perf_sysenter_disable(struct ftrace_event_call *call) 558static void perf_sysenter_disable(struct ftrace_event_call *call)
562{ 559{
563 int num; 560 int num;
564 561
@@ -615,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
615 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 612 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
616} 613}
617 614
618int perf_sysexit_enable(struct ftrace_event_call *call) 615static int perf_sysexit_enable(struct ftrace_event_call *call)
619{ 616{
620 int ret = 0; 617 int ret = 0;
621 int num; 618 int num;
@@ -636,7 +633,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
636 return ret; 633 return ret;
637} 634}
638 635
639void perf_sysexit_disable(struct ftrace_event_call *call) 636static void perf_sysexit_disable(struct ftrace_event_call *call)
640{ 637{
641 int num; 638 int num;
642 639
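
The event-class structures above were moved below init_syscall_trace() so that function (and the perf enable/disable helpers) can become static without forward declarations. A small sketch of the ordering idea (names are made up):

static int my_raw_init(struct ftrace_event_call *call)
{
        return 0;
}

/* defined after the function, so no forward declaration is needed */
struct ftrace_event_class my_event_class = {
        .system   = "example",
        .raw_init = my_raw_init,
};
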
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 03003cd7dd96..c86e6d4f67fb 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -22,6 +22,7 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/uprobes.h> 23#include <linux/uprobes.h>
24#include <linux/namei.h> 24#include <linux/namei.h>
25#include <linux/string.h>
25 26
26#include "trace_probe.h" 27#include "trace_probe.h"
27 28
@@ -189,7 +190,7 @@ static int create_trace_uprobe(int argc, char **argv)
189 if (argv[0][0] == '-') 190 if (argv[0][0] == '-')
190 is_delete = true; 191 is_delete = true;
191 else if (argv[0][0] != 'p') { 192 else if (argv[0][0] != 'p') {
192 pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); 193 pr_info("Probe definition must be started with 'p' or '-'.\n");
193 return -EINVAL; 194 return -EINVAL;
194 } 195 }
195 196
@@ -252,7 +253,7 @@ static int create_trace_uprobe(int argc, char **argv)
252 if (ret) 253 if (ret)
253 goto fail_address_parse; 254 goto fail_address_parse;
254 255
255 ret = strict_strtoul(arg, 0, &offset); 256 ret = kstrtoul(arg, 0, &offset);
256 if (ret) 257 if (ret)
257 goto fail_address_parse; 258 goto fail_address_parse;
258 259
@@ -263,16 +264,15 @@ static int create_trace_uprobe(int argc, char **argv)
263 264
264 /* setup a probe */ 265 /* setup a probe */
265 if (!event) { 266 if (!event) {
266 char *tail = strrchr(filename, '/'); 267 char *tail;
267 char *ptr; 268 char *ptr;
268 269
269 ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); 270 tail = kstrdup(kbasename(filename), GFP_KERNEL);
270 if (!ptr) { 271 if (!tail) {
271 ret = -ENOMEM; 272 ret = -ENOMEM;
272 goto fail_address_parse; 273 goto fail_address_parse;
273 } 274 }
274 275
275 tail = ptr;
276 ptr = strpbrk(tail, ".-_"); 276 ptr = strpbrk(tail, ".-_");
277 if (ptr) 277 if (ptr)
278 *ptr = '\0'; 278 *ptr = '\0';
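
Two helper swaps in this file: the deprecated strict_strtoul() becomes kstrtoul(), and the strrchr()-based tail extraction becomes kbasename(), which returns the final path component without modifying the string. A small usage sketch (the function and message are illustrative):

#include <linux/kernel.h>
#include <linux/string.h>

static int example_parse(const char *arg, const char *filename)
{
        unsigned long offset;
        int ret;

        ret = kstrtoul(arg, 0, &offset);        /* base 0 = auto-detect */
        if (ret)
                return ret;

        pr_info("probe at %s+0x%lx\n", kbasename(filename), offset);
        return 0;
}
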
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 23b4d784ebdd..625df0b44690 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -26,7 +26,9 @@
26/* 26/*
27 * fill in basic accounting fields 27 * fill in basic accounting fields
28 */ 28 */
29void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) 29void bacct_add_tsk(struct user_namespace *user_ns,
30 struct pid_namespace *pid_ns,
31 struct taskstats *stats, struct task_struct *tsk)
30{ 32{
31 const struct cred *tcred; 33 const struct cred *tcred;
32 struct timespec uptime, ts; 34 struct timespec uptime, ts;
@@ -55,13 +57,13 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
55 stats->ac_flag |= AXSIG; 57 stats->ac_flag |= AXSIG;
56 stats->ac_nice = task_nice(tsk); 58 stats->ac_nice = task_nice(tsk);
57 stats->ac_sched = tsk->policy; 59 stats->ac_sched = tsk->policy;
58 stats->ac_pid = tsk->pid; 60 stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
59 rcu_read_lock(); 61 rcu_read_lock();
60 tcred = __task_cred(tsk); 62 tcred = __task_cred(tsk);
61 stats->ac_uid = tcred->uid; 63 stats->ac_uid = from_kuid_munged(user_ns, tcred->uid);
62 stats->ac_gid = tcred->gid; 64 stats->ac_gid = from_kgid_munged(user_ns, tcred->gid);
63 stats->ac_ppid = pid_alive(tsk) ? 65 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 66 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
65 rcu_read_unlock(); 67 rcu_read_unlock();
66 stats->ac_utime = cputime_to_usecs(tsk->utime); 68 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_usecs(tsk->stime); 69 stats->ac_stime = cputime_to_usecs(tsk->stime);
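
bacct_add_tsk() now reports ids relative to the caller's namespaces rather than raw global values. The translation pattern, reduced to a sketch (my_fill_ids() is hypothetical):

static void my_fill_ids(struct user_namespace *user_ns,
                        struct pid_namespace *pid_ns,
                        struct taskstats *stats, struct task_struct *tsk)
{
        rcu_read_lock();
        stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
        stats->ac_uid = from_kuid_munged(user_ns, __task_cred(tsk)->uid);
        stats->ac_gid = from_kgid_munged(user_ns, __task_cred(tsk)->gid);
        rcu_read_unlock();
}
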
diff --git a/kernel/user.c b/kernel/user.c
index b815fefbe76f..33acb5e53a5f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/export.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include <linux/proc_fs.h>
19 20
20/* 21/*
21 * userns count is 1 for root user, 1 for init_uts_ns, 22 * userns count is 1 for root user, 1 for init_uts_ns,
@@ -38,11 +39,20 @@ struct user_namespace init_user_ns = {
38 .count = 4294967295U, 39 .count = 4294967295U,
39 }, 40 },
40 }, 41 },
42 .projid_map = {
43 .nr_extents = 1,
44 .extent[0] = {
45 .first = 0,
46 .lower_first = 0,
47 .count = 4294967295U,
48 },
49 },
41 .kref = { 50 .kref = {
42 .refcount = ATOMIC_INIT(3), 51 .refcount = ATOMIC_INIT(3),
43 }, 52 },
44 .owner = GLOBAL_ROOT_UID, 53 .owner = GLOBAL_ROOT_UID,
45 .group = GLOBAL_ROOT_GID, 54 .group = GLOBAL_ROOT_GID,
55 .proc_inum = PROC_USER_INIT_INO,
46}; 56};
47EXPORT_SYMBOL_GPL(init_user_ns); 57EXPORT_SYMBOL_GPL(init_user_ns);
48 58
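
The new projid_map extent above is an identity mapping: an id x in [first, first + count) maps to lower_first + (x - first), and with first == lower_first == 0 and count == 4294967295 every id maps to itself. A conceptual sketch of applying one extent (the real map_id_down() walks all extents):

static u32 apply_extent(const struct uid_gid_extent *e, u32 id)
{
        if (id >= e->first && id - e->first < e->count)
                return e->lower_first + (id - e->first);
        return (u32) -1;        /* no mapping in this extent */
}
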
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 86602316422d..2b042c42fbc4 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/proc_fs.h>
12#include <linux/highuid.h> 13#include <linux/highuid.h>
13#include <linux/cred.h> 14#include <linux/cred.h>
14#include <linux/securebits.h> 15#include <linux/securebits.h>
@@ -19,12 +20,31 @@
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
21#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/projid.h>
22 24
23static struct kmem_cache *user_ns_cachep __read_mostly; 25static struct kmem_cache *user_ns_cachep __read_mostly;
24 26
25static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 27static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
26 struct uid_gid_map *map); 28 struct uid_gid_map *map);
27 29
30static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
31{
32 /* Start with the same capabilities as init but useless for doing
33 * anything as the capabilities are bound to the new user namespace.
34 */
35 cred->securebits = SECUREBITS_DEFAULT;
36 cred->cap_inheritable = CAP_EMPTY_SET;
37 cred->cap_permitted = CAP_FULL_SET;
38 cred->cap_effective = CAP_FULL_SET;
39 cred->cap_bset = CAP_FULL_SET;
40#ifdef CONFIG_KEYS
41 key_put(cred->request_key_auth);
42 cred->request_key_auth = NULL;
43#endif
44 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
45 cred->user_ns = user_ns;
46}
47
28/* 48/*
29 * Create a new user namespace, deriving the creator from the user in the 49 * Create a new user namespace, deriving the creator from the user in the
30 * passed credentials, and replacing that user with the new root user for the 50 * passed credentials, and replacing that user with the new root user for the
@@ -38,6 +58,7 @@ int create_user_ns(struct cred *new)
38 struct user_namespace *ns, *parent_ns = new->user_ns; 58 struct user_namespace *ns, *parent_ns = new->user_ns;
39 kuid_t owner = new->euid; 59 kuid_t owner = new->euid;
40 kgid_t group = new->egid; 60 kgid_t group = new->egid;
61 int ret;
41 62
42 /* The creator needs a mapping in the parent user namespace 63 /* The creator needs a mapping in the parent user namespace
43 * or else we won't be able to reasonably tell userspace who 64 * or else we won't be able to reasonably tell userspace who
@@ -51,38 +72,45 @@ int create_user_ns(struct cred *new)
51 if (!ns) 72 if (!ns)
52 return -ENOMEM; 73 return -ENOMEM;
53 74
75 ret = proc_alloc_inum(&ns->proc_inum);
76 if (ret) {
77 kmem_cache_free(user_ns_cachep, ns);
78 return ret;
79 }
80
54 kref_init(&ns->kref); 81 kref_init(&ns->kref);
82 /* Leave the new->user_ns reference with the new user namespace. */
55 ns->parent = parent_ns; 83 ns->parent = parent_ns;
56 ns->owner = owner; 84 ns->owner = owner;
57 ns->group = group; 85 ns->group = group;
58 86
59 /* Start with the same capabilities as init but useless for doing 87 set_cred_user_ns(new, ns);
60 * anything as the capabilities are bound to the new user namespace.
61 */
62 new->securebits = SECUREBITS_DEFAULT;
63 new->cap_inheritable = CAP_EMPTY_SET;
64 new->cap_permitted = CAP_FULL_SET;
65 new->cap_effective = CAP_FULL_SET;
66 new->cap_bset = CAP_FULL_SET;
67#ifdef CONFIG_KEYS
68 key_put(new->request_key_auth);
69 new->request_key_auth = NULL;
70#endif
71 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
72
73 /* Leave the new->user_ns reference with the new user namespace. */
74 /* Leave the reference to our user_ns with the new cred. */
75 new->user_ns = ns;
76 88
77 return 0; 89 return 0;
78} 90}
79 91
92int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
93{
94 struct cred *cred;
95
96 if (!(unshare_flags & CLONE_NEWUSER))
97 return 0;
98
99 cred = prepare_creds();
100 if (!cred)
101 return -ENOMEM;
102
103 *new_cred = cred;
104 return create_user_ns(cred);
105}
106
80void free_user_ns(struct kref *kref) 107void free_user_ns(struct kref *kref)
81{ 108{
82 struct user_namespace *parent, *ns = 109 struct user_namespace *parent, *ns =
83 container_of(kref, struct user_namespace, kref); 110 container_of(kref, struct user_namespace, kref);
84 111
85 parent = ns->parent; 112 parent = ns->parent;
113 proc_free_inum(ns->proc_inum);
86 kmem_cache_free(user_ns_cachep, ns); 114 kmem_cache_free(user_ns_cachep, ns);
87 put_user_ns(parent); 115 put_user_ns(parent);
88} 116}
@@ -295,6 +323,75 @@ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
295} 323}
296EXPORT_SYMBOL(from_kgid_munged); 324EXPORT_SYMBOL(from_kgid_munged);
297 325
326/**
327 * make_kprojid - Map a user-namespace projid pair into a kprojid.
328 * @ns: User namespace that the projid is in
329 * @projid: Project identifier
330 *
 331 * Maps a user-namespace projid pair into a kernel internal kprojid,
 332 * and returns that kprojid.
333 *
334 * When there is no mapping defined for the user-namespace projid
335 * pair INVALID_PROJID is returned. Callers are expected to test
 336 * for and handle INVALID_PROJID being returned. INVALID_PROJID
337 * may be tested for using projid_valid().
338 */
339kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
340{
341 /* Map the uid to a global kernel uid */
342 return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
343}
344EXPORT_SYMBOL(make_kprojid);
345
346/**
347 * from_kprojid - Create a projid from a kprojid user-namespace pair.
348 * @targ: The user namespace we want a projid in.
349 * @kprojid: The kernel internal project identifier to start with.
350 *
351 * Map @kprojid into the user-namespace specified by @targ and
352 * return the resulting projid.
353 *
354 * There is always a mapping into the initial user_namespace.
355 *
356 * If @kprojid has no mapping in @targ (projid_t)-1 is returned.
357 */
358projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
359{
360 /* Map the uid from a global kernel uid */
361 return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
362}
363EXPORT_SYMBOL(from_kprojid);
364
365/**
 366 * from_kprojid_munged - Create a projid from a kprojid user-namespace pair.
367 * @targ: The user namespace we want a projid in.
368 * @kprojid: The kernel internal projid to start with.
369 *
370 * Map @kprojid into the user-namespace specified by @targ and
371 * return the resulting projid.
372 *
373 * There is always a mapping into the initial user_namespace.
374 *
 375 * Unlike from_kprojid, from_kprojid_munged never fails and always
 376 * returns a valid projid. This makes from_kprojid_munged
 377 * appropriate for use in syscalls like stat, where
 378 * failing the system call and failing to provide a valid projid are
 379 * not options.
380 *
381 * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
382 */
383projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
384{
385 projid_t projid;
386 projid = from_kprojid(targ, kprojid);
387
388 if (projid == (projid_t) -1)
389 projid = OVERFLOW_PROJID;
390 return projid;
391}
392EXPORT_SYMBOL(from_kprojid_munged);
393
394
298static int uid_m_show(struct seq_file *seq, void *v) 395static int uid_m_show(struct seq_file *seq, void *v)
299{ 396{
300 struct user_namespace *ns = seq->private; 397 struct user_namespace *ns = seq->private;
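
A sketch of how a quota-style caller might use the new project-id helpers (both functions here are hypothetical): convert to kprojid_t at the user/kernel boundary, and convert back with the _munged variant when reporting so the call cannot fail:

#include <linux/projid.h>
#include <linux/user_namespace.h>

static int my_accept_projid(struct user_namespace *user_ns,
                            projid_t projid, kprojid_t *out)
{
        kprojid_t kprojid = make_kprojid(user_ns, projid);

        if (!projid_valid(kprojid))
                return -EINVAL;         /* no mapping in this namespace */
        *out = kprojid;
        return 0;
}

static projid_t my_report_projid(struct user_namespace *user_ns,
                                 kprojid_t kprojid)
{
        /* never fails; unmapped ids come back as OVERFLOW_PROJID */
        return from_kprojid_munged(user_ns, kprojid);
}
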
@@ -302,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v)
302 struct user_namespace *lower_ns; 399 struct user_namespace *lower_ns;
303 uid_t lower; 400 uid_t lower;
304 401
305 lower_ns = current_user_ns(); 402 lower_ns = seq_user_ns(seq);
306 if ((lower_ns == ns) && lower_ns->parent) 403 if ((lower_ns == ns) && lower_ns->parent)
307 lower_ns = lower_ns->parent; 404 lower_ns = lower_ns->parent;
308 405
@@ -323,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v)
323 struct user_namespace *lower_ns; 420 struct user_namespace *lower_ns;
324 gid_t lower; 421 gid_t lower;
325 422
326 lower_ns = current_user_ns(); 423 lower_ns = seq_user_ns(seq);
327 if ((lower_ns == ns) && lower_ns->parent) 424 if ((lower_ns == ns) && lower_ns->parent)
328 lower_ns = lower_ns->parent; 425 lower_ns = lower_ns->parent;
329 426
@@ -337,6 +434,27 @@ static int gid_m_show(struct seq_file *seq, void *v)
337 return 0; 434 return 0;
338} 435}
339 436
437static int projid_m_show(struct seq_file *seq, void *v)
438{
439 struct user_namespace *ns = seq->private;
440 struct uid_gid_extent *extent = v;
441 struct user_namespace *lower_ns;
442 projid_t lower;
443
444 lower_ns = seq_user_ns(seq);
445 if ((lower_ns == ns) && lower_ns->parent)
446 lower_ns = lower_ns->parent;
447
448 lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));
449
450 seq_printf(seq, "%10u %10u %10u\n",
451 extent->first,
452 lower,
453 extent->count);
454
455 return 0;
456}
457
340static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) 458static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map)
341{ 459{
342 struct uid_gid_extent *extent = NULL; 460 struct uid_gid_extent *extent = NULL;
@@ -362,6 +480,13 @@ static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
362 return m_start(seq, ppos, &ns->gid_map); 480 return m_start(seq, ppos, &ns->gid_map);
363} 481}
364 482
483static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
484{
485 struct user_namespace *ns = seq->private;
486
487 return m_start(seq, ppos, &ns->projid_map);
488}
489
365static void *m_next(struct seq_file *seq, void *v, loff_t *pos) 490static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
366{ 491{
367 (*pos)++; 492 (*pos)++;
@@ -387,6 +512,13 @@ struct seq_operations proc_gid_seq_operations = {
387 .show = gid_m_show, 512 .show = gid_m_show,
388}; 513};
389 514
515struct seq_operations proc_projid_seq_operations = {
516 .start = projid_m_start,
517 .stop = m_stop,
518 .next = m_next,
519 .show = projid_m_show,
520};
521
390static DEFINE_MUTEX(id_map_mutex); 522static DEFINE_MUTEX(id_map_mutex);
391 523
392static ssize_t map_write(struct file *file, const char __user *buf, 524static ssize_t map_write(struct file *file, const char __user *buf,
@@ -434,7 +566,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
434 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID 566 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID
435 * over the user namespace in order to set the id mapping. 567 * over the user namespace in order to set the id mapping.
436 */ 568 */
437 if (!ns_capable(ns, cap_setid)) 569 if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid))
438 goto out; 570 goto out;
439 571
440 /* Get a buffer */ 572 /* Get a buffer */
@@ -564,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
564{ 696{
565 struct seq_file *seq = file->private_data; 697 struct seq_file *seq = file->private_data;
566 struct user_namespace *ns = seq->private; 698 struct user_namespace *ns = seq->private;
699 struct user_namespace *seq_ns = seq_user_ns(seq);
567 700
568 if (!ns->parent) 701 if (!ns->parent)
569 return -EPERM; 702 return -EPERM;
570 703
704 if ((seq_ns != ns) && (seq_ns != ns->parent))
705 return -EPERM;
706
571 return map_write(file, buf, size, ppos, CAP_SETUID, 707 return map_write(file, buf, size, ppos, CAP_SETUID,
572 &ns->uid_map, &ns->parent->uid_map); 708 &ns->uid_map, &ns->parent->uid_map);
573} 709}
@@ -576,17 +712,57 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
576{ 712{
577 struct seq_file *seq = file->private_data; 713 struct seq_file *seq = file->private_data;
578 struct user_namespace *ns = seq->private; 714 struct user_namespace *ns = seq->private;
715 struct user_namespace *seq_ns = seq_user_ns(seq);
579 716
580 if (!ns->parent) 717 if (!ns->parent)
581 return -EPERM; 718 return -EPERM;
582 719
720 if ((seq_ns != ns) && (seq_ns != ns->parent))
721 return -EPERM;
722
583 return map_write(file, buf, size, ppos, CAP_SETGID, 723 return map_write(file, buf, size, ppos, CAP_SETGID,
584 &ns->gid_map, &ns->parent->gid_map); 724 &ns->gid_map, &ns->parent->gid_map);
585} 725}
586 726
727ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
728{
729 struct seq_file *seq = file->private_data;
730 struct user_namespace *ns = seq->private;
731 struct user_namespace *seq_ns = seq_user_ns(seq);
732
733 if (!ns->parent)
734 return -EPERM;
735
736 if ((seq_ns != ns) && (seq_ns != ns->parent))
737 return -EPERM;
738
739 /* Anyone can set any valid project id no capability needed */
740 return map_write(file, buf, size, ppos, -1,
741 &ns->projid_map, &ns->parent->projid_map);
742}
743
587static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 744static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
588 struct uid_gid_map *new_map) 745 struct uid_gid_map *new_map)
589{ 746{
747 /* Allow mapping to your own filesystem ids */
748 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
749 u32 id = new_map->extent[0].lower_first;
750 if (cap_setid == CAP_SETUID) {
751 kuid_t uid = make_kuid(ns->parent, id);
752 if (uid_eq(uid, current_fsuid()))
753 return true;
754 }
755 else if (cap_setid == CAP_SETGID) {
756 kgid_t gid = make_kgid(ns->parent, id);
757 if (gid_eq(gid, current_fsgid()))
758 return true;
759 }
760 }
761
762 /* Allow anyone to set a mapping that doesn't require privilege */
763 if (!cap_valid(cap_setid))
764 return true;
765
590 /* Allow the specified ids if we have the appropriate capability 766 /* Allow the specified ids if we have the appropriate capability
591 * (CAP_SETUID or CAP_SETGID) over the parent user namespace. 767 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
592 */ 768 */
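
The first branch added to new_idmap_permitted() lets an unprivileged process install a one-line mapping onto its own fsuid or fsgid. From userspace that looks roughly like the sketch below (values and error handling are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static int map_self(pid_t child, unsigned int outside_uid)
{
        char path[64], line[64];
        int fd, len;

        snprintf(path, sizeof(path), "/proc/%d/uid_map", (int)child);
        /* "inside outside count": map in-namespace uid 0 to our own uid */
        len = snprintf(line, sizeof(line), "0 %u 1\n", outside_uid);

        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        if (write(fd, line, len) != len) {
                close(fd);
                return -1;
        }
        return close(fd);
}
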
@@ -596,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
596 return false; 772 return false;
597} 773}
598 774
775static void *userns_get(struct task_struct *task)
776{
777 struct user_namespace *user_ns;
778
779 rcu_read_lock();
780 user_ns = get_user_ns(__task_cred(task)->user_ns);
781 rcu_read_unlock();
782
783 return user_ns;
784}
785
786static void userns_put(void *ns)
787{
788 put_user_ns(ns);
789}
790
791static int userns_install(struct nsproxy *nsproxy, void *ns)
792{
793 struct user_namespace *user_ns = ns;
794 struct cred *cred;
795
796 /* Don't allow gaining capabilities by reentering
797 * the same user namespace.
798 */
799 if (user_ns == current_user_ns())
800 return -EINVAL;
801
802 /* Threaded processes may not enter a different user namespace */
803 if (atomic_read(&current->mm->mm_users) > 1)
804 return -EINVAL;
805
806 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
807 return -EPERM;
808
809 cred = prepare_creds();
810 if (!cred)
811 return -ENOMEM;
812
813 put_user_ns(cred->user_ns);
814 set_cred_user_ns(cred, get_user_ns(user_ns));
815
816 return commit_creds(cred);
817}
818
819static unsigned int userns_inum(void *ns)
820{
821 struct user_namespace *user_ns = ns;
822 return user_ns->proc_inum;
823}
824
825const struct proc_ns_operations userns_operations = {
826 .name = "user",
827 .type = CLONE_NEWUSER,
828 .get = userns_get,
829 .put = userns_put,
830 .install = userns_install,
831 .inum = userns_inum,
832};
833
599static __init int user_namespaces_init(void) 834static __init int user_namespaces_init(void)
600{ 835{
601 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); 836 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
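
userns_install() above is what setns(2) reaches when handed a /proc/<pid>/ns/user file descriptor. A minimal userspace sketch (error handling kept short):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd;

        if (argc < 3)
                return 1;

        fd = open(argv[1], O_RDONLY);   /* e.g. /proc/<pid>/ns/user */
        if (fd < 0 || setns(fd, CLONE_NEWUSER) < 0) {
                perror("setns");
                return 1;
        }
        execvp(argv[2], argv + 2);
        perror("execvp");
        return 1;
}
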
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 679d97a5d3fd..08b197e8c485 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -32,18 +32,25 @@ static struct uts_namespace *create_uts_ns(void)
32 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
33 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return NULL on error (failure to kmalloc), new ns otherwise
34 */ 34 */
35static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, 35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
36 struct uts_namespace *old_ns) 36 struct uts_namespace *old_ns)
37{ 37{
38 struct uts_namespace *ns; 38 struct uts_namespace *ns;
39 int err;
39 40
40 ns = create_uts_ns(); 41 ns = create_uts_ns();
41 if (!ns) 42 if (!ns)
42 return ERR_PTR(-ENOMEM); 43 return ERR_PTR(-ENOMEM);
43 44
45 err = proc_alloc_inum(&ns->proc_inum);
46 if (err) {
47 kfree(ns);
48 return ERR_PTR(err);
49 }
50
44 down_read(&uts_sem); 51 down_read(&uts_sem);
45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); 53 ns->user_ns = get_user_ns(user_ns);
47 up_read(&uts_sem); 54 up_read(&uts_sem);
48 return ns; 55 return ns;
49} 56}
@@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
55 * versa. 62 * versa.
56 */ 63 */
57struct uts_namespace *copy_utsname(unsigned long flags, 64struct uts_namespace *copy_utsname(unsigned long flags,
58 struct task_struct *tsk) 65 struct user_namespace *user_ns, struct uts_namespace *old_ns)
59{ 66{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
61 struct uts_namespace *new_ns; 67 struct uts_namespace *new_ns;
62 68
63 BUG_ON(!old_ns); 69 BUG_ON(!old_ns);
@@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
66 if (!(flags & CLONE_NEWUTS)) 72 if (!(flags & CLONE_NEWUTS))
67 return old_ns; 73 return old_ns;
68 74
69 new_ns = clone_uts_ns(tsk, old_ns); 75 new_ns = clone_uts_ns(user_ns, old_ns);
70 76
71 put_uts_ns(old_ns); 77 put_uts_ns(old_ns);
72 return new_ns; 78 return new_ns;
@@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref)
78 84
79 ns = container_of(kref, struct uts_namespace, kref); 85 ns = container_of(kref, struct uts_namespace, kref);
80 put_user_ns(ns->user_ns); 86 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum);
81 kfree(ns); 88 kfree(ns);
82} 89}
83 90
@@ -102,19 +109,32 @@ static void utsns_put(void *ns)
102 put_uts_ns(ns); 109 put_uts_ns(ns);
103} 110}
104 111
105static int utsns_install(struct nsproxy *nsproxy, void *ns) 112static int utsns_install(struct nsproxy *nsproxy, void *new)
106{ 113{
114 struct uts_namespace *ns = new;
115
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !nsown_capable(CAP_SYS_ADMIN))
118 return -EPERM;
119
107 get_uts_ns(ns); 120 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns); 121 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns; 122 nsproxy->uts_ns = ns;
110 return 0; 123 return 0;
111} 124}
112 125
126static unsigned int utsns_inum(void *vp)
127{
128 struct uts_namespace *ns = vp;
129
130 return ns->proc_inum;
131}
132
113const struct proc_ns_operations utsns_operations = { 133const struct proc_ns_operations utsns_operations = {
114 .name = "uts", 134 .name = "uts",
115 .type = CLONE_NEWUTS, 135 .type = CLONE_NEWUTS,
116 .get = utsns_get, 136 .get = utsns_get,
117 .put = utsns_put, 137 .put = utsns_put,
118 .install = utsns_install, 138 .install = utsns_install,
139 .inum = utsns_inum,
119}; 140};
120
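
The proc_inum allocated for each namespace is what shows up as the inode number behind the /proc/<pid>/ns/* symlinks, which lets userspace compare two tasks' namespaces. A quick sketch:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[64];
        ssize_t n = readlink("/proc/self/ns/uts", buf, sizeof(buf) - 1);

        if (n < 0)
                return 1;
        buf[n] = '\0';
        printf("%s\n", buf);    /* e.g. "uts:[4026531838]" */
        return 0;
}
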
diff --git a/kernel/wait.c b/kernel/wait.c
index 7fdd9eaca2c3..6698e0c04ead 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Generic waiting primitives. 2 * Generic waiting primitives.
3 * 3 *
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 Nadia Yvette Chambers, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/export.h> 7#include <linux/export.h>
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4b1dfba70f7c..75a2ab3d0b02 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -22,6 +22,7 @@
22#include <linux/notifier.h> 22#include <linux/notifier.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/sysctl.h> 24#include <linux/sysctl.h>
25#include <linux/smpboot.h>
25 26
26#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
27#include <linux/kvm_para.h> 28#include <linux/kvm_para.h>
@@ -29,16 +30,19 @@
29 30
30int watchdog_enabled = 1; 31int watchdog_enabled = 1;
31int __read_mostly watchdog_thresh = 10; 32int __read_mostly watchdog_thresh = 10;
33static int __read_mostly watchdog_disabled;
34static u64 __read_mostly sample_period;
32 35
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 36static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 37static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); 38static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
36static DEFINE_PER_CPU(bool, softlockup_touch_sync); 39static DEFINE_PER_CPU(bool, softlockup_touch_sync);
37static DEFINE_PER_CPU(bool, soft_watchdog_warn); 40static DEFINE_PER_CPU(bool, soft_watchdog_warn);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
38#ifdef CONFIG_HARDLOCKUP_DETECTOR 43#ifdef CONFIG_HARDLOCKUP_DETECTOR
39static DEFINE_PER_CPU(bool, hard_watchdog_warn); 44static DEFINE_PER_CPU(bool, hard_watchdog_warn);
40static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 45static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 46static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 47static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 48#endif
@@ -113,7 +117,7 @@ static unsigned long get_timestamp(int this_cpu)
113 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 117 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
114} 118}
115 119
116static unsigned long get_sample_period(void) 120static void set_sample_period(void)
117{ 121{
118 /* 122 /*
119 * convert watchdog_thresh from seconds to ns 123 * convert watchdog_thresh from seconds to ns
@@ -122,7 +126,7 @@ static unsigned long get_sample_period(void)
122 * and hard thresholds) to increment before the 126 * and hard thresholds) to increment before the
123 * hardlockup detector generates a warning 127 * hardlockup detector generates a warning
124 */ 128 */
125 return get_softlockup_thresh() * (NSEC_PER_SEC / 5); 129 sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
126} 130}
127 131
128/* Commands for resetting the watchdog */ 132/* Commands for resetting the watchdog */
@@ -248,13 +252,15 @@ static void watchdog_overflow_callback(struct perf_event *event,
248 __this_cpu_write(hard_watchdog_warn, false); 252 __this_cpu_write(hard_watchdog_warn, false);
249 return; 253 return;
250} 254}
255#endif /* CONFIG_HARDLOCKUP_DETECTOR */
256
251static void watchdog_interrupt_count(void) 257static void watchdog_interrupt_count(void)
252{ 258{
253 __this_cpu_inc(hrtimer_interrupts); 259 __this_cpu_inc(hrtimer_interrupts);
254} 260}
255#else 261
256static inline void watchdog_interrupt_count(void) { return; } 262static int watchdog_nmi_enable(unsigned int cpu);
257#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 263static void watchdog_nmi_disable(unsigned int cpu);
258 264
259/* watchdog kicker functions */ 265/* watchdog kicker functions */
260static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 266static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
@@ -270,7 +276,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
270 wake_up_process(__this_cpu_read(softlockup_watchdog)); 276 wake_up_process(__this_cpu_read(softlockup_watchdog));
271 277
272 /* .. and repeat */ 278 /* .. and repeat */
273 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 279 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
274 280
275 if (touch_ts == 0) { 281 if (touch_ts == 0) {
276 if (unlikely(__this_cpu_read(softlockup_touch_sync))) { 282 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -327,49 +333,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
327 return HRTIMER_RESTART; 333 return HRTIMER_RESTART;
328} 334}
329 335
336static void watchdog_set_prio(unsigned int policy, unsigned int prio)
337{
338 struct sched_param param = { .sched_priority = prio };
330 339
331/* 340 sched_setscheduler(current, policy, &param);
332 * The watchdog thread - touches the timestamp. 341}
333 */ 342
334static int watchdog(void *unused) 343static void watchdog_enable(unsigned int cpu)
335{ 344{
336 struct sched_param param = { .sched_priority = 0 };
337 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 345 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
338 346
339 /* initialize timestamp */
340 __touch_watchdog();
341
342 /* kick off the timer for the hardlockup detector */ 347 /* kick off the timer for the hardlockup detector */
348 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
349 hrtimer->function = watchdog_timer_fn;
350
351 if (!watchdog_enabled) {
352 kthread_park(current);
353 return;
354 }
355
356 /* Enable the perf event */
357 watchdog_nmi_enable(cpu);
358
343 /* done here because hrtimer_start can only pin to smp_processor_id() */ 359 /* done here because hrtimer_start can only pin to smp_processor_id() */
344 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), 360 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
345 HRTIMER_MODE_REL_PINNED); 361 HRTIMER_MODE_REL_PINNED);
346 362
347 set_current_state(TASK_INTERRUPTIBLE); 363 /* initialize timestamp */
348 /* 364 watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
349 * Run briefly (kicked by the hrtimer callback function) once every 365 __touch_watchdog();
350 * get_sample_period() seconds (4 seconds by default) to reset the 366}
351 * softlockup timestamp. If this gets delayed for more than
352 * 2*watchdog_thresh seconds then the debug-printout triggers in
353 * watchdog_timer_fn().
354 */
355 while (!kthread_should_stop()) {
356 __touch_watchdog();
357 schedule();
358 367
359 if (kthread_should_stop()) 368static void watchdog_disable(unsigned int cpu)
360 break; 369{
370 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
361 371
362 set_current_state(TASK_INTERRUPTIBLE); 372 watchdog_set_prio(SCHED_NORMAL, 0);
363 } 373 hrtimer_cancel(hrtimer);
364 /* 374 /* disable the perf event */
365 * Drop the policy/priority elevation during thread exit to avoid a 375 watchdog_nmi_disable(cpu);
366 * scheduling latency spike. 376}
367 */ 377
368 __set_current_state(TASK_RUNNING); 378static int watchdog_should_run(unsigned int cpu)
369 sched_setscheduler(current, SCHED_NORMAL, &param); 379{
370 return 0; 380 return __this_cpu_read(hrtimer_interrupts) !=
381 __this_cpu_read(soft_lockup_hrtimer_cnt);
371} 382}
372 383
384/*
385 * The watchdog thread function - touches the timestamp.
386 *
387 * It only runs once every sample_period seconds (4 seconds by
388 * default) to reset the softlockup timestamp. If this gets delayed
389 * for more than 2*watchdog_thresh seconds then the debug-printout
390 * triggers in watchdog_timer_fn().
391 */
392static void watchdog(unsigned int cpu)
393{
394 __this_cpu_write(soft_lockup_hrtimer_cnt,
395 __this_cpu_read(hrtimer_interrupts));
396 __touch_watchdog();
397}
373 398
374#ifdef CONFIG_HARDLOCKUP_DETECTOR 399#ifdef CONFIG_HARDLOCKUP_DETECTOR
375/* 400/*
@@ -379,7 +404,7 @@ static int watchdog(void *unused)
379 */ 404 */
380static unsigned long cpu0_err; 405static unsigned long cpu0_err;
381 406
382static int watchdog_nmi_enable(int cpu) 407static int watchdog_nmi_enable(unsigned int cpu)
383{ 408{
384 struct perf_event_attr *wd_attr; 409 struct perf_event_attr *wd_attr;
385 struct perf_event *event = per_cpu(watchdog_ev, cpu); 410 struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -433,7 +458,7 @@ out:
433 return 0; 458 return 0;
434} 459}
435 460
436static void watchdog_nmi_disable(int cpu) 461static void watchdog_nmi_disable(unsigned int cpu)
437{ 462{
438 struct perf_event *event = per_cpu(watchdog_ev, cpu); 463 struct perf_event *event = per_cpu(watchdog_ev, cpu);
439 464
@@ -447,107 +472,35 @@ static void watchdog_nmi_disable(int cpu)
447 return; 472 return;
448} 473}
449#else 474#else
450static int watchdog_nmi_enable(int cpu) { return 0; } 475static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
451static void watchdog_nmi_disable(int cpu) { return; } 476static void watchdog_nmi_disable(unsigned int cpu) { return; }
452#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 477#endif /* CONFIG_HARDLOCKUP_DETECTOR */
453 478
454/* prepare/enable/disable routines */ 479/* prepare/enable/disable routines */
455static void watchdog_prepare_cpu(int cpu)
456{
457 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
458
459 WARN_ON(per_cpu(softlockup_watchdog, cpu));
460 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
461 hrtimer->function = watchdog_timer_fn;
462}
463
464static int watchdog_enable(int cpu)
465{
466 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
467 int err = 0;
468
469 /* enable the perf event */
470 err = watchdog_nmi_enable(cpu);
471
472 /* Regardless of err above, fall through and start softlockup */
473
474 /* create the watchdog thread */
475 if (!p) {
476 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
477 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
478 if (IS_ERR(p)) {
479 pr_err("softlockup watchdog for %i failed\n", cpu);
480 if (!err) {
481 /* if hardlockup hasn't already set this */
482 err = PTR_ERR(p);
483 /* and disable the perf event */
484 watchdog_nmi_disable(cpu);
485 }
486 goto out;
487 }
488 sched_setscheduler(p, SCHED_FIFO, &param);
489 kthread_bind(p, cpu);
490 per_cpu(watchdog_touch_ts, cpu) = 0;
491 per_cpu(softlockup_watchdog, cpu) = p;
492 wake_up_process(p);
493 }
494
495out:
496 return err;
497}
498
499static void watchdog_disable(int cpu)
500{
501 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
502 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
503
504 /*
505 * cancel the timer first to stop incrementing the stats
506 * and waking up the kthread
507 */
508 hrtimer_cancel(hrtimer);
509
510 /* disable the perf event */
511 watchdog_nmi_disable(cpu);
512
513 /* stop the watchdog thread */
514 if (p) {
515 per_cpu(softlockup_watchdog, cpu) = NULL;
516 kthread_stop(p);
517 }
518}
519
520/* sysctl functions */ 480/* sysctl functions */
521#ifdef CONFIG_SYSCTL 481#ifdef CONFIG_SYSCTL
522static void watchdog_enable_all_cpus(void) 482static void watchdog_enable_all_cpus(void)
523{ 483{
524 int cpu; 484 unsigned int cpu;
525
526 watchdog_enabled = 0;
527
528 for_each_online_cpu(cpu)
529 if (!watchdog_enable(cpu))
530 /* if any cpu succeeds, watchdog is considered
531 enabled for the system */
532 watchdog_enabled = 1;
533
534 if (!watchdog_enabled)
535 pr_err("failed to be enabled on some cpus\n");
536 485
486 if (watchdog_disabled) {
487 watchdog_disabled = 0;
488 for_each_online_cpu(cpu)
489 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
490 }
537} 491}
538 492
539static void watchdog_disable_all_cpus(void) 493static void watchdog_disable_all_cpus(void)
540{ 494{
541 int cpu; 495 unsigned int cpu;
542
543 for_each_online_cpu(cpu)
544 watchdog_disable(cpu);
545 496
546 /* if all watchdogs are disabled, then they are disabled for the system */ 497 if (!watchdog_disabled) {
547 watchdog_enabled = 0; 498 watchdog_disabled = 1;
499 for_each_online_cpu(cpu)
500 kthread_park(per_cpu(softlockup_watchdog, cpu));
501 }
548} 502}
549 503
550
551/* 504/*
552 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh 505 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
553 */ 506 */
@@ -557,73 +510,38 @@ int proc_dowatchdog(struct ctl_table *table, int write,
557{ 510{
558 int ret; 511 int ret;
559 512
513 if (watchdog_disabled < 0)
514 return -ENODEV;
515
560 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 516 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
561 if (ret || !write) 517 if (ret || !write)
562 goto out; 518 return ret;
563 519
520 set_sample_period();
564 if (watchdog_enabled && watchdog_thresh) 521 if (watchdog_enabled && watchdog_thresh)
565 watchdog_enable_all_cpus(); 522 watchdog_enable_all_cpus();
566 else 523 else
567 watchdog_disable_all_cpus(); 524 watchdog_disable_all_cpus();
568 525
569out:
570 return ret; 526 return ret;
571} 527}
572#endif /* CONFIG_SYSCTL */ 528#endif /* CONFIG_SYSCTL */
573 529
574 530static struct smp_hotplug_thread watchdog_threads = {
575/* 531 .store = &softlockup_watchdog,
576 * Create/destroy watchdog threads as CPUs come and go: 532 .thread_should_run = watchdog_should_run,
577 */ 533 .thread_fn = watchdog,
578static int __cpuinit 534 .thread_comm = "watchdog/%u",
579cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 535 .setup = watchdog_enable,
580{ 536 .park = watchdog_disable,
581 int hotcpu = (unsigned long)hcpu; 537 .unpark = watchdog_enable,
582
583 switch (action) {
584 case CPU_UP_PREPARE:
585 case CPU_UP_PREPARE_FROZEN:
586 watchdog_prepare_cpu(hotcpu);
587 break;
588 case CPU_ONLINE:
589 case CPU_ONLINE_FROZEN:
590 if (watchdog_enabled)
591 watchdog_enable(hotcpu);
592 break;
593#ifdef CONFIG_HOTPLUG_CPU
594 case CPU_UP_CANCELED:
595 case CPU_UP_CANCELED_FROZEN:
596 watchdog_disable(hotcpu);
597 break;
598 case CPU_DEAD:
599 case CPU_DEAD_FROZEN:
600 watchdog_disable(hotcpu);
601 break;
602#endif /* CONFIG_HOTPLUG_CPU */
603 }
604
605 /*
606 * hardlockup and softlockup are not important enough
607 * to block cpu bring up. Just always succeed and
608 * rely on printk output to flag problems.
609 */
610 return NOTIFY_OK;
611}
612
613static struct notifier_block __cpuinitdata cpu_nfb = {
614 .notifier_call = cpu_callback
615}; 538};
616 539
617void __init lockup_detector_init(void) 540void __init lockup_detector_init(void)
618{ 541{
619 void *cpu = (void *)(long)smp_processor_id(); 542 set_sample_period();
620 int err; 543 if (smpboot_register_percpu_thread(&watchdog_threads)) {
621 544 pr_err("Failed to create watchdog threads, disabled\n");
622 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 545 watchdog_disabled = -ENODEV;
623 WARN_ON(notifier_to_errno(err)); 546 }
624
625 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
626 register_cpu_notifier(&cpu_nfb);
627
628 return;
629} 547}
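The watchdog_threads descriptor and lockup_detector_init() above are a complete example of the new smpboot per-CPU thread API. As a generic illustration, a hypothetical user (all my_* names are made up, and the callback signatures are as I recall them from include/linux/smpboot.h) would register roughly like this:

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, my_task);

static int my_should_run(unsigned int cpu)
{
	return 0;	/* return nonzero when there is work for my_thread_fn() */
}

static void my_thread_fn(unsigned int cpu)
{
	/* one unit of work; called with the thread bound to @cpu */
}

static struct smp_hotplug_thread my_threads = {
	.store			= &my_task,
	.thread_should_run	= my_should_run,
	.thread_fn		= my_thread_fn,
	.thread_comm		= "my_thread/%u",
};

static int __init my_init(void)
{
	/* creates one thread per online CPU and follows CPU hotplug via park/unpark */
	return smpboot_register_percpu_thread(&my_threads);
}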
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3c5a79e2134c..fbc6576a83c3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -58,7 +58,7 @@ enum {
58 * be executing on any CPU. The gcwq behaves as an unbound one. 58 * be executing on any CPU. The gcwq behaves as an unbound one.
59 * 59 *
60 * Note that DISASSOCIATED can be flipped only while holding 60 * Note that DISASSOCIATED can be flipped only while holding
61 * managership of all pools on the gcwq to avoid changing binding 61 * assoc_mutex of all pools on the gcwq to avoid changing binding
62 * state while create_worker() is in progress. 62 * state while create_worker() is in progress.
63 */ 63 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ 64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
@@ -73,11 +73,10 @@ enum {
73 WORKER_DIE = 1 << 1, /* die die die */ 73 WORKER_DIE = 1 << 1, /* die die die */
74 WORKER_IDLE = 1 << 2, /* is idle */ 74 WORKER_IDLE = 1 << 2, /* is idle */
75 WORKER_PREP = 1 << 3, /* preparing to run works */ 75 WORKER_PREP = 1 << 3, /* preparing to run works */
76 WORKER_REBIND = 1 << 5, /* mom is home, come back */
77 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 76 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
78 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 77 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
79 78
80 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | 79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
81 WORKER_CPU_INTENSIVE, 80 WORKER_CPU_INTENSIVE,
82 81
83 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ 82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
@@ -126,7 +125,6 @@ enum {
126 125
127struct global_cwq; 126struct global_cwq;
128struct worker_pool; 127struct worker_pool;
129struct idle_rebind;
130 128
131/* 129/*
132 * The poor guys doing the actual heavy lifting. All on-duty workers 130 * The poor guys doing the actual heavy lifting. All on-duty workers
@@ -150,7 +148,6 @@ struct worker {
150 int id; /* I: worker id */ 148 int id; /* I: worker id */
151 149
152 /* for rebinding worker to CPU */ 150 /* for rebinding worker to CPU */
153 struct idle_rebind *idle_rebind; /* L: for idle worker */
154 struct work_struct rebind_work; /* L: for busy worker */ 151 struct work_struct rebind_work; /* L: for busy worker */
155}; 152};
156 153
@@ -160,13 +157,15 @@ struct worker_pool {
160 157
161 struct list_head worklist; /* L: list of pending works */ 158 struct list_head worklist; /* L: list of pending works */
162 int nr_workers; /* L: total number of workers */ 159 int nr_workers; /* L: total number of workers */
160
161 /* nr_idle includes the ones off idle_list for rebinding */
163 int nr_idle; /* L: currently idle ones */ 162 int nr_idle; /* L: currently idle ones */
164 163
165 struct list_head idle_list; /* X: list of idle workers */ 164 struct list_head idle_list; /* X: list of idle workers */
166 struct timer_list idle_timer; /* L: worker idle timeout */ 165 struct timer_list idle_timer; /* L: worker idle timeout */
167 struct timer_list mayday_timer; /* L: SOS timer for workers */ 166 struct timer_list mayday_timer; /* L: SOS timer for workers */
168 167
169 struct mutex manager_mutex; /* mutex manager should hold */ 168 struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */
170 struct ida worker_ida; /* L: for worker IDs */ 169 struct ida worker_ida; /* L: for worker IDs */
171}; 170};
172 171
@@ -184,9 +183,8 @@ struct global_cwq {
184 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; 183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
185 /* L: hash of busy workers */ 184 /* L: hash of busy workers */
186 185
187 struct worker_pool pools[2]; /* normal and highpri pools */ 186 struct worker_pool pools[NR_WORKER_POOLS];
188 187 /* normal and highpri pools */
189 wait_queue_head_t rebind_hold; /* rebind hold wait */
190} ____cacheline_aligned_in_smp; 188} ____cacheline_aligned_in_smp;
191 189
192/* 190/*
@@ -269,17 +267,15 @@ struct workqueue_struct {
269}; 267};
270 268
271struct workqueue_struct *system_wq __read_mostly; 269struct workqueue_struct *system_wq __read_mostly;
272struct workqueue_struct *system_long_wq __read_mostly;
273struct workqueue_struct *system_nrt_wq __read_mostly;
274struct workqueue_struct *system_unbound_wq __read_mostly;
275struct workqueue_struct *system_freezable_wq __read_mostly;
276struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
277EXPORT_SYMBOL_GPL(system_wq); 270EXPORT_SYMBOL_GPL(system_wq);
271struct workqueue_struct *system_highpri_wq __read_mostly;
272EXPORT_SYMBOL_GPL(system_highpri_wq);
273struct workqueue_struct *system_long_wq __read_mostly;
278EXPORT_SYMBOL_GPL(system_long_wq); 274EXPORT_SYMBOL_GPL(system_long_wq);
279EXPORT_SYMBOL_GPL(system_nrt_wq); 275struct workqueue_struct *system_unbound_wq __read_mostly;
280EXPORT_SYMBOL_GPL(system_unbound_wq); 276EXPORT_SYMBOL_GPL(system_unbound_wq);
277struct workqueue_struct *system_freezable_wq __read_mostly;
281EXPORT_SYMBOL_GPL(system_freezable_wq); 278EXPORT_SYMBOL_GPL(system_freezable_wq);
282EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
283 279
284#define CREATE_TRACE_POINTS 280#define CREATE_TRACE_POINTS
285#include <trace/events/workqueue.h> 281#include <trace/events/workqueue.h>
@@ -534,18 +530,24 @@ static int work_next_color(int color)
534} 530}
535 531
536/* 532/*
537 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the 533 * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data
537 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the 533 * While queued, %WORK_STRUCT_CWQ is set and non-flag bits of a work's data
538 * work is on queue. Once execution starts, WORK_STRUCT_CWQ is 534 * contain the pointer to the queued cwq. Once execution starts, the flag
539 * cleared and the work data contains the cpu number it was last on. 535 * is cleared and the high bits contain OFFQ flags and CPU number.
540 * 536 *
541 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the 537 * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
542 * cwq, cpu or clear work->data. These functions should only be 538 * and clear_work_data() can be used to set the cwq, cpu or clear
543 * called while the work is owned - ie. while the PENDING bit is set. 539 * work->data. These functions should only be called while the work is
540 * owned - ie. while the PENDING bit is set.
544 * 541 *
545 * get_work_[g]cwq() can be used to obtain the gcwq or cwq 542 * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to
546 * corresponding to a work. gcwq is available once the work has been 543 * a work. gcwq is available once the work has been queued anywhere after
547 * queued anywhere after initialization. cwq is available only from 544 * initialization until it is sync canceled. cwq is available only while
548 * queueing until execution starts. 545 * the work item is queued.
546 *
547 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
548 * canceled. While being canceled, a work item may have its PENDING set
549 * but stay off timer and worklist for arbitrarily long and nobody should
550 * try to steal the PENDING bit.
549 */ 551 */
550static inline void set_work_data(struct work_struct *work, unsigned long data, 552static inline void set_work_data(struct work_struct *work, unsigned long data,
551 unsigned long flags) 553 unsigned long flags)
@@ -562,13 +564,22 @@ static void set_work_cwq(struct work_struct *work,
562 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); 564 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
563} 565}
564 566
565static void set_work_cpu(struct work_struct *work, unsigned int cpu) 567static void set_work_cpu_and_clear_pending(struct work_struct *work,
568 unsigned int cpu)
566{ 569{
567 set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); 570 /*
571 * The following wmb is paired with the implied mb in
572 * test_and_set_bit(PENDING) and ensures all updates to @work made
573 * here are visible to and precede any updates by the next PENDING
574 * owner.
575 */
576 smp_wmb();
577 set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
568} 578}
569 579
570static void clear_work_data(struct work_struct *work) 580static void clear_work_data(struct work_struct *work)
571{ 581{
582 smp_wmb(); /* see set_work_cpu_and_clear_pending() */
572 set_work_data(work, WORK_STRUCT_NO_CPU, 0); 583 set_work_data(work, WORK_STRUCT_NO_CPU, 0);
573} 584}
574 585
@@ -591,7 +602,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
591 return ((struct cpu_workqueue_struct *) 602 return ((struct cpu_workqueue_struct *)
592 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; 603 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
593 604
594 cpu = data >> WORK_STRUCT_FLAG_BITS; 605 cpu = data >> WORK_OFFQ_CPU_SHIFT;
595 if (cpu == WORK_CPU_NONE) 606 if (cpu == WORK_CPU_NONE)
596 return NULL; 607 return NULL;
597 608
@@ -599,6 +610,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
599 return get_gcwq(cpu); 610 return get_gcwq(cpu);
600} 611}
601 612
613static void mark_work_canceling(struct work_struct *work)
614{
615 struct global_cwq *gcwq = get_work_gcwq(work);
616 unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
617
618 set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
619 WORK_STRUCT_PENDING);
620}
621
622static bool work_is_canceling(struct work_struct *work)
623{
624 unsigned long data = atomic_long_read(&work->data);
625
626 return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
627}
628
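To make the off-queue encoding described in the comment above concrete, here is a small illustrative sketch of decoding the data word. The my_* helpers are hypothetical; the real accessors are get_work_cwq()/get_work_gcwq() and the canceling helpers just introduced:

/* illustrative decoders, assuming the bit layout documented above */
static bool my_work_is_queued(struct work_struct *work)
{
	/* while queued, data holds the cwq pointer plus WORK_STRUCT_CWQ */
	return atomic_long_read(&work->data) & WORK_STRUCT_CWQ;
}

static unsigned int my_work_last_cpu(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	/* off queue only: OFFQ flags sit in the low bits, last CPU in the high bits */
	return data >> WORK_OFFQ_CPU_SHIFT;
}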
602/* 629/*
603 * Policy functions. These define the policies on how the global worker 630 * Policy functions. These define the policies on how the global worker
604 * pools are managed. Unless noted otherwise, these functions assume that 631 * pools are managed. Unless noted otherwise, these functions assume that
@@ -657,6 +684,13 @@ static bool too_many_workers(struct worker_pool *pool)
657 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 684 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
658 int nr_busy = pool->nr_workers - nr_idle; 685 int nr_busy = pool->nr_workers - nr_idle;
659 686
687 /*
688 * nr_idle and idle_list may disagree if idle rebinding is in
689 * progress. Never return %true if idle_list is empty.
690 */
691 if (list_empty(&pool->idle_list))
692 return false;
693
660 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 694 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
661} 695}
662 696
@@ -705,8 +739,10 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
705{ 739{
706 struct worker *worker = kthread_data(task); 740 struct worker *worker = kthread_data(task);
707 741
708 if (!(worker->flags & WORKER_NOT_RUNNING)) 742 if (!(worker->flags & WORKER_NOT_RUNNING)) {
743 WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu);
709 atomic_inc(get_pool_nr_running(worker->pool)); 744 atomic_inc(get_pool_nr_running(worker->pool));
745 }
710} 746}
711 747
712/** 748/**
@@ -903,6 +939,206 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
903} 939}
904 940
905/** 941/**
942 * move_linked_works - move linked works to a list
943 * @work: start of series of works to be scheduled
944 * @head: target list to append @work to
945 * @nextp: out parameter for nested worklist walking
946 *
947 * Schedule linked works starting from @work to @head. Work series to
948 * be scheduled starts at @work and includes any consecutive work with
949 * WORK_STRUCT_LINKED set in its predecessor.
950 *
951 * If @nextp is not NULL, it's updated to point to the next work of
952 * the last scheduled work. This allows move_linked_works() to be
953 * nested inside outer list_for_each_entry_safe().
954 *
955 * CONTEXT:
956 * spin_lock_irq(gcwq->lock).
957 */
958static void move_linked_works(struct work_struct *work, struct list_head *head,
959 struct work_struct **nextp)
960{
961 struct work_struct *n;
962
963 /*
964 * Linked worklist will always end before the end of the list,
965 * use NULL for list head.
966 */
967 list_for_each_entry_safe_from(work, n, NULL, entry) {
968 list_move_tail(&work->entry, head);
969 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
970 break;
971 }
972
973 /*
974 * If we're already inside safe list traversal and have moved
975 * multiple works to the scheduled queue, the next position
976 * needs to be updated.
977 */
978 if (nextp)
979 *nextp = n;
980}
981
982static void cwq_activate_delayed_work(struct work_struct *work)
983{
984 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
985
986 trace_workqueue_activate_work(work);
987 move_linked_works(work, &cwq->pool->worklist, NULL);
988 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
989 cwq->nr_active++;
990}
991
992static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
993{
994 struct work_struct *work = list_first_entry(&cwq->delayed_works,
995 struct work_struct, entry);
996
997 cwq_activate_delayed_work(work);
998}
999
1000/**
1001 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1002 * @cwq: cwq of interest
1003 * @color: color of work which left the queue
1004 *
1005 * A work either has completed or is removed from pending queue,
1006 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1007 *
1008 * CONTEXT:
1009 * spin_lock_irq(gcwq->lock).
1010 */
1011static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1012{
1013 /* ignore uncolored works */
1014 if (color == WORK_NO_COLOR)
1015 return;
1016
1017 cwq->nr_in_flight[color]--;
1018
1019 cwq->nr_active--;
1020 if (!list_empty(&cwq->delayed_works)) {
1021 /* one down, submit a delayed one */
1022 if (cwq->nr_active < cwq->max_active)
1023 cwq_activate_first_delayed(cwq);
1024 }
1025
1026 /* is flush in progress and are we at the flushing tip? */
1027 if (likely(cwq->flush_color != color))
1028 return;
1029
1030 /* are there still in-flight works? */
1031 if (cwq->nr_in_flight[color])
1032 return;
1033
1034 /* this cwq is done, clear flush_color */
1035 cwq->flush_color = -1;
1036
1037 /*
1038 * If this was the last cwq, wake up the first flusher. It
1039 * will handle the rest.
1040 */
1041 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1042 complete(&cwq->wq->first_flusher->done);
1043}
1044
1045/**
1046 * try_to_grab_pending - steal work item from worklist and disable irq
1047 * @work: work item to steal
1048 * @is_dwork: @work is a delayed_work
1049 * @flags: place to store irq state
1050 *
1051 * Try to grab PENDING bit of @work. This function can handle @work in any
1052 * stable state - idle, on timer or on worklist. Return values are
1053 *
1054 * 1 if @work was pending and we successfully stole PENDING
1055 * 0 if @work was idle and we claimed PENDING
1056 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
1057 * -ENOENT if someone else is canceling @work, this state may persist
1058 * for arbitrarily long
1059 *
1060 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
1061 * interrupted while holding PENDING and @work off queue, irq must be
1062 * disabled on entry. This, combined with delayed_work->timer being
1063 * irqsafe, ensures that we return -EAGAIN only for a finite, short period of time.
1064 *
1065 * On successful return, >= 0, irq is disabled and the caller is
1066 * responsible for releasing it using local_irq_restore(*@flags).
1067 *
1068 * This function is safe to call from any context including IRQ handler.
1069 */
1070static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1071 unsigned long *flags)
1072{
1073 struct global_cwq *gcwq;
1074
1075 local_irq_save(*flags);
1076
1077 /* try to steal the timer if it exists */
1078 if (is_dwork) {
1079 struct delayed_work *dwork = to_delayed_work(work);
1080
1081 /*
1082 * dwork->timer is irqsafe. If del_timer() fails, it's
1083 * guaranteed that the timer is not queued anywhere and not
1084 * running on the local CPU.
1085 */
1086 if (likely(del_timer(&dwork->timer)))
1087 return 1;
1088 }
1089
1090 /* try to claim PENDING the normal way */
1091 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
1092 return 0;
1093
1094 /*
1095 * The queueing is in progress, or it is already queued. Try to
1096 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1097 */
1098 gcwq = get_work_gcwq(work);
1099 if (!gcwq)
1100 goto fail;
1101
1102 spin_lock(&gcwq->lock);
1103 if (!list_empty(&work->entry)) {
1104 /*
1105 * This work is queued, but perhaps we locked the wrong gcwq.
1106 * In that case we must see the new value after rmb(), see
1107 * insert_work()->wmb().
1108 */
1109 smp_rmb();
1110 if (gcwq == get_work_gcwq(work)) {
1111 debug_work_deactivate(work);
1112
1113 /*
1114 * A delayed work item cannot be grabbed directly
1115 * because it might have linked NO_COLOR work items
1116 * which, if left on the delayed_list, will confuse
1117 * cwq->nr_active management later on and cause
1118 * stall. Make sure the work item is activated
1119 * before grabbing.
1120 */
1121 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1122 cwq_activate_delayed_work(work);
1123
1124 list_del_init(&work->entry);
1125 cwq_dec_nr_in_flight(get_work_cwq(work),
1126 get_work_color(work));
1127
1128 spin_unlock(&gcwq->lock);
1129 return 1;
1130 }
1131 }
1132 spin_unlock(&gcwq->lock);
1133fail:
1134 local_irq_restore(*flags);
1135 if (work_is_canceling(work))
1136 return -ENOENT;
1137 cpu_relax();
1138 return -EAGAIN;
1139}
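The contract above is easiest to see from the caller side. Below is a sketch of the canonical in-file caller pattern, mirroring what mod_delayed_work_on() and __cancel_work_timer() do further down; my_requeue() is a made-up name:

static void my_requeue(struct work_struct *work)
{
	unsigned long flags;
	int ret;

	do {
		ret = try_to_grab_pending(work, false, &flags);
	} while (ret == -EAGAIN);		/* transient, busy-retry is fine */

	if (ret >= 0) {
		/* we own PENDING and irqs are off: requeue, then release */
		__queue_work(WORK_CPU_UNBOUND, system_wq, work);
		local_irq_restore(flags);
	}
	/* ret == -ENOENT: someone else is canceling @work, leave it alone */
}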
1140
1141/**
906 * insert_work - insert a work into gcwq 1142 * insert_work - insert a work into gcwq
907 * @cwq: cwq @work belongs to 1143 * @cwq: cwq @work belongs to
908 * @work: work to insert 1144 * @work: work to insert
@@ -982,7 +1218,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
982 struct cpu_workqueue_struct *cwq; 1218 struct cpu_workqueue_struct *cwq;
983 struct list_head *worklist; 1219 struct list_head *worklist;
984 unsigned int work_flags; 1220 unsigned int work_flags;
985 unsigned long flags; 1221 unsigned int req_cpu = cpu;
1222
1223 /*
1224 * While a work item is PENDING && off queue, a task trying to
1225 * steal the PENDING will busy-loop waiting for it to either get
1226 * queued or lose PENDING. Grabbing PENDING and queueing should
1227 * happen with IRQ disabled.
1228 */
1229 WARN_ON_ONCE(!irqs_disabled());
986 1230
987 debug_work_activate(work); 1231 debug_work_activate(work);
988 1232
@@ -995,21 +1239,22 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
995 if (!(wq->flags & WQ_UNBOUND)) { 1239 if (!(wq->flags & WQ_UNBOUND)) {
996 struct global_cwq *last_gcwq; 1240 struct global_cwq *last_gcwq;
997 1241
998 if (unlikely(cpu == WORK_CPU_UNBOUND)) 1242 if (cpu == WORK_CPU_UNBOUND)
999 cpu = raw_smp_processor_id(); 1243 cpu = raw_smp_processor_id();
1000 1244
1001 /* 1245 /*
1002 * It's multi cpu. If @wq is non-reentrant and @work 1246 * It's multi cpu. If @work was previously on a different
1003 * was previously on a different cpu, it might still 1247 * cpu, it might still be running there, in which case the
1004 * be running there, in which case the work needs to 1248 * work needs to be queued on that cpu to guarantee
1005 * be queued on that cpu to guarantee non-reentrance. 1249 * non-reentrancy.
1006 */ 1250 */
1007 gcwq = get_gcwq(cpu); 1251 gcwq = get_gcwq(cpu);
1008 if (wq->flags & WQ_NON_REENTRANT && 1252 last_gcwq = get_work_gcwq(work);
1009 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { 1253
1254 if (last_gcwq && last_gcwq != gcwq) {
1010 struct worker *worker; 1255 struct worker *worker;
1011 1256
1012 spin_lock_irqsave(&last_gcwq->lock, flags); 1257 spin_lock(&last_gcwq->lock);
1013 1258
1014 worker = find_worker_executing_work(last_gcwq, work); 1259 worker = find_worker_executing_work(last_gcwq, work);
1015 1260
@@ -1017,22 +1262,23 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1017 gcwq = last_gcwq; 1262 gcwq = last_gcwq;
1018 else { 1263 else {
1019 /* meh... not running there, queue here */ 1264 /* meh... not running there, queue here */
1020 spin_unlock_irqrestore(&last_gcwq->lock, flags); 1265 spin_unlock(&last_gcwq->lock);
1021 spin_lock_irqsave(&gcwq->lock, flags); 1266 spin_lock(&gcwq->lock);
1022 } 1267 }
1023 } else 1268 } else {
1024 spin_lock_irqsave(&gcwq->lock, flags); 1269 spin_lock(&gcwq->lock);
1270 }
1025 } else { 1271 } else {
1026 gcwq = get_gcwq(WORK_CPU_UNBOUND); 1272 gcwq = get_gcwq(WORK_CPU_UNBOUND);
1027 spin_lock_irqsave(&gcwq->lock, flags); 1273 spin_lock(&gcwq->lock);
1028 } 1274 }
1029 1275
1030 /* gcwq determined, get cwq and queue */ 1276 /* gcwq determined, get cwq and queue */
1031 cwq = get_cwq(gcwq->cpu, wq); 1277 cwq = get_cwq(gcwq->cpu, wq);
1032 trace_workqueue_queue_work(cpu, cwq, work); 1278 trace_workqueue_queue_work(req_cpu, cwq, work);
1033 1279
1034 if (WARN_ON(!list_empty(&work->entry))) { 1280 if (WARN_ON(!list_empty(&work->entry))) {
1035 spin_unlock_irqrestore(&gcwq->lock, flags); 1281 spin_unlock(&gcwq->lock);
1036 return; 1282 return;
1037 } 1283 }
1038 1284
@@ -1050,134 +1296,220 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1050 1296
1051 insert_work(cwq, work, worklist, work_flags); 1297 insert_work(cwq, work, worklist, work_flags);
1052 1298
1053 spin_unlock_irqrestore(&gcwq->lock, flags); 1299 spin_unlock(&gcwq->lock);
1054} 1300}
1055 1301
1056/** 1302/**
1057 * queue_work - queue work on a workqueue 1303 * queue_work_on - queue work on specific cpu
1304 * @cpu: CPU number to execute work on
1058 * @wq: workqueue to use 1305 * @wq: workqueue to use
1059 * @work: work to queue 1306 * @work: work to queue
1060 * 1307 *
1061 * Returns 0 if @work was already on a queue, non-zero otherwise. 1308 * Returns %false if @work was already on a queue, %true otherwise.
1062 * 1309 *
1063 * We queue the work to the CPU on which it was submitted, but if the CPU dies 1310 * We queue the work to a specific CPU, the caller must ensure it
1064 * it can be processed by another CPU. 1311 * can't go away.
1065 */ 1312 */
1066int queue_work(struct workqueue_struct *wq, struct work_struct *work) 1313bool queue_work_on(int cpu, struct workqueue_struct *wq,
1314 struct work_struct *work)
1067{ 1315{
1068 int ret; 1316 bool ret = false;
1317 unsigned long flags;
1069 1318
1070 ret = queue_work_on(get_cpu(), wq, work); 1319 local_irq_save(flags);
1071 put_cpu(); 1320
1321 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1322 __queue_work(cpu, wq, work);
1323 ret = true;
1324 }
1072 1325
1326 local_irq_restore(flags);
1073 return ret; 1327 return ret;
1074} 1328}
1075EXPORT_SYMBOL_GPL(queue_work); 1329EXPORT_SYMBOL_GPL(queue_work_on);
1076 1330
1077/** 1331/**
1078 * queue_work_on - queue work on specific cpu 1332 * queue_work - queue work on a workqueue
1079 * @cpu: CPU number to execute work on
1080 * @wq: workqueue to use 1333 * @wq: workqueue to use
1081 * @work: work to queue 1334 * @work: work to queue
1082 * 1335 *
1083 * Returns 0 if @work was already on a queue, non-zero otherwise. 1336 * Returns %false if @work was already on a queue, %true otherwise.
1084 * 1337 *
1085 * We queue the work to a specific CPU, the caller must ensure it 1338 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1086 * can't go away. 1339 * it can be processed by another CPU.
1087 */ 1340 */
1088int 1341bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
1089queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
1090{ 1342{
1091 int ret = 0; 1343 return queue_work_on(WORK_CPU_UNBOUND, wq, work);
1092
1093 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1094 __queue_work(cpu, wq, work);
1095 ret = 1;
1096 }
1097 return ret;
1098} 1344}
1099EXPORT_SYMBOL_GPL(queue_work_on); 1345EXPORT_SYMBOL_GPL(queue_work);
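A minimal usage sketch for the reworked bool-returning interface; my_work, my_work_fn and my_submit are illustrative names:

#include <linux/kernel.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
	pr_info("my_work executed\n");
}

static DECLARE_WORK(my_work, my_work_fn);

static void my_submit(void)
{
	/* queue_work_on(1, system_wq, &my_work) would instead pin execution to CPU 1 */
	if (!queue_work(system_wq, &my_work))
		pr_info("my_work was already pending\n");
}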
1100 1346
1101static void delayed_work_timer_fn(unsigned long __data) 1347void delayed_work_timer_fn(unsigned long __data)
1102{ 1348{
1103 struct delayed_work *dwork = (struct delayed_work *)__data; 1349 struct delayed_work *dwork = (struct delayed_work *)__data;
1104 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); 1350 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1105 1351
1106 __queue_work(smp_processor_id(), cwq->wq, &dwork->work); 1352 /* should have been called from irqsafe timer with irq already off */
1353 __queue_work(dwork->cpu, cwq->wq, &dwork->work);
1354}
1355EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
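delayed_work_timer_fn() is exported here because every delayed_work's timer is now expected to point at it; __queue_delayed_work() below sanity-checks exactly that. In practice this simply means using the standard initializers. A sketch with illustrative names:

static struct delayed_work my_dwork;

static void my_dwork_fn(struct work_struct *work)
{
	/* use to_delayed_work(work) if the containing delayed_work is needed */
}

static void my_setup(void)
{
	/*
	 * Sets my_dwork.timer.function = delayed_work_timer_fn and
	 * .data = (unsigned long)&my_dwork, which is what the
	 * WARN_ON_ONCE() checks in __queue_delayed_work() expect.
	 */
	INIT_DELAYED_WORK(&my_dwork, my_dwork_fn);
}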
1356
1357static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1358 struct delayed_work *dwork, unsigned long delay)
1359{
1360 struct timer_list *timer = &dwork->timer;
1361 struct work_struct *work = &dwork->work;
1362 unsigned int lcpu;
1363
1364 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1365 timer->data != (unsigned long)dwork);
1366 WARN_ON_ONCE(timer_pending(timer));
1367 WARN_ON_ONCE(!list_empty(&work->entry));
1368
1369 /*
1370 * If @delay is 0, queue @dwork->work immediately. This is for
1371 * both optimization and correctness. The earliest @timer can
1372 * expire is on the closest next tick and delayed_work users depend
1373 * on there being no such delay when @delay is 0.
1374 */
1375 if (!delay) {
1376 __queue_work(cpu, wq, &dwork->work);
1377 return;
1378 }
1379
1380 timer_stats_timer_set_start_info(&dwork->timer);
1381
1382 /*
1383 * This stores cwq for the moment, for the timer_fn. Note that the
1384 * work's gcwq is preserved to allow reentrance detection for
1385 * delayed works.
1386 */
1387 if (!(wq->flags & WQ_UNBOUND)) {
1388 struct global_cwq *gcwq = get_work_gcwq(work);
1389
1390 /*
1391 * If we cannot get the last gcwq from @work directly,
1392 * select the last CPU such that it avoids unnecessarily
1393 * triggering non-reentrancy check in __queue_work().
1394 */
1395 lcpu = cpu;
1396 if (gcwq)
1397 lcpu = gcwq->cpu;
1398 if (lcpu == WORK_CPU_UNBOUND)
1399 lcpu = raw_smp_processor_id();
1400 } else {
1401 lcpu = WORK_CPU_UNBOUND;
1402 }
1403
1404 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1405
1406 dwork->cpu = cpu;
1407 timer->expires = jiffies + delay;
1408
1409 if (unlikely(cpu != WORK_CPU_UNBOUND))
1410 add_timer_on(timer, cpu);
1411 else
1412 add_timer(timer);
1107} 1413}
1108 1414
1109/** 1415/**
1416 * queue_delayed_work_on - queue work on specific CPU after delay
1417 * @cpu: CPU number to execute work on
1418 * @wq: workqueue to use
1419 * @dwork: work to queue
1420 * @delay: number of jiffies to wait before queueing
1421 *
1422 * Returns %false if @work was already on a queue, %true otherwise. If
1423 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1424 * execution.
1425 */
1426bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1427 struct delayed_work *dwork, unsigned long delay)
1428{
1429 struct work_struct *work = &dwork->work;
1430 bool ret = false;
1431 unsigned long flags;
1432
1433 /* read the comment in __queue_work() */
1434 local_irq_save(flags);
1435
1436 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1437 __queue_delayed_work(cpu, wq, dwork, delay);
1438 ret = true;
1439 }
1440
1441 local_irq_restore(flags);
1442 return ret;
1443}
1444EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1445
1446/**
1110 * queue_delayed_work - queue work on a workqueue after delay 1447 * queue_delayed_work - queue work on a workqueue after delay
1111 * @wq: workqueue to use 1448 * @wq: workqueue to use
1112 * @dwork: delayable work to queue 1449 * @dwork: delayable work to queue
1113 * @delay: number of jiffies to wait before queueing 1450 * @delay: number of jiffies to wait before queueing
1114 * 1451 *
1115 * Returns 0 if @work was already on a queue, non-zero otherwise. 1452 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
1116 */ 1453 */
1117int queue_delayed_work(struct workqueue_struct *wq, 1454bool queue_delayed_work(struct workqueue_struct *wq,
1118 struct delayed_work *dwork, unsigned long delay) 1455 struct delayed_work *dwork, unsigned long delay)
1119{ 1456{
1120 if (delay == 0) 1457 return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1121 return queue_work(wq, &dwork->work);
1122
1123 return queue_delayed_work_on(-1, wq, dwork, delay);
1124} 1458}
1125EXPORT_SYMBOL_GPL(queue_delayed_work); 1459EXPORT_SYMBOL_GPL(queue_delayed_work);
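Callers see nothing new here beyond the bool return and the documented zero-delay behavior. A quick sketch, using the same includes as the queue_work() example earlier and illustrative names:

static void my_poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_poll, my_poll_fn);

static void my_poll_fn(struct work_struct *work)
{
	/* re-arm: run again roughly one second from now */
	queue_delayed_work(system_wq, &my_poll, HZ);
}

static void my_start(void)
{
	/* @delay == 0 skips the timer and queues immediately, as noted above */
	queue_delayed_work(system_wq, &my_poll, 0);
}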
1126 1460
1127/** 1461/**
1128 * queue_delayed_work_on - queue work on specific CPU after delay 1462 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
1129 * @cpu: CPU number to execute work on 1463 * @cpu: CPU number to execute work on
1130 * @wq: workqueue to use 1464 * @wq: workqueue to use
1131 * @dwork: work to queue 1465 * @dwork: work to queue
1132 * @delay: number of jiffies to wait before queueing 1466 * @delay: number of jiffies to wait before queueing
1133 * 1467 *
1134 * Returns 0 if @work was already on a queue, non-zero otherwise. 1468 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
1469 * modify @dwork's timer so that it expires after @delay. If @delay is
1470 * zero, @work is guaranteed to be scheduled immediately regardless of its
1471 * current state.
1472 *
1473 * Returns %false if @dwork was idle and queued, %true if @dwork was
1474 * pending and its timer was modified.
1475 *
1476 * This function is safe to call from any context including IRQ handler.
1477 * See try_to_grab_pending() for details.
1135 */ 1478 */
1136int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 1479bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1137 struct delayed_work *dwork, unsigned long delay) 1480 struct delayed_work *dwork, unsigned long delay)
1138{ 1481{
1139 int ret = 0; 1482 unsigned long flags;
1140 struct timer_list *timer = &dwork->timer; 1483 int ret;
1141 struct work_struct *work = &dwork->work;
1142
1143 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1144 unsigned int lcpu;
1145
1146 BUG_ON(timer_pending(timer));
1147 BUG_ON(!list_empty(&work->entry));
1148
1149 timer_stats_timer_set_start_info(&dwork->timer);
1150
1151 /*
1152 * This stores cwq for the moment, for the timer_fn.
1153 * Note that the work's gcwq is preserved to allow
1154 * reentrance detection for delayed works.
1155 */
1156 if (!(wq->flags & WQ_UNBOUND)) {
1157 struct global_cwq *gcwq = get_work_gcwq(work);
1158
1159 if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1160 lcpu = gcwq->cpu;
1161 else
1162 lcpu = raw_smp_processor_id();
1163 } else
1164 lcpu = WORK_CPU_UNBOUND;
1165
1166 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1167 1484
1168 timer->expires = jiffies + delay; 1485 do {
1169 timer->data = (unsigned long)dwork; 1486 ret = try_to_grab_pending(&dwork->work, true, &flags);
1170 timer->function = delayed_work_timer_fn; 1487 } while (unlikely(ret == -EAGAIN));
1171 1488
1172 if (unlikely(cpu >= 0)) 1489 if (likely(ret >= 0)) {
1173 add_timer_on(timer, cpu); 1490 __queue_delayed_work(cpu, wq, dwork, delay);
1174 else 1491 local_irq_restore(flags);
1175 add_timer(timer);
1176 ret = 1;
1177 } 1492 }
1493
1494 /* -ENOENT from try_to_grab_pending() becomes %true */
1178 return ret; 1495 return ret;
1179} 1496}
1180EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1497EXPORT_SYMBOL_GPL(mod_delayed_work_on);
1498
1499/**
1500 * mod_delayed_work - modify delay of or queue a delayed work
1501 * @wq: workqueue to use
1502 * @dwork: work to queue
1503 * @delay: number of jiffies to wait before queueing
1504 *
1505 * mod_delayed_work_on() on local CPU.
1506 */
1507bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
1508 unsigned long delay)
1509{
1510 return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1511}
1512EXPORT_SYMBOL_GPL(mod_delayed_work);
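mod_delayed_work()/mod_delayed_work_on() are the new pieces of API in this series. Their typical use is the "push the deadline back" or debounce pattern sketched below with made-up names; per the comment above, this is safe even from hard-IRQ context:

#include <linux/workqueue.h>

#define MY_SETTLE_DELAY		(HZ / 10)

static void my_debounce_fn(struct work_struct *work)
{
	/* runs once, MY_SETTLE_DELAY after the last event */
}

static DECLARE_DELAYED_WORK(my_debounce_work, my_debounce_fn);

static void my_event(void)
{
	/* pending or not, (re)arm the timer to fire MY_SETTLE_DELAY from now */
	mod_delayed_work(system_wq, &my_debounce_work, MY_SETTLE_DELAY);
}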
1181 1513
1182/** 1514/**
1183 * worker_enter_idle - enter idle state 1515 * worker_enter_idle - enter idle state
@@ -1305,37 +1637,21 @@ __acquires(&gcwq->lock)
1305 } 1637 }
1306} 1638}
1307 1639
1308struct idle_rebind {
1309 int cnt; /* # workers to be rebound */
1310 struct completion done; /* all workers rebound */
1311};
1312
1313/* 1640/*
1314 * Rebind an idle @worker to its CPU. During CPU onlining, this has to 1641 * Rebind an idle @worker to its CPU. worker_thread() will test
1315 * happen synchronously for idle workers. worker_thread() will test 1642 * list_empty(@worker->entry) before leaving idle and call this function.
1316 * %WORKER_REBIND before leaving idle and call this function.
1317 */ 1643 */
1318static void idle_worker_rebind(struct worker *worker) 1644static void idle_worker_rebind(struct worker *worker)
1319{ 1645{
1320 struct global_cwq *gcwq = worker->pool->gcwq; 1646 struct global_cwq *gcwq = worker->pool->gcwq;
1321 1647
1322 /* CPU must be online at this point */ 1648 /* CPU may go down again in between, clear UNBOUND only on success */
1323 WARN_ON(!worker_maybe_bind_and_lock(worker)); 1649 if (worker_maybe_bind_and_lock(worker))
1324 if (!--worker->idle_rebind->cnt) 1650 worker_clr_flags(worker, WORKER_UNBOUND);
1325 complete(&worker->idle_rebind->done);
1326 spin_unlock_irq(&worker->pool->gcwq->lock);
1327 1651
1328 /* we did our part, wait for rebind_workers() to finish up */ 1652 /* rebind complete, become available again */
1329 wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); 1653 list_add(&worker->entry, &worker->pool->idle_list);
1330 1654 spin_unlock_irq(&gcwq->lock);
1331 /*
1332 * rebind_workers() shouldn't finish until all workers passed the
1333 * above WORKER_REBIND wait. Tell it when done.
1334 */
1335 spin_lock_irq(&worker->pool->gcwq->lock);
1336 if (!--worker->idle_rebind->cnt)
1337 complete(&worker->idle_rebind->done);
1338 spin_unlock_irq(&worker->pool->gcwq->lock);
1339} 1655}
1340 1656
1341/* 1657/*
@@ -1349,16 +1665,8 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1349 struct worker *worker = container_of(work, struct worker, rebind_work); 1665 struct worker *worker = container_of(work, struct worker, rebind_work);
1350 struct global_cwq *gcwq = worker->pool->gcwq; 1666 struct global_cwq *gcwq = worker->pool->gcwq;
1351 1667
1352 worker_maybe_bind_and_lock(worker); 1668 if (worker_maybe_bind_and_lock(worker))
1353 1669 worker_clr_flags(worker, WORKER_UNBOUND);
1354 /*
1355 * %WORKER_REBIND must be cleared even if the above binding failed;
1356 * otherwise, we may confuse the next CPU_UP cycle or oops / get
1357 * stuck by calling idle_worker_rebind() prematurely. If CPU went
1358 * down again in between, %WORKER_UNBOUND would be set, so clearing
1359 * %WORKER_REBIND is always safe.
1360 */
1361 worker_clr_flags(worker, WORKER_REBIND);
1362 1670
1363 spin_unlock_irq(&gcwq->lock); 1671 spin_unlock_irq(&gcwq->lock);
1364} 1672}
@@ -1370,123 +1678,74 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1370 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding 1678 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
1371 * is different for idle and busy ones. 1679 * is different for idle and busy ones.
1372 * 1680 *
1373 * The idle ones should be rebound synchronously and idle rebinding should 1681 * Idle ones will be removed from the idle_list and woken up. They will
1374 * be complete before any worker starts executing work items with 1682 * add themselves back after completing rebind. This ensures that the
1375 * concurrency management enabled; otherwise, scheduler may oops trying to 1683 * idle_list doesn't contain any unbound workers when re-bound busy workers
1376 * wake up non-local idle worker from wq_worker_sleeping(). 1684 * try to perform local wake-ups for concurrency management.
1377 * 1685 *
1378 * This is achieved by repeatedly requesting rebinding until all idle 1686 * Busy workers can rebind after they finish their current work items.
1379 * workers are known to have been rebound under @gcwq->lock and holding all 1687 * Queueing the rebind work item at the head of the scheduled list is
1380 * idle workers from becoming busy until idle rebinding is complete. 1688 * enough. Note that nr_running will be properly bumped as busy workers
1689 * rebind.
1381 * 1690 *
1382 * Once idle workers are rebound, busy workers can be rebound as they 1691 * On return, all non-manager workers are scheduled for rebind - see
1383 * finish executing their current work items. Queueing the rebind work at 1692 * manage_workers() for the manager special case. Any idle worker
1384 * the head of their scheduled lists is enough. Note that nr_running will 1693 * including the manager will not appear on @idle_list until rebind is
1385 * be properly bumped as busy workers rebind. 1689 * rebind.
1386 *
1387 * On return, all workers are guaranteed to either be bound or have rebind
1388 * work item scheduled.
1389 */ 1695 */
1390static void rebind_workers(struct global_cwq *gcwq) 1696static void rebind_workers(struct global_cwq *gcwq)
1391 __releases(&gcwq->lock) __acquires(&gcwq->lock)
1392{ 1697{
1393 struct idle_rebind idle_rebind;
1394 struct worker_pool *pool; 1698 struct worker_pool *pool;
1395 struct worker *worker; 1699 struct worker *worker, *n;
1396 struct hlist_node *pos; 1700 struct hlist_node *pos;
1397 int i; 1701 int i;
1398 1702
1399 lockdep_assert_held(&gcwq->lock); 1703 lockdep_assert_held(&gcwq->lock);
1400 1704
1401 for_each_worker_pool(pool, gcwq) 1705 for_each_worker_pool(pool, gcwq)
1402 lockdep_assert_held(&pool->manager_mutex); 1706 lockdep_assert_held(&pool->assoc_mutex);
1403 1707
1404 /* 1708 /* dequeue and kick idle ones */
1405 * Rebind idle workers. Interlocked both ways. We wait for
1406 * workers to rebind via @idle_rebind.done. Workers will wait for
1407 * us to finish up by watching %WORKER_REBIND.
1408 */
1409 init_completion(&idle_rebind.done);
1410retry:
1411 idle_rebind.cnt = 1;
1412 INIT_COMPLETION(idle_rebind.done);
1413
1414 /* set REBIND and kick idle ones, we'll wait for these later */
1415 for_each_worker_pool(pool, gcwq) { 1709 for_each_worker_pool(pool, gcwq) {
1416 list_for_each_entry(worker, &pool->idle_list, entry) { 1710 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1417 unsigned long worker_flags = worker->flags; 1711 /*
1418 1712 * idle workers should be off @pool->idle_list
1419 if (worker->flags & WORKER_REBIND) 1713 * until rebind is complete to avoid receiving
1420 continue; 1714 * premature local wake-ups.
1421 1715 */
1422 /* morph UNBOUND to REBIND atomically */ 1716 list_del_init(&worker->entry);
1423 worker_flags &= ~WORKER_UNBOUND;
1424 worker_flags |= WORKER_REBIND;
1425 ACCESS_ONCE(worker->flags) = worker_flags;
1426
1427 idle_rebind.cnt++;
1428 worker->idle_rebind = &idle_rebind;
1429 1717
1430 /* worker_thread() will call idle_worker_rebind() */ 1718 /*
1719 * worker_thread() will see the above dequeuing
1720 * and call idle_worker_rebind().
1721 */
1431 wake_up_process(worker->task); 1722 wake_up_process(worker->task);
1432 } 1723 }
1433 } 1724 }
1434 1725
1435 if (--idle_rebind.cnt) { 1726 /* rebind busy workers */
1436 spin_unlock_irq(&gcwq->lock);
1437 wait_for_completion(&idle_rebind.done);
1438 spin_lock_irq(&gcwq->lock);
1439 /* busy ones might have become idle while waiting, retry */
1440 goto retry;
1441 }
1442
1443 /* all idle workers are rebound, rebind busy workers */
1444 for_each_busy_worker(worker, i, pos, gcwq) { 1727 for_each_busy_worker(worker, i, pos, gcwq) {
1445 struct work_struct *rebind_work = &worker->rebind_work; 1728 struct work_struct *rebind_work = &worker->rebind_work;
1446 unsigned long worker_flags = worker->flags; 1729 struct workqueue_struct *wq;
1447
1448 /* morph UNBOUND to REBIND atomically */
1449 worker_flags &= ~WORKER_UNBOUND;
1450 worker_flags |= WORKER_REBIND;
1451 ACCESS_ONCE(worker->flags) = worker_flags;
1452 1730
1453 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, 1731 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1454 work_data_bits(rebind_work))) 1732 work_data_bits(rebind_work)))
1455 continue; 1733 continue;
1456 1734
1457 /* wq doesn't matter, use the default one */
1458 debug_work_activate(rebind_work); 1735 debug_work_activate(rebind_work);
1459 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
1460 worker->scheduled.next,
1461 work_color_to_flags(WORK_NO_COLOR));
1462 }
1463
1464 /*
1465 * All idle workers are rebound and waiting for %WORKER_REBIND to
1466 * be cleared inside idle_worker_rebind(). Clear and release.
1467 * Clearing %WORKER_REBIND from this foreign context is safe
1468 * because these workers are still guaranteed to be idle.
1469 *
1470 * We need to make sure all idle workers passed WORKER_REBIND wait
1471 * in idle_worker_rebind() before returning; otherwise, workers can
1472 * get stuck at the wait if hotplug cycle repeats.
1473 */
1474 idle_rebind.cnt = 1;
1475 INIT_COMPLETION(idle_rebind.done);
1476 1736
1477 for_each_worker_pool(pool, gcwq) { 1737 /*
1478 list_for_each_entry(worker, &pool->idle_list, entry) { 1738 * wq doesn't really matter but let's keep @worker->pool
1479 worker->flags &= ~WORKER_REBIND; 1739 * and @cwq->pool consistent for sanity.
1480 idle_rebind.cnt++; 1740 */
1481 } 1741 if (worker_pool_pri(worker->pool))
1482 } 1742 wq = system_highpri_wq;
1483 1743 else
1484 wake_up_all(&gcwq->rebind_hold); 1744 wq = system_wq;
1485 1745
1486 if (--idle_rebind.cnt) { 1746 insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
1487 spin_unlock_irq(&gcwq->lock); 1747 worker->scheduled.next,
1488 wait_for_completion(&idle_rebind.done); 1748 work_color_to_flags(WORK_NO_COLOR));
1489 spin_lock_irq(&gcwq->lock);
1490 } 1749 }
1491} 1750}
1492 1751
@@ -1844,22 +2103,22 @@ static bool manage_workers(struct worker *worker)
1844 * grab %POOL_MANAGING_WORKERS to achieve this because that can 2103 * grab %POOL_MANAGING_WORKERS to achieve this because that can
1845 * lead to idle worker depletion (all become busy thinking someone 2104 * lead to idle worker depletion (all become busy thinking someone
1846 * else is managing) which in turn can result in deadlock under 2105 * else is managing) which in turn can result in deadlock under
1847 * extreme circumstances. Use @pool->manager_mutex to synchronize 2106 * extreme circumstances. Use @pool->assoc_mutex to synchronize
1848 * manager against CPU hotplug. 2107 * manager against CPU hotplug.
1849 * 2108 *
1850 * manager_mutex would always be free unless CPU hotplug is in 2109 * assoc_mutex would always be free unless CPU hotplug is in
1851 * progress. trylock first without dropping @gcwq->lock. 2110 * progress. trylock first without dropping @gcwq->lock.
1852 */ 2111 */
1853 if (unlikely(!mutex_trylock(&pool->manager_mutex))) { 2112 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
1854 spin_unlock_irq(&pool->gcwq->lock); 2113 spin_unlock_irq(&pool->gcwq->lock);
1855 mutex_lock(&pool->manager_mutex); 2114 mutex_lock(&pool->assoc_mutex);
1856 /* 2115 /*
1857 * CPU hotplug could have happened while we were waiting 2116 * CPU hotplug could have happened while we were waiting
1858 * for manager_mutex. Hotplug itself can't handle us 2117 * for assoc_mutex. Hotplug itself can't handle us
1859 * because manager isn't either on idle or busy list, and 2118 * because manager isn't either on idle or busy list, and
1860 * @gcwq's state and ours could have deviated. 2119 * @gcwq's state and ours could have deviated.
1861 * 2120 *
1862 * As hotplug is now excluded via manager_mutex, we can 2121 * As hotplug is now excluded via assoc_mutex, we can
1863 * simply try to bind. It will succeed or fail depending 2122 * simply try to bind. It will succeed or fail depending
1864 * on @gcwq's current state. Try it and adjust 2123 * on @gcwq's current state. Try it and adjust
1865 * %WORKER_UNBOUND accordingly. 2124 * %WORKER_UNBOUND accordingly.
@@ -1882,112 +2141,11 @@ static bool manage_workers(struct worker *worker)
1882 ret |= maybe_create_worker(pool); 2141 ret |= maybe_create_worker(pool);
1883 2142
1884 pool->flags &= ~POOL_MANAGING_WORKERS; 2143 pool->flags &= ~POOL_MANAGING_WORKERS;
1885 mutex_unlock(&pool->manager_mutex); 2144 mutex_unlock(&pool->assoc_mutex);
1886 return ret; 2145 return ret;
1887} 2146}
1888 2147
1889/** 2148/**
1890 * move_linked_works - move linked works to a list
1891 * @work: start of series of works to be scheduled
1892 * @head: target list to append @work to
1893 * @nextp: out parameter for nested worklist walking
1894 *
1895 * Schedule linked works starting from @work to @head. Work series to
1896 * be scheduled starts at @work and includes any consecutive work with
1897 * WORK_STRUCT_LINKED set in its predecessor.
1898 *
1899 * If @nextp is not NULL, it's updated to point to the next work of
1900 * the last scheduled work. This allows move_linked_works() to be
1901 * nested inside outer list_for_each_entry_safe().
1902 *
1903 * CONTEXT:
1904 * spin_lock_irq(gcwq->lock).
1905 */
1906static void move_linked_works(struct work_struct *work, struct list_head *head,
1907 struct work_struct **nextp)
1908{
1909 struct work_struct *n;
1910
1911 /*
1912 * Linked worklist will always end before the end of the list,
1913 * use NULL for list head.
1914 */
1915 list_for_each_entry_safe_from(work, n, NULL, entry) {
1916 list_move_tail(&work->entry, head);
1917 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1918 break;
1919 }
1920
1921 /*
1922 * If we're already inside safe list traversal and have moved
1923 * multiple works to the scheduled queue, the next position
1924 * needs to be updated.
1925 */
1926 if (nextp)
1927 *nextp = n;
1928}
1929
1930static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1931{
1932 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1933 struct work_struct, entry);
1934
1935 trace_workqueue_activate_work(work);
1936 move_linked_works(work, &cwq->pool->worklist, NULL);
1937 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1938 cwq->nr_active++;
1939}
1940
1941/**
1942 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1943 * @cwq: cwq of interest
1944 * @color: color of work which left the queue
1945 * @delayed: for a delayed work
1946 *
1947 * A work either has completed or is removed from pending queue,
1948 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1949 *
1950 * CONTEXT:
1951 * spin_lock_irq(gcwq->lock).
1952 */
1953static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1954 bool delayed)
1955{
1956 /* ignore uncolored works */
1957 if (color == WORK_NO_COLOR)
1958 return;
1959
1960 cwq->nr_in_flight[color]--;
1961
1962 if (!delayed) {
1963 cwq->nr_active--;
1964 if (!list_empty(&cwq->delayed_works)) {
1965 /* one down, submit a delayed one */
1966 if (cwq->nr_active < cwq->max_active)
1967 cwq_activate_first_delayed(cwq);
1968 }
1969 }
1970
1971 /* is flush in progress and are we at the flushing tip? */
1972 if (likely(cwq->flush_color != color))
1973 return;
1974
1975 /* are there still in-flight works? */
1976 if (cwq->nr_in_flight[color])
1977 return;
1978
1979 /* this cwq is done, clear flush_color */
1980 cwq->flush_color = -1;
1981
1982 /*
1983 * If this was the last cwq, wake up the first flusher. It
1984 * will handle the rest.
1985 */
1986 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1987 complete(&cwq->wq->first_flusher->done);
1988}
1989
1990/**
1991 * process_one_work - process single work 2149 * process_one_work - process single work
1992 * @worker: self 2150 * @worker: self
1993 * @work: work to process 2151 * @work: work to process
@@ -2030,7 +2188,7 @@ __acquires(&gcwq->lock)
2030 * necessary to avoid spurious warnings from rescuers servicing the 2188 * necessary to avoid spurious warnings from rescuers servicing the
2031 * unbound or a disassociated gcwq. 2189 * unbound or a disassociated gcwq.
2032 */ 2190 */
2033 WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && 2191 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2034 !(gcwq->flags & GCWQ_DISASSOCIATED) && 2192 !(gcwq->flags & GCWQ_DISASSOCIATED) &&
2035 raw_smp_processor_id() != gcwq->cpu); 2193 raw_smp_processor_id() != gcwq->cpu);
2036 2194
@@ -2046,15 +2204,13 @@ __acquires(&gcwq->lock)
2046 return; 2204 return;
2047 } 2205 }
2048 2206
2049 /* claim and process */ 2207 /* claim and dequeue */
2050 debug_work_deactivate(work); 2208 debug_work_deactivate(work);
2051 hlist_add_head(&worker->hentry, bwh); 2209 hlist_add_head(&worker->hentry, bwh);
2052 worker->current_work = work; 2210 worker->current_work = work;
2053 worker->current_cwq = cwq; 2211 worker->current_cwq = cwq;
2054 work_color = get_work_color(work); 2212 work_color = get_work_color(work);
2055 2213
2056 /* record the current cpu number in the work data and dequeue */
2057 set_work_cpu(work, gcwq->cpu);
2058 list_del_init(&work->entry); 2214 list_del_init(&work->entry);
2059 2215
2060 /* 2216 /*
@@ -2071,9 +2227,16 @@ __acquires(&gcwq->lock)
2071 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 2227 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2072 wake_up_worker(pool); 2228 wake_up_worker(pool);
2073 2229
2230 /*
2231 * Record the last CPU and clear PENDING which should be the last
2232 * update to @work. Also, do this inside @gcwq->lock so that
2233 * PENDING and queued state changes happen together while IRQ is
2234 * disabled.
2235 */
2236 set_work_cpu_and_clear_pending(work, gcwq->cpu);
2237
2074 spin_unlock_irq(&gcwq->lock); 2238 spin_unlock_irq(&gcwq->lock);
2075 2239
2076 work_clear_pending(work);
2077 lock_map_acquire_read(&cwq->wq->lockdep_map); 2240 lock_map_acquire_read(&cwq->wq->lockdep_map);
2078 lock_map_acquire(&lockdep_map); 2241 lock_map_acquire(&lockdep_map);
2079 trace_workqueue_execute_start(work); 2242 trace_workqueue_execute_start(work);
@@ -2087,11 +2250,9 @@ __acquires(&gcwq->lock)
2087 lock_map_release(&cwq->wq->lockdep_map); 2250 lock_map_release(&cwq->wq->lockdep_map);
2088 2251
2089 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2252 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2090 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 2253 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2091 "%s/0x%08x/%d\n", 2254 " last function: %pf\n",
2092 current->comm, preempt_count(), task_pid_nr(current)); 2255 current->comm, preempt_count(), task_pid_nr(current), f);
2093 printk(KERN_ERR " last function: ");
2094 print_symbol("%s\n", (unsigned long)f);
2095 debug_show_held_locks(current); 2256 debug_show_held_locks(current);
2096 dump_stack(); 2257 dump_stack();
2097 } 2258 }
@@ -2106,7 +2267,7 @@ __acquires(&gcwq->lock)
2106 hlist_del_init(&worker->hentry); 2267 hlist_del_init(&worker->hentry);
2107 worker->current_work = NULL; 2268 worker->current_work = NULL;
2108 worker->current_cwq = NULL; 2269 worker->current_cwq = NULL;
2109 cwq_dec_nr_in_flight(cwq, work_color, false); 2270 cwq_dec_nr_in_flight(cwq, work_color);
2110} 2271}
2111 2272
2112/** 2273/**
@@ -2151,18 +2312,17 @@ static int worker_thread(void *__worker)
2151woke_up: 2312woke_up:
2152 spin_lock_irq(&gcwq->lock); 2313 spin_lock_irq(&gcwq->lock);
2153 2314
2154 /* 2315 /* we are off idle list if destruction or rebind is requested */
2155 * DIE can be set only while idle and REBIND set while busy has 2316 if (unlikely(list_empty(&worker->entry))) {
2156 * @worker->rebind_work scheduled. Checking here is enough.
2157 */
2158 if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
2159 spin_unlock_irq(&gcwq->lock); 2317 spin_unlock_irq(&gcwq->lock);
2160 2318
2319 /* if DIE is set, destruction is requested */
2161 if (worker->flags & WORKER_DIE) { 2320 if (worker->flags & WORKER_DIE) {
2162 worker->task->flags &= ~PF_WQ_WORKER; 2321 worker->task->flags &= ~PF_WQ_WORKER;
2163 return 0; 2322 return 0;
2164 } 2323 }
2165 2324
2325 /* otherwise, rebind */
2166 idle_worker_rebind(worker); 2326 idle_worker_rebind(worker);
2167 goto woke_up; 2327 goto woke_up;
2168 } 2328 }
@@ -2257,8 +2417,10 @@ static int rescuer_thread(void *__wq)
2257repeat: 2417repeat:
2258 set_current_state(TASK_INTERRUPTIBLE); 2418 set_current_state(TASK_INTERRUPTIBLE);
2259 2419
2260 if (kthread_should_stop()) 2420 if (kthread_should_stop()) {
2421 __set_current_state(TASK_RUNNING);
2261 return 0; 2422 return 0;
2423 }
2262 2424
2263 /* 2425 /*
2264 * See whether any cpu is asking for help. Unbounded 2426 * See whether any cpu is asking for help. Unbounded
@@ -2645,8 +2807,8 @@ reflush:
2645 2807
2646 if (++flush_cnt == 10 || 2808 if (++flush_cnt == 10 ||
2647 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 2809 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2648 pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", 2810 pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n",
2649 wq->name, flush_cnt); 2811 wq->name, flush_cnt);
2650 goto reflush; 2812 goto reflush;
2651 } 2813 }
2652 2814
@@ -2657,8 +2819,7 @@ reflush:
2657} 2819}
2658EXPORT_SYMBOL_GPL(drain_workqueue); 2820EXPORT_SYMBOL_GPL(drain_workqueue);
2659 2821
2660static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 2822static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2661 bool wait_executing)
2662{ 2823{
2663 struct worker *worker = NULL; 2824 struct worker *worker = NULL;
2664 struct global_cwq *gcwq; 2825 struct global_cwq *gcwq;
@@ -2680,13 +2841,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2680 cwq = get_work_cwq(work); 2841 cwq = get_work_cwq(work);
2681 if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) 2842 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2682 goto already_gone; 2843 goto already_gone;
2683 } else if (wait_executing) { 2844 } else {
2684 worker = find_worker_executing_work(gcwq, work); 2845 worker = find_worker_executing_work(gcwq, work);
2685 if (!worker) 2846 if (!worker)
2686 goto already_gone; 2847 goto already_gone;
2687 cwq = worker->current_cwq; 2848 cwq = worker->current_cwq;
2688 } else 2849 }
2689 goto already_gone;
2690 2850
2691 insert_wq_barrier(cwq, barr, work, worker); 2851 insert_wq_barrier(cwq, barr, work, worker);
2692 spin_unlock_irq(&gcwq->lock); 2852 spin_unlock_irq(&gcwq->lock);
@@ -2713,15 +2873,8 @@ already_gone:
2713 * flush_work - wait for a work to finish executing the last queueing instance 2873 * flush_work - wait for a work to finish executing the last queueing instance
2714 * @work: the work to flush 2874 * @work: the work to flush
2715 * 2875 *
2716 * Wait until @work has finished execution. This function considers 2876 * Wait until @work has finished execution. @work is guaranteed to be idle
2717 * only the last queueing instance of @work. If @work has been 2877 * on return if it hasn't been requeued since flush started.
2718 * enqueued across different CPUs on a non-reentrant workqueue or on
2719 * multiple workqueues, @work might still be executing on return on
2720 * some of the CPUs from earlier queueing.
2721 *
2722 * If @work was queued only on a non-reentrant, ordered or unbound
2723 * workqueue, @work is guaranteed to be idle on return if it hasn't
2724 * been requeued since flush started.
2725 * 2878 *
2726 * RETURNS: 2879 * RETURNS:
2727 * %true if flush_work() waited for the work to finish execution, 2880 * %true if flush_work() waited for the work to finish execution,
@@ -2734,140 +2887,36 @@ bool flush_work(struct work_struct *work)
 	lock_map_acquire(&work->lockdep_map);
 	lock_map_release(&work->lockdep_map);
 
-	if (start_flush_work(work, &barr, true)) {
+	if (start_flush_work(work, &barr)) {
 		wait_for_completion(&barr.done);
 		destroy_work_on_stack(&barr.work);
 		return true;
-	} else
-		return false;
-}
-EXPORT_SYMBOL_GPL(flush_work);
-
-static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
-{
-	struct wq_barrier barr;
-	struct worker *worker;
-
-	spin_lock_irq(&gcwq->lock);
-
-	worker = find_worker_executing_work(gcwq, work);
-	if (unlikely(worker))
-		insert_wq_barrier(worker->current_cwq, &barr, work, worker);
-
-	spin_unlock_irq(&gcwq->lock);
-
-	if (unlikely(worker)) {
-		wait_for_completion(&barr.done);
-		destroy_work_on_stack(&barr.work);
-		return true;
-	} else
+	} else {
 		return false;
-}
-
-static bool wait_on_work(struct work_struct *work)
-{
-	bool ret = false;
-	int cpu;
-
-	might_sleep();
-
-	lock_map_acquire(&work->lockdep_map);
-	lock_map_release(&work->lockdep_map);
-
-	for_each_gcwq_cpu(cpu)
-		ret |= wait_on_cpu_work(get_gcwq(cpu), work);
-	return ret;
-}
-
-/**
- * flush_work_sync - wait until a work has finished execution
- * @work: the work to flush
- *
- * Wait until @work has finished execution. On return, it's
- * guaranteed that all queueing instances of @work which happened
- * before this function is called are finished. In other words, if
- * @work hasn't been requeued since this function was called, @work is
- * guaranteed to be idle on return.
- *
- * RETURNS:
- * %true if flush_work_sync() waited for the work to finish execution,
- * %false if it was already idle.
- */
-bool flush_work_sync(struct work_struct *work)
-{
-	struct wq_barrier barr;
-	bool pending, waited;
-
-	/* we'll wait for executions separately, queue barr only if pending */
-	pending = start_flush_work(work, &barr, false);
-
-	/* wait for executions to finish */
-	waited = wait_on_work(work);
-
-	/* wait for the pending one */
-	if (pending) {
-		wait_for_completion(&barr.done);
-		destroy_work_on_stack(&barr.work);
-	}
-
-	return pending || waited;
-}
-EXPORT_SYMBOL_GPL(flush_work_sync);
-
-/*
- * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
- * so this work can't be re-armed in any way.
- */
-static int try_to_grab_pending(struct work_struct *work)
-{
-	struct global_cwq *gcwq;
-	int ret = -1;
-
-	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
-		return 0;
-
-	/*
-	 * The queueing is in progress, or it is already queued. Try to
-	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
-	 */
-	gcwq = get_work_gcwq(work);
-	if (!gcwq)
-		return ret;
-
-	spin_lock_irq(&gcwq->lock);
-	if (!list_empty(&work->entry)) {
-		/*
-		 * This work is queued, but perhaps we locked the wrong gcwq.
-		 * In that case we must see the new value after rmb(), see
-		 * insert_work()->wmb().
-		 */
-		smp_rmb();
-		if (gcwq == get_work_gcwq(work)) {
-			debug_work_deactivate(work);
-			list_del_init(&work->entry);
-			cwq_dec_nr_in_flight(get_work_cwq(work),
-				get_work_color(work),
-				*work_data_bits(work) & WORK_STRUCT_DELAYED);
-			ret = 1;
-		}
 	}
-	spin_unlock_irq(&gcwq->lock);
-
-	return ret;
 }
+EXPORT_SYMBOL_GPL(flush_work);
 
-static bool __cancel_work_timer(struct work_struct *work,
-				struct timer_list* timer)
+static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 {
+	unsigned long flags;
 	int ret;
 
 	do {
-		ret = (timer && likely(del_timer(timer)));
-		if (!ret)
-			ret = try_to_grab_pending(work);
-		wait_on_work(work);
+		ret = try_to_grab_pending(work, is_dwork, &flags);
+		/*
+		 * If someone else is canceling, wait for the same event it
+		 * would be waiting for before retrying.
+		 */
+		if (unlikely(ret == -ENOENT))
+			flush_work(work);
 	} while (unlikely(ret < 0));
 
+	/* tell other tasks trying to grab @work to back off */
+	mark_work_canceling(work);
+	local_irq_restore(flags);
+
+	flush_work(work);
 	clear_work_data(work);
 	return ret;
 }
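The reworked __cancel_work_timer() above backs both cancel_work_sync() and cancel_delayed_work_sync(). A minimal sketch of the caller-visible behaviour, assuming a hypothetical self-rearming poller (illustration only, not part of this patch):

/* Sketch: stopping a self-rearming delayed work (hypothetical names). */
#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void mydrv_poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(mydrv_poll_work, mydrv_poll_fn);

static void mydrv_poll_fn(struct work_struct *work)
{
	/* ... poll the hardware ... */
	schedule_delayed_work(&mydrv_poll_work, msecs_to_jiffies(500));
}

static void mydrv_stop(void)
{
	/*
	 * Grabs the pending state (retrying if another cancel is in
	 * flight), marks the work canceling and flushes it; on return the
	 * work is neither pending nor running, even though the callback
	 * re-arms itself.
	 */
	cancel_delayed_work_sync(&mydrv_poll_work);
}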
@@ -2892,7 +2941,7 @@ static bool __cancel_work_timer(struct work_struct *work,
  */
 bool cancel_work_sync(struct work_struct *work)
 {
-	return __cancel_work_timer(work, NULL);
+	return __cancel_work_timer(work, false);
 }
 EXPORT_SYMBOL_GPL(cancel_work_sync);
 
@@ -2910,33 +2959,44 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
  */
 bool flush_delayed_work(struct delayed_work *dwork)
 {
+	local_irq_disable();
 	if (del_timer_sync(&dwork->timer))
-		__queue_work(raw_smp_processor_id(),
+		__queue_work(dwork->cpu,
 			     get_work_cwq(&dwork->work)->wq, &dwork->work);
+	local_irq_enable();
 	return flush_work(&dwork->work);
 }
 EXPORT_SYMBOL(flush_delayed_work);
 
 /**
- * flush_delayed_work_sync - wait for a dwork to finish
- * @dwork: the delayed work to flush
+ * cancel_delayed_work - cancel a delayed work
+ * @dwork: delayed_work to cancel
  *
- * Delayed timer is cancelled and the pending work is queued for
- * execution immediately. Other than timer handling, its behavior
- * is identical to flush_work_sync().
+ * Kill off a pending delayed_work. Returns %true if @dwork was pending
+ * and canceled; %false if wasn't pending. Note that the work callback
+ * function may still be running on return, unless it returns %true and the
+ * work doesn't re-arm itself. Explicitly flush or use
+ * cancel_delayed_work_sync() to wait on it.
  *
- * RETURNS:
- * %true if flush_work_sync() waited for the work to finish execution,
- * %false if it was already idle.
+ * This function is safe to call from any context including IRQ handler.
  */
-bool flush_delayed_work_sync(struct delayed_work *dwork)
+bool cancel_delayed_work(struct delayed_work *dwork)
 {
-	if (del_timer_sync(&dwork->timer))
-		__queue_work(raw_smp_processor_id(),
-			     get_work_cwq(&dwork->work)->wq, &dwork->work);
-	return flush_work_sync(&dwork->work);
+	unsigned long flags;
+	int ret;
+
+	do {
+		ret = try_to_grab_pending(&dwork->work, true, &flags);
+	} while (unlikely(ret == -EAGAIN));
+
+	if (unlikely(ret < 0))
+		return false;
+
+	set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
+	local_irq_restore(flags);
+	return ret;
 }
-EXPORT_SYMBOL(flush_delayed_work_sync);
+EXPORT_SYMBOL(cancel_delayed_work);
 
 /**
  * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
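Since the replacement cancel_delayed_work() is documented above as safe from any context, a hypothetical IRQ-handler sketch (illustration only, not part of this patch):

/* Sketch: cancel_delayed_work() from hard-IRQ context (hypothetical names). */
#include <linux/interrupt.h>
#include <linux/workqueue.h>

static void mydrv_timeout_fn(struct work_struct *work)
{
	/* ... no interrupt arrived in time ... */
}

static DECLARE_DELAYED_WORK(mydrv_timeout_work, mydrv_timeout_fn);

static irqreturn_t mydrv_irq(int irq, void *dev_id)
{
	/*
	 * Only the pending timer/queue state is cancelled; the callback
	 * may still be running, which is fine for a watchdog-style use.
	 */
	cancel_delayed_work(&mydrv_timeout_work);
	return IRQ_HANDLED;
}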
@@ -2949,54 +3009,39 @@ EXPORT_SYMBOL(flush_delayed_work_sync);
  */
 bool cancel_delayed_work_sync(struct delayed_work *dwork)
 {
-	return __cancel_work_timer(&dwork->work, &dwork->timer);
+	return __cancel_work_timer(&dwork->work, true);
 }
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
 /**
- * schedule_work - put work task in global workqueue
- * @work: job to be done
- *
- * Returns zero if @work was already on the kernel-global workqueue and
- * non-zero otherwise.
- *
- * This puts a job in the kernel-global workqueue if it was not already
- * queued and leaves it in the same position on the kernel-global
- * workqueue otherwise.
- */
-int schedule_work(struct work_struct *work)
-{
-	return queue_work(system_wq, work);
-}
-EXPORT_SYMBOL(schedule_work);
-
-/*
  * schedule_work_on - put work task on a specific cpu
  * @cpu: cpu to put the work task on
  * @work: job to be done
  *
  * This puts a job on a specific cpu
  */
-int schedule_work_on(int cpu, struct work_struct *work)
+bool schedule_work_on(int cpu, struct work_struct *work)
 {
 	return queue_work_on(cpu, system_wq, work);
 }
 EXPORT_SYMBOL(schedule_work_on);
 
 /**
- * schedule_delayed_work - put work task in global workqueue after delay
- * @dwork: job to be done
- * @delay: number of jiffies to wait or 0 for immediate execution
+ * schedule_work - put work task in global workqueue
+ * @work: job to be done
  *
- * After waiting for a given time this puts a job in the kernel-global
- * workqueue.
+ * Returns %false if @work was already on the kernel-global workqueue and
+ * %true otherwise.
+ *
+ * This puts a job in the kernel-global workqueue if it was not already
+ * queued and leaves it in the same position on the kernel-global
+ * workqueue otherwise.
  */
-int schedule_delayed_work(struct delayed_work *dwork,
-			  unsigned long delay)
+bool schedule_work(struct work_struct *work)
 {
-	return queue_delayed_work(system_wq, dwork, delay);
+	return queue_work(system_wq, work);
 }
-EXPORT_SYMBOL(schedule_delayed_work);
+EXPORT_SYMBOL(schedule_work);
 
 /**
  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
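The schedule_work()/schedule_work_on() return type change from int to bool is caller-visible; a minimal sketch with hypothetical names (not part of this patch):

/* Sketch: bool return of schedule_work() (hypothetical names). */
#include <linux/printk.h>
#include <linux/workqueue.h>

static void mydrv_event_fn(struct work_struct *work)
{
	/* ... drain a software event queue ... */
}

static DECLARE_WORK(mydrv_event_work, mydrv_event_fn);

static void mydrv_kick(void)
{
	/* %false now means the work was already pending; no double queueing. */
	if (!schedule_work(&mydrv_event_work))
		pr_debug("mydrv: event work already pending\n");
}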
@@ -3007,14 +3052,28 @@ EXPORT_SYMBOL(schedule_delayed_work);
  * After waiting for a given time this puts a job in the kernel-global
  * workqueue on the specified CPU.
  */
-int schedule_delayed_work_on(int cpu,
-			struct delayed_work *dwork, unsigned long delay)
+bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
+			      unsigned long delay)
 {
 	return queue_delayed_work_on(cpu, system_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work_on);
 
 /**
+ * schedule_delayed_work - put work task in global workqueue after delay
+ * @dwork: job to be done
+ * @delay: number of jiffies to wait or 0 for immediate execution
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue.
+ */
+bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
+{
+	return queue_delayed_work(system_wq, dwork, delay);
+}
+EXPORT_SYMBOL(schedule_delayed_work);
+
+/**
  * schedule_on_each_cpu - execute a function synchronously on each online CPU
  * @func: the function to call
  *
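schedule_delayed_work_on() keeps its semantics and only changes return type; a hypothetical per-CPU sketch (not part of this patch):

/* Sketch: pinning a delayed work to one CPU (hypothetical names). */
#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void mydrv_percpu_fn(struct work_struct *work)
{
	/* runs on the CPU it was queued on (barring CPU hotplug) */
}

static DECLARE_DELAYED_WORK(mydrv_percpu_work, mydrv_percpu_fn);

static void mydrv_arm_on(int cpu)
{
	/* same as schedule_delayed_work(), but on a specific CPU */
	schedule_delayed_work_on(cpu, &mydrv_percpu_work, msecs_to_jiffies(100));
}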
@@ -3161,9 +3220,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
 	int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
 
 	if (max_active < 1 || max_active > lim)
-		printk(KERN_WARNING "workqueue: max_active %d requested for %s "
-		       "is out of range, clamping between %d and %d\n",
-		       max_active, name, 1, lim);
+		pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
+			max_active, name, 1, lim);
 
 	return clamp_val(max_active, 1, lim);
 }
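The clamping warned about above is triggered by out-of-range max_active values passed to alloc_workqueue(); a hypothetical sketch (not part of this patch):

/* Sketch: an out-of-range max_active request gets clamped (hypothetical). */
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *mydrv_wq;

static int mydrv_create_wq(void)
{
	/*
	 * 10000 exceeds the limit, so the pr_warn() above fires and the
	 * value is clamped into the valid range.
	 */
	mydrv_wq = alloc_workqueue("mydrv", 0, 10000);
	return mydrv_wq ? 0 : -ENOMEM;
}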
@@ -3319,6 +3377,26 @@ void destroy_workqueue(struct workqueue_struct *wq)
 EXPORT_SYMBOL_GPL(destroy_workqueue);
 
 /**
+ * cwq_set_max_active - adjust max_active of a cwq
+ * @cwq: target cpu_workqueue_struct
+ * @max_active: new max_active value.
+ *
+ * Set @cwq->max_active to @max_active and activate delayed works if
+ * increased.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active)
+{
+	cwq->max_active = max_active;
+
+	while (!list_empty(&cwq->delayed_works) &&
+	       cwq->nr_active < cwq->max_active)
+		cwq_activate_first_delayed(cwq);
+}
+
+/**
  * workqueue_set_max_active - adjust max_active of a workqueue
  * @wq: target workqueue
  * @max_active: new max_active value.
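workqueue_set_max_active() is the external entry point for the new cwq_set_max_active() helper; a hypothetical runtime-tuning sketch (not part of this patch):

/* Sketch: runtime tuning through workqueue_set_max_active() (hypothetical). */
#include <linux/workqueue.h>

static struct workqueue_struct *mydrv_io_wq;

static void mydrv_set_inflight(int nr_inflight)
{
	/*
	 * Raising max_active now also activates works that were held on
	 * the delayed list, thanks to cwq_set_max_active() above.
	 */
	workqueue_set_max_active(mydrv_io_wq, nr_inflight);
}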
@@ -3345,7 +3423,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
 
 		if (!(wq->flags & WQ_FREEZABLE) ||
 		    !(gcwq->flags & GCWQ_FREEZING))
-			get_cwq(gcwq->cpu, wq)->max_active = max_active;
+			cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active);
 
 		spin_unlock_irq(&gcwq->lock);
 	}
@@ -3409,7 +3487,7 @@ unsigned int work_busy(struct work_struct *work)
 	unsigned int ret = 0;
 
 	if (!gcwq)
-		return false;
+		return 0;
 
 	spin_lock_irqsave(&gcwq->lock, flags);
 
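With the return-value fix above, work_busy() consistently returns 0 for an idle work; a hypothetical inspection sketch (not part of this patch):

/* Sketch: inspecting a work item with work_busy() (hypothetical names). */
#include <linux/printk.h>
#include <linux/workqueue.h>

static void mydrv_report(struct work_struct *work)
{
	unsigned int busy = work_busy(work);	/* 0 when idle */

	pr_info("mydrv: work is %s%s%s\n",
		busy & WORK_BUSY_PENDING ? "pending " : "",
		busy & WORK_BUSY_RUNNING ? "running" : "",
		busy ? "" : "idle");
}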
@@ -3440,23 +3518,23 @@ EXPORT_SYMBOL_GPL(work_busy);
  */
 
 /* claim manager positions of all pools */
-static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
+static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
 {
 	struct worker_pool *pool;
 
 	for_each_worker_pool(pool, gcwq)
-		mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
+		mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
 	spin_lock_irq(&gcwq->lock);
 }
 
 /* release manager positions */
-static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
+static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
 {
 	struct worker_pool *pool;
 
 	spin_unlock_irq(&gcwq->lock);
 	for_each_worker_pool(pool, gcwq)
-		mutex_unlock(&pool->manager_mutex);
+		mutex_unlock(&pool->assoc_mutex);
 }
 
 static void gcwq_unbind_fn(struct work_struct *work)
@@ -3469,7 +3547,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
 
 	BUG_ON(gcwq->cpu != smp_processor_id());
 
-	gcwq_claim_management_and_lock(gcwq);
+	gcwq_claim_assoc_and_lock(gcwq);
 
 	/*
 	 * We've claimed all manager positions. Make all workers unbound
@@ -3486,7 +3564,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
 
 	gcwq->flags |= GCWQ_DISASSOCIATED;
 
-	gcwq_release_management_and_unlock(gcwq);
+	gcwq_release_assoc_and_unlock(gcwq);
 
 	/*
 	 * Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3514,7 +3592,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
  * Workqueues should be brought up before normal priority CPU notifiers.
  * This will be registered high priority CPU notifier.
  */
-static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 					       unsigned long action,
 					       void *hcpu)
 {
@@ -3542,10 +3620,10 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		gcwq_claim_management_and_lock(gcwq);
+		gcwq_claim_assoc_and_lock(gcwq);
 		gcwq->flags &= ~GCWQ_DISASSOCIATED;
 		rebind_workers(gcwq);
-		gcwq_release_management_and_unlock(gcwq);
+		gcwq_release_assoc_and_unlock(gcwq);
 		break;
 	}
 	return NOTIFY_OK;
@@ -3555,7 +3633,7 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
  * Workqueues should be brought down after normal priority CPU notifiers.
  * This will be registered as low priority CPU notifier.
  */
-static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 						 unsigned long action,
 						 void *hcpu)
 {
@@ -3566,7 +3644,7 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 		/* unbinding should happen on the local CPU */
 		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
-		schedule_work_on(cpu, &unbind_work);
+		queue_work_on(cpu, system_highpri_wq, &unbind_work);
 		flush_work(&unbind_work);
 		break;
 	}
@@ -3735,11 +3813,7 @@ void thaw_workqueues(void)
 				continue;
 
 			/* restore max_active and repopulate worklist */
-			cwq->max_active = wq->saved_max_active;
-
-			while (!list_empty(&cwq->delayed_works) &&
-			       cwq->nr_active < cwq->max_active)
-				cwq_activate_first_delayed(cwq);
+			cwq_set_max_active(cwq, wq->saved_max_active);
 		}
 
 		for_each_worker_pool(pool, gcwq)
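thaw_workqueues() above only touches WQ_FREEZABLE workqueues; a hypothetical allocation sketch for such a queue (not part of this patch):

/* Sketch: a freezable workqueue, as handled by thaw_workqueues() above. */
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *mydrv_fs_wq;

static int mydrv_init_wq(void)
{
	/*
	 * Works queued here are held back while the system is frozen for
	 * suspend/hibernation; thawing restores max_active and releases
	 * whatever accumulated on the delayed list.
	 */
	mydrv_fs_wq = alloc_workqueue("mydrv_fs", WQ_FREEZABLE, 0);
	return mydrv_fs_wq ? 0 : -ENOMEM;
}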
@@ -3759,8 +3833,12 @@ static int __init init_workqueues(void)
 	unsigned int cpu;
 	int i;
 
+	/* make sure we have enough bits for OFFQ CPU number */
+	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
+		     WORK_CPU_LAST);
+
 	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-	cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
+	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
 
 	/* initialize gcwqs */
 	for_each_gcwq_cpu(cpu) {
@@ -3786,11 +3864,9 @@ static int __init init_workqueues(void)
 			setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
 				    (unsigned long)pool);
 
-			mutex_init(&pool->manager_mutex);
+			mutex_init(&pool->assoc_mutex);
 			ida_init(&pool->worker_ida);
 		}
-
-		init_waitqueue_head(&gcwq->rebind_hold);
 	}
 
 	/* create the initial worker */
@@ -3813,17 +3889,14 @@ static int __init init_workqueues(void)
 	}
 
 	system_wq = alloc_workqueue("events", 0, 0);
+	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
 	system_long_wq = alloc_workqueue("events_long", 0, 0);
-	system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
 	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
 					    WQ_UNBOUND_MAX_ACTIVE);
 	system_freezable_wq = alloc_workqueue("events_freezable",
 					      WQ_FREEZABLE, 0);
-	system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
-						  WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
-	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
-	       !system_unbound_wq || !system_freezable_wq ||
-	       !system_nrt_freezable_wq);
+	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
+	       !system_unbound_wq || !system_freezable_wq);
 	return 0;
 }
 early_initcall(init_workqueues);
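The events_highpri workqueue added above is used internally for the unbind work; callers that need the same behaviour for their own work items can allocate a WQ_HIGHPRI queue, as in this hypothetical sketch (not part of this patch):

/* Sketch: a driver-owned high-priority workqueue (hypothetical names). */
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *mydrv_hi_wq;

static void mydrv_urgent_fn(struct work_struct *work)
{
	/* latency-sensitive processing, served by the high-priority pool */
}

static DECLARE_WORK(mydrv_urgent_work, mydrv_urgent_fn);

static int mydrv_init_hi(void)
{
	mydrv_hi_wq = alloc_workqueue("mydrv_hi", WQ_HIGHPRI, 0);
	if (!mydrv_hi_wq)
		return -ENOMEM;

	queue_work(mydrv_hi_wq, &mydrv_urgent_work);
	return 0;
}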