author     Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
committer  Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
commit     8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree       a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /kernel
parent     406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks  103
-rw-r--r--  kernel/Kconfig.preempt  1
-rw-r--r--  kernel/Makefile  112
-rw-r--r--  kernel/acct.c  56
-rw-r--r--  kernel/async.c  84
-rw-r--r--  kernel/audit.c  248
-rw-r--r--  kernel/audit.h  17
-rw-r--r--  kernel/audit_tree.c  65
-rw-r--r--  kernel/audit_watch.c  36
-rw-r--r--  kernel/auditfilter.c  216
-rw-r--r--  kernel/auditsc.c  1186
-rw-r--r--  kernel/capability.c  103
-rw-r--r--  kernel/cgroup.c  2290
-rw-r--r--  kernel/cgroup_freezer.c  559
-rw-r--r--  kernel/compat.c  159
-rw-r--r--  kernel/context_tracking.c  83
-rw-r--r--  kernel/cpu.c  127
-rw-r--r--  kernel/cpu_pm.c  233
-rw-r--r--  kernel/cpuset.c  482
-rw-r--r--  kernel/crash_dump.c  13
-rw-r--r--  kernel/cred.c  204
-rw-r--r--  kernel/debug/debug_core.c  119
-rw-r--r--  kernel/debug/gdbstub.c  22
-rw-r--r--  kernel/debug/kdb/kdb_bp.c  7
-rw-r--r--  kernel/debug/kdb/kdb_bt.c  3
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c  5
-rw-r--r--  kernel/debug/kdb/kdb_io.c  48
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c  95
-rw-r--r--  kernel/debug/kdb/kdb_main.c  142
-rw-r--r--  kernel/debug/kdb/kdb_private.h  8
-rw-r--r--  kernel/debug/kdb/kdb_support.c  6
-rw-r--r--  kernel/dma.c  3
-rw-r--r--  kernel/events/Makefile  5
-rw-r--r--  kernel/events/callchain.c  206
-rw-r--r--  kernel/events/core.c  1201
-rw-r--r--  kernel/events/hw_breakpoint.c  40
-rw-r--r--  kernel/events/internal.h  125
-rw-r--r--  kernel/events/ring_buffer.c  15
-rw-r--r--  kernel/events/uprobes.c  1627
-rw-r--r--  kernel/exit.c  329
-rw-r--r--  kernel/extable.c  8
-rw-r--r--  kernel/fork.c  521
-rw-r--r--  kernel/freezer.c  216
-rw-r--r--  kernel/futex.c  177
-rw-r--r--  kernel/futex_compat.c  38
-rw-r--r--  kernel/gcov/Kconfig  8
-rw-r--r--  kernel/gcov/gcc_3_4.c  88
-rw-r--r--  kernel/gcov/gcov.h  42
-rw-r--r--  kernel/groups.c  52
-rw-r--r--  kernel/hrtimer.c  55
-rw-r--r--  kernel/hung_task.c  17
-rw-r--r--  kernel/irq/Kconfig  15
-rw-r--r--  kernel/irq/autoprobe.c  4
-rw-r--r--  kernel/irq/chip.c  126
-rw-r--r--  kernel/irq/debug.h  38
-rw-r--r--  kernel/irq/dummychip.c  2
-rw-r--r--  kernel/irq/generic-chip.c  5
-rw-r--r--  kernel/irq/handle.c  35
-rw-r--r--  kernel/irq/internals.h  28
-rw-r--r--  kernel/irq/irqdesc.c  35
-rw-r--r--  kernel/irq/irqdomain.c  953
-rw-r--r--  kernel/irq/manage.c  559
-rw-r--r--  kernel/irq/migration.c  9
-rw-r--r--  kernel/irq/pm.c  12
-rw-r--r--  kernel/irq/resend.c  30
-rw-r--r--  kernel/irq/settings.h  7
-rw-r--r--  kernel/irq/spurious.c  2
-rw-r--r--  kernel/irq_work.c  97
-rw-r--r--  kernel/itimer.c  23
-rw-r--r--  kernel/jump_label.c  166
-rw-r--r--  kernel/kallsyms.c  32
-rw-r--r--  kernel/kcmp.c  197
-rw-r--r--  kernel/kexec.c  86
-rw-r--r--  kernel/kfifo.c  3
-rw-r--r--  kernel/kmod.c  286
-rw-r--r--  kernel/kprobes.c  297
-rw-r--r--  kernel/ksysfs.c  26
-rw-r--r--  kernel/kthread.c  305
-rw-r--r--  kernel/latencytop.c  16
-rw-r--r--  kernel/lglock.c  89
-rw-r--r--  kernel/lockdep.c  370
-rw-r--r--  kernel/lockdep_proc.c  4
-rw-r--r--  kernel/modsign_certificate.S  19
-rw-r--r--  kernel/modsign_pubkey.c  104
-rw-r--r--  kernel/module-internal.h  14
-rw-r--r--  kernel/module.c  855
-rw-r--r--  kernel/module_signing.c  249
-rw-r--r--  kernel/mutex-debug.c  2
-rw-r--r--  kernel/mutex.c  6
-rw-r--r--  kernel/notifier.c  2
-rw-r--r--  kernel/nsproxy.c  38
-rw-r--r--  kernel/padata.c  64
-rw-r--r--  kernel/panic.c  72
-rw-r--r--  kernel/params.c  152
-rw-r--r--  kernel/pid.c  93
-rw-r--r--  kernel/pid_namespace.c  231
-rw-r--r--  kernel/posix-cpu-timers.c  169
-rw-r--r--  kernel/posix-timers.c  2
-rw-r--r--  kernel/power/Kconfig  115
-rw-r--r--  kernel/power/Makefile  11
-rw-r--r--  kernel/power/autosleep.c  127
-rw-r--r--  kernel/power/console.c  4
-rw-r--r--  kernel/power/hibernate.c  269
-rw-r--r--  kernel/power/main.c  333
-rw-r--r--  kernel/power/power.h  73
-rw-r--r--  kernel/power/poweroff.c  2
-rw-r--r--  kernel/power/process.c  181
-rw-r--r--  kernel/power/qos.c  602
-rw-r--r--  kernel/power/snapshot.c  62
-rw-r--r--  kernel/power/suspend.c  98
-rw-r--r--  kernel/power/swap.c  938
-rw-r--r--  kernel/power/user.c  194
-rw-r--r--  kernel/power/wakelock.c  764
-rw-r--r--  kernel/printk.c  2107
-rw-r--r--  kernel/profile.c  9
-rw-r--r--  kernel/ptrace.c  107
-rw-r--r--  kernel/range.c  2
-rw-r--r--  kernel/rcu.h  114
-rw-r--r--  kernel/rcupdate.c  124
-rw-r--r--  kernel/rcutiny.c  282
-rw-r--r--  kernel/rcutiny_plugin.h  288
-rw-r--r--  kernel/rcutorture.c  789
-rw-r--r--  kernel/rcutree.c  2213
-rw-r--r--  kernel/rcutree.h  251
-rw-r--r--  kernel/rcutree_plugin.h  1969
-rw-r--r--  kernel/rcutree_trace.c  399
-rw-r--r--  kernel/relay.c  19
-rw-r--r--  kernel/res_counter.c  99
-rw-r--r--  kernel/resource.c  96
-rw-r--r--  kernel/rtmutex-debug.c  80
-rw-r--r--  kernel/rtmutex-tester.c  39
-rw-r--r--  kernel/rtmutex.c  2
-rw-r--r--  kernel/rwsem.c  13
-rw-r--r--  kernel/sched/Makefile  18
-rw-r--r--  kernel/sched/auto_group.c  258
-rw-r--r--  kernel/sched/auto_group.h  64
-rw-r--r--  kernel/sched/clock.c  350
-rw-r--r--  kernel/sched/core.c  8162
-rw-r--r--  kernel/sched/cpupri.c  240
-rw-r--r--  kernel/sched/cpupri.h  34
-rw-r--r--  kernel/sched/cputime.c  589
-rw-r--r--  kernel/sched/debug.c  531
-rw-r--r--  kernel/sched/fair.c  6174
-rw-r--r--  kernel/sched/features.h  79
-rw-r--r--  kernel/sched/idle_task.c  98
-rw-r--r--  kernel/sched/rt.c  2094
-rw-r--r--  kernel/sched/sched.h  1241
-rw-r--r--  kernel/sched/stats.c  111
-rw-r--r--  kernel/sched/stats.h  231
-rw-r--r--  kernel/sched/stop_task.c  128
-rw-r--r--  kernel/seccomp.c  465
-rw-r--r--  kernel/semaphore.c  32
-rw-r--r--  kernel/signal.c  356
-rw-r--r--  kernel/smp.c  139
-rw-r--r--  kernel/smpboot.c  300
-rw-r--r--  kernel/smpboot.h  20
-rw-r--r--  kernel/softirq.c  162
-rw-r--r--  kernel/spinlock.c  4
-rw-r--r--  kernel/srcu.c  561
-rw-r--r--  kernel/stacktrace.c  2
-rw-r--r--  kernel/stop_machine.c  24
-rw-r--r--  kernel/sys.c  603
-rw-r--r--  kernel/sys_ni.c  8
-rw-r--r--  kernel/sysctl.c  701
-rw-r--r--  kernel/sysctl_binary.c  6
-rw-r--r--  kernel/task_work.c  92
-rw-r--r--  kernel/taskstats.c  44
-rw-r--r--  kernel/time.c  10
-rw-r--r--  kernel/time/Kconfig  64
-rw-r--r--  kernel/time/Makefile  4
-rw-r--r--  kernel/time/alarmtimer.c  342
-rw-r--r--  kernel/time/clockevents.c  157
-rw-r--r--  kernel/time/clocksource.c  89
-rw-r--r--  kernel/time/jiffies.c  40
-rw-r--r--  kernel/time/ntp.c  201
-rw-r--r--  kernel/time/posix-clock.c  1
-rw-r--r--  kernel/time/tick-broadcast.c  13
-rw-r--r--  kernel/time/tick-common.c  12
-rw-r--r--  kernel/time/tick-internal.h  3
-rw-r--r--  kernel/time/tick-oneshot.c  77
-rw-r--r--  kernel/time/tick-sched.c  519
-rw-r--r--  kernel/time/timekeeping.c  963
-rw-r--r--  kernel/time/timer_list.c  4
-rw-r--r--  kernel/time/timer_stats.c  6
-rw-r--r--  kernel/timer.c  321
-rw-r--r--  kernel/trace/Kconfig  67
-rw-r--r--  kernel/trace/Makefile  18
-rw-r--r--  kernel/trace/blktrace.c  21
-rw-r--r--  kernel/trace/ftrace.c  1292
-rw-r--r--  kernel/trace/ring_buffer.c  934
-rw-r--r--  kernel/trace/rpm-traces.c  20
-rw-r--r--  kernel/trace/trace.c  1342
-rw-r--r--  kernel/trace/trace.h  93
-rw-r--r--  kernel/trace/trace_branch.c  4
-rw-r--r--  kernel/trace/trace_clock.c  12
-rw-r--r--  kernel/trace/trace_entries.h  70
-rw-r--r--  kernel/trace/trace_event_perf.c  209
-rw-r--r--  kernel/trace/trace_events.c  184
-rw-r--r--  kernel/trace/trace_events_filter.c  1172
-rw-r--r--  kernel/trace/trace_events_filter_test.h  50
-rw-r--r--  kernel/trace/trace_export.c  67
-rw-r--r--  kernel/trace/trace_functions.c  44
-rw-r--r--  kernel/trace/trace_functions_graph.c  13
-rw-r--r--  kernel/trace/trace_irqsoff.c  44
-rw-r--r--  kernel/trace/trace_kprobe.c  923
-rw-r--r--  kernel/trace/trace_output.c  111
-rw-r--r--  kernel/trace/trace_printk.c  23
-rw-r--r--  kernel/trace/trace_probe.c  839
-rw-r--r--  kernel/trace/trace_probe.h  161
-rw-r--r--  kernel/trace/trace_sched_switch.c  4
-rw-r--r--  kernel/trace/trace_sched_wakeup.c  30
-rw-r--r--  kernel/trace/trace_selftest.c  305
-rw-r--r--  kernel/trace/trace_stack.c  38
-rw-r--r--  kernel/trace/trace_syscalls.c  94
-rw-r--r--  kernel/trace/trace_uprobe.c  788
-rw-r--r--  kernel/tracepoint.c  190
-rw-r--r--  kernel/tsacct.c  14
-rw-r--r--  kernel/uid16.c  48
-rw-r--r--  kernel/up.c  2
-rw-r--r--  kernel/user-return-notifier.c  2
-rw-r--r--  kernel/user.c  63
-rw-r--r--  kernel/user_namespace.c  834
-rw-r--r--  kernel/utsname.c  36
-rw-r--r--  kernel/utsname_sysctl.c  25
-rw-r--r--  kernel/wait.c  8
-rw-r--r--  kernel/watchdog.c  329
-rw-r--r--  kernel/workqueue.c  2188
227 files changed, 15804 insertions, 53963 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 44511d100ea..5068e2a4e75 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -87,9 +87,6 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ
87config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE 87config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
88 bool 88 bool
89 89
90config UNINLINE_SPIN_UNLOCK
91 bool
92
93# 90#
94# lock_* functions are inlined when: 91# lock_* functions are inlined when:
95# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y 92# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
@@ -106,120 +103,100 @@ config UNINLINE_SPIN_UNLOCK
106# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y 103# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
107# 104#
108 105
109if !DEBUG_SPINLOCK
110
111config INLINE_SPIN_TRYLOCK 106config INLINE_SPIN_TRYLOCK
112 def_bool y 107 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
113 depends on ARCH_INLINE_SPIN_TRYLOCK
114 108
115config INLINE_SPIN_TRYLOCK_BH 109config INLINE_SPIN_TRYLOCK_BH
116 def_bool y 110 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
117 depends on ARCH_INLINE_SPIN_TRYLOCK_BH
118 111
119config INLINE_SPIN_LOCK 112config INLINE_SPIN_LOCK
120 def_bool y 113 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
121 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
122 114
123config INLINE_SPIN_LOCK_BH 115config INLINE_SPIN_LOCK_BH
124 def_bool y 116 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
125 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH 117 ARCH_INLINE_SPIN_LOCK_BH
126 118
127config INLINE_SPIN_LOCK_IRQ 119config INLINE_SPIN_LOCK_IRQ
128 def_bool y 120 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
129 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ 121 ARCH_INLINE_SPIN_LOCK_IRQ
130 122
131config INLINE_SPIN_LOCK_IRQSAVE 123config INLINE_SPIN_LOCK_IRQSAVE
132 def_bool y 124 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
133 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE 125 ARCH_INLINE_SPIN_LOCK_IRQSAVE
126
127config INLINE_SPIN_UNLOCK
128 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
134 129
135config INLINE_SPIN_UNLOCK_BH 130config INLINE_SPIN_UNLOCK_BH
136 def_bool y 131 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
137 depends on ARCH_INLINE_SPIN_UNLOCK_BH
138 132
139config INLINE_SPIN_UNLOCK_IRQ 133config INLINE_SPIN_UNLOCK_IRQ
140 def_bool y 134 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
141 depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH
142 135
143config INLINE_SPIN_UNLOCK_IRQRESTORE 136config INLINE_SPIN_UNLOCK_IRQRESTORE
144 def_bool y 137 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
145 depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
146 138
147 139
148config INLINE_READ_TRYLOCK 140config INLINE_READ_TRYLOCK
149 def_bool y 141 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
150 depends on ARCH_INLINE_READ_TRYLOCK
151 142
152config INLINE_READ_LOCK 143config INLINE_READ_LOCK
153 def_bool y 144 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
154 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
155 145
156config INLINE_READ_LOCK_BH 146config INLINE_READ_LOCK_BH
157 def_bool y 147 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
158 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH 148 ARCH_INLINE_READ_LOCK_BH
159 149
160config INLINE_READ_LOCK_IRQ 150config INLINE_READ_LOCK_IRQ
161 def_bool y 151 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
162 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ 152 ARCH_INLINE_READ_LOCK_IRQ
163 153
164config INLINE_READ_LOCK_IRQSAVE 154config INLINE_READ_LOCK_IRQSAVE
165 def_bool y 155 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
166 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE 156 ARCH_INLINE_READ_LOCK_IRQSAVE
167 157
168config INLINE_READ_UNLOCK 158config INLINE_READ_UNLOCK
169 def_bool y 159 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
170 depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
171 160
172config INLINE_READ_UNLOCK_BH 161config INLINE_READ_UNLOCK_BH
173 def_bool y 162 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
174 depends on ARCH_INLINE_READ_UNLOCK_BH
175 163
176config INLINE_READ_UNLOCK_IRQ 164config INLINE_READ_UNLOCK_IRQ
177 def_bool y 165 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
178 depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH
179 166
180config INLINE_READ_UNLOCK_IRQRESTORE 167config INLINE_READ_UNLOCK_IRQRESTORE
181 def_bool y 168 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
182 depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE
183 169
184 170
185config INLINE_WRITE_TRYLOCK 171config INLINE_WRITE_TRYLOCK
186 def_bool y 172 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
187 depends on ARCH_INLINE_WRITE_TRYLOCK
188 173
189config INLINE_WRITE_LOCK 174config INLINE_WRITE_LOCK
190 def_bool y 175 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
191 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
192 176
193config INLINE_WRITE_LOCK_BH 177config INLINE_WRITE_LOCK_BH
194 def_bool y 178 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
195 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH 179 ARCH_INLINE_WRITE_LOCK_BH
196 180
197config INLINE_WRITE_LOCK_IRQ 181config INLINE_WRITE_LOCK_IRQ
198 def_bool y 182 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
199 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ 183 ARCH_INLINE_WRITE_LOCK_IRQ
200 184
201config INLINE_WRITE_LOCK_IRQSAVE 185config INLINE_WRITE_LOCK_IRQSAVE
202 def_bool y 186 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
203 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE 187 ARCH_INLINE_WRITE_LOCK_IRQSAVE
204 188
205config INLINE_WRITE_UNLOCK 189config INLINE_WRITE_UNLOCK
206 def_bool y 190 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
207 depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
208 191
209config INLINE_WRITE_UNLOCK_BH 192config INLINE_WRITE_UNLOCK_BH
210 def_bool y 193 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
211 depends on ARCH_INLINE_WRITE_UNLOCK_BH
212 194
213config INLINE_WRITE_UNLOCK_IRQ 195config INLINE_WRITE_UNLOCK_IRQ
214 def_bool y 196 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
215 depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH
216 197
217config INLINE_WRITE_UNLOCK_IRQRESTORE 198config INLINE_WRITE_UNLOCK_IRQRESTORE
218 def_bool y 199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
219 depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
220
221endif
222 200
223config MUTEX_SPIN_ON_OWNER 201config MUTEX_SPIN_ON_OWNER
224 def_bool y 202 def_bool SMP && !DEBUG_MUTEXES
225 depends on SMP && !DEBUG_MUTEXES
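
Note on the Kconfig.locks hunk above: every INLINE_* helper that was spelled as "def_bool y" plus a "depends on" line inside an "if !DEBUG_SPINLOCK" block is rewritten as a single "def_bool" whose expression carries the !DEBUG_SPINLOCK guard directly. For these promptless helper symbols the two spellings evaluate to the same value. A minimal illustration of the pattern, using one symbol from the hunk (not a verbatim excerpt of either file version):

    # Form on the left-hand side of the diff: the value is gated by the
    # surrounding "if" block and a separate "depends on" line.
    if !DEBUG_SPINLOCK

    config INLINE_SPIN_TRYLOCK
    	def_bool y
    	depends on ARCH_INLINE_SPIN_TRYLOCK

    endif

    # Form on the right-hand side of the diff: the guard becomes part of
    # the default expression itself, so no "if" block is needed.
    config INLINE_SPIN_TRYLOCK
    	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
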
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 3f9c97419f0..24e7cb0ba26 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -36,7 +36,6 @@ config PREEMPT_VOLUNTARY
36config PREEMPT 36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)" 37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 select PREEMPT_COUNT 38 select PREEMPT_COUNT
39 select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
40 help 39 help
41 This option reduces the latency of the kernel by making 40 This option reduces the latency of the kernel by making
42 all kernel code (that is not executing in a critical section) 41 all kernel code (that is not executing in a critical section)
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c072b6da23..eca595e2fd5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,15 +2,16 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = fork.o exec_domain.o panic.o printk.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o
14obj-y += groups.o
14 15
15ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 17# Do not trace debug files and internal ftrace files
@@ -19,17 +20,13 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
19CFLAGS_REMOVE_mutex-debug.o = -pg 20CFLAGS_REMOVE_mutex-debug.o = -pg
20CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
21CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
22CFLAGS_REMOVE_irq_work.o = -pg 24CFLAGS_REMOVE_irq_work.o = -pg
23endif 25endif
24 26
25obj-y += sched/
26obj-y += power/
27
28ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
29obj-$(CONFIG_X86) += kcmp.o
30endif
31obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
32obj-$(CONFIG_PROFILING) += profile.o 28obj-$(CONFIG_PROFILING) += profile.o
29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
33obj-$(CONFIG_STACKTRACE) += stacktrace.o 30obj-$(CONFIG_STACKTRACE) += stacktrace.o
34obj-y += time/ 31obj-y += time/
35obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 32obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
@@ -54,8 +51,9 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
54obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 51obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
55obj-$(CONFIG_UID16) += uid16.o 52obj-$(CONFIG_UID16) += uid16.o
56obj-$(CONFIG_MODULES) += module.o 53obj-$(CONFIG_MODULES) += module.o
57obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
58obj-$(CONFIG_KALLSYMS) += kallsyms.o 54obj-$(CONFIG_KALLSYMS) += kallsyms.o
55obj-$(CONFIG_PM) += power/
56obj-$(CONFIG_FREEZER) += power/
59obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 57obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
60obj-$(CONFIG_KEXEC) += kexec.o 58obj-$(CONFIG_KEXEC) += kexec.o
61obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 59obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
@@ -98,11 +96,11 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
98obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o 96obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
99obj-$(CONFIG_FUNCTION_TRACER) += trace/ 97obj-$(CONFIG_FUNCTION_TRACER) += trace/
100obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
101obj-$(CONFIG_TRACE_CLOCK) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
102obj-$(CONFIG_RING_BUFFER) += trace/ 100obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o
104obj-$(CONFIG_IRQ_WORK) += irq_work.o 103obj-$(CONFIG_IRQ_WORK) += irq_work.o
105obj-$(CONFIG_CPU_PM) += cpu_pm.o
106 104
107obj-$(CONFIG_PERF_EVENTS) += events/ 105obj-$(CONFIG_PERF_EVENTS) += events/
108 106
@@ -110,7 +108,15 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
110obj-$(CONFIG_PADATA) += padata.o 108obj-$(CONFIG_PADATA) += padata.o
111obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
112obj-$(CONFIG_JUMP_LABEL) += jump_label.o 110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
113obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o 111
112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
114# needed for x86 only. Why this used to be enabled for all architectures is beyond
115# me. I suspect most platforms don't need this, but until we know that for sure
116# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
117# to get a correct value for the wait-channel (WCHAN in ps). --davidm
118CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
119endif
114 120
115$(obj)/configs.o: $(obj)/config_data.h 121$(obj)/configs.o: $(obj)/config_data.h
116 122
@@ -132,81 +138,3 @@ quiet_cmd_timeconst = TIMEC $@
132targets += timeconst.h 138targets += timeconst.h
133$(obj)/timeconst.h: $(src)/timeconst.pl FORCE 139$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
134 $(call if_changed,timeconst) 140 $(call if_changed,timeconst)
135
136ifeq ($(CONFIG_MODULE_SIG),y)
137#
138# Pull the signing certificate and any extra certificates into the kernel
139#
140
141quiet_cmd_touch = TOUCH $@
142 cmd_touch = touch $@
143
144extra_certificates:
145 $(call cmd,touch)
146
147kernel/modsign_certificate.o: signing_key.x509 extra_certificates
148
149###############################################################################
150#
151# If module signing is requested, say by allyesconfig, but a key has not been
152# supplied, then one will need to be generated to make sure the build does not
153# fail and that the kernel may be used afterwards.
154#
155###############################################################################
156sign_key_with_hash :=
157ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
158sign_key_with_hash := -sha1
159endif
160ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
161sign_key_with_hash := -sha224
162endif
163ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
164sign_key_with_hash := -sha256
165endif
166ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
167sign_key_with_hash := -sha384
168endif
169ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
170sign_key_with_hash := -sha512
171endif
172ifeq ($(sign_key_with_hash),)
173$(error Could not determine digest type to use from kernel config)
174endif
175
176signing_key.priv signing_key.x509: x509.genkey
177 @echo "###"
178 @echo "### Now generating an X.509 key pair to be used for signing modules."
179 @echo "###"
180 @echo "### If this takes a long time, you might wish to run rngd in the"
181 @echo "### background to keep the supply of entropy topped up. It"
182 @echo "### needs to be run as root, and uses a hardware random"
183 @echo "### number generator if one is available."
184 @echo "###"
185 openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
186 -x509 -config x509.genkey \
187 -outform DER -out signing_key.x509 \
188 -keyout signing_key.priv
189 @echo "###"
190 @echo "### Key pair generated."
191 @echo "###"
192
193x509.genkey:
194 @echo Generating X.509 key generation config
195 @echo >x509.genkey "[ req ]"
196 @echo >>x509.genkey "default_bits = 4096"
197 @echo >>x509.genkey "distinguished_name = req_distinguished_name"
198 @echo >>x509.genkey "prompt = no"
199 @echo >>x509.genkey "string_mask = utf8only"
200 @echo >>x509.genkey "x509_extensions = myexts"
201 @echo >>x509.genkey
202 @echo >>x509.genkey "[ req_distinguished_name ]"
203 @echo >>x509.genkey "O = Magrathea"
204 @echo >>x509.genkey "CN = Glacier signing key"
205 @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
206 @echo >>x509.genkey
207 @echo >>x509.genkey "[ myexts ]"
208 @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
209 @echo >>x509.genkey "keyUsage=digitalSignature"
210 @echo >>x509.genkey "subjectKeyIdentifier=hash"
211 @echo >>x509.genkey "authorityKeyIdentifier=keyid"
212endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071a06e..fa7eb3de2dd 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -84,10 +84,11 @@ static void do_acct_process(struct bsd_acct_struct *acct,
84 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
85 */ 85 */
86struct bsd_acct_struct { 86struct bsd_acct_struct {
87 int active; 87 volatile int active;
88 unsigned long needcheck; 88 volatile int needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer;
91 struct list_head list; 92 struct list_head list;
92}; 93};
93 94
@@ -95,6 +96,15 @@ static DEFINE_SPINLOCK(acct_lock);
95static LIST_HEAD(acct_list); 96static LIST_HEAD(acct_list);
96 97
97/* 98/*
99 * Called whenever the timer says to check the free space.
100 */
101static void acct_timeout(unsigned long x)
102{
103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
105}
106
107/*
98 * Check the amount of free space and suspend/resume accordingly. 108 * Check the amount of free space and suspend/resume accordingly.
99 */ 109 */
100static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 110static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@ -102,12 +112,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
102 struct kstatfs sbuf; 112 struct kstatfs sbuf;
103 int res; 113 int res;
104 int act; 114 int act;
105 u64 resume; 115 sector_t resume;
106 u64 suspend; 116 sector_t suspend;
107 117
108 spin_lock(&acct_lock); 118 spin_lock(&acct_lock);
109 res = acct->active; 119 res = acct->active;
110 if (!file || time_is_before_jiffies(acct->needcheck)) 120 if (!file || !acct->needcheck)
111 goto out; 121 goto out;
112 spin_unlock(&acct_lock); 122 spin_unlock(&acct_lock);
113 123
@@ -117,8 +127,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
117 suspend = sbuf.f_blocks * SUSPEND; 127 suspend = sbuf.f_blocks * SUSPEND;
118 resume = sbuf.f_blocks * RESUME; 128 resume = sbuf.f_blocks * RESUME;
119 129
120 do_div(suspend, 100); 130 sector_div(suspend, 100);
121 do_div(resume, 100); 131 sector_div(resume, 100);
122 132
123 if (sbuf.f_bavail <= suspend) 133 if (sbuf.f_bavail <= suspend)
124 act = -1; 134 act = -1;
@@ -150,7 +160,10 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
150 } 160 }
151 } 161 }
152 162
153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 163 del_timer(&acct->timer);
164 acct->needcheck = 0;
165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
166 add_timer(&acct->timer);
154 res = acct->active; 167 res = acct->active;
155out: 168out:
156 spin_unlock(&acct_lock); 169 spin_unlock(&acct_lock);
@@ -172,7 +185,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
172 if (acct->file) { 185 if (acct->file) {
173 old_acct = acct->file; 186 old_acct = acct->file;
174 old_ns = acct->ns; 187 old_ns = acct->ns;
188 del_timer(&acct->timer);
175 acct->active = 0; 189 acct->active = 0;
190 acct->needcheck = 0;
176 acct->file = NULL; 191 acct->file = NULL;
177 acct->ns = NULL; 192 acct->ns = NULL;
178 list_del(&acct->list); 193 list_del(&acct->list);
@@ -180,9 +195,13 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
180 if (file) { 195 if (file) {
181 acct->file = file; 196 acct->file = file;
182 acct->ns = ns; 197 acct->ns = ns;
183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 198 acct->needcheck = 0;
184 acct->active = 1; 199 acct->active = 1;
185 list_add(&acct->list, &acct_list); 200 list_add(&acct->list, &acct_list);
201 /* It's been deleted if it was used before so this is safe */
202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
204 add_timer(&acct->timer);
186 } 205 }
187 if (old_acct) { 206 if (old_acct) {
188 mnt_unpin(old_acct->f_path.mnt); 207 mnt_unpin(old_acct->f_path.mnt);
@@ -193,7 +212,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
193 } 212 }
194} 213}
195 214
196static int acct_on(struct filename *pathname) 215static int acct_on(char *name)
197{ 216{
198 struct file *file; 217 struct file *file;
199 struct vfsmount *mnt; 218 struct vfsmount *mnt;
@@ -201,7 +220,7 @@ static int acct_on(struct filename *pathname)
201 struct bsd_acct_struct *acct = NULL; 220 struct bsd_acct_struct *acct = NULL;
202 221
203 /* Difference from BSD - they don't do O_APPEND */ 222 /* Difference from BSD - they don't do O_APPEND */
204 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 223 file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
205 if (IS_ERR(file)) 224 if (IS_ERR(file))
206 return PTR_ERR(file); 225 return PTR_ERR(file);
207 226
@@ -260,7 +279,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
260 return -EPERM; 279 return -EPERM;
261 280
262 if (name) { 281 if (name) {
263 struct filename *tmp = getname(name); 282 char *tmp = getname(name);
264 if (IS_ERR(tmp)) 283 if (IS_ERR(tmp))
265 return (PTR_ERR(tmp)); 284 return (PTR_ERR(tmp));
266 error = acct_on(tmp); 285 error = acct_on(tmp);
@@ -315,7 +334,7 @@ void acct_auto_close(struct super_block *sb)
315 spin_lock(&acct_lock); 334 spin_lock(&acct_lock);
316restart: 335restart:
317 list_for_each_entry(acct, &acct_list, list) 336 list_for_each_entry(acct, &acct_list, list)
318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) { 337 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
319 acct_file_reopen(acct, NULL, NULL); 338 acct_file_reopen(acct, NULL, NULL);
320 goto restart; 339 goto restart;
321 } 340 }
@@ -329,6 +348,7 @@ void acct_exit_ns(struct pid_namespace *ns)
329 if (acct == NULL) 348 if (acct == NULL)
330 return; 349 return;
331 350
351 del_timer_sync(&acct->timer);
332 spin_lock(&acct_lock); 352 spin_lock(&acct_lock);
333 if (acct->file != NULL) 353 if (acct->file != NULL)
334 acct_file_reopen(acct, NULL, NULL); 354 acct_file_reopen(acct, NULL, NULL);
@@ -478,7 +498,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
478 * Fill the accounting struct with the needed info as recorded 498 * Fill the accounting struct with the needed info as recorded
479 * by the different kernel functions. 499 * by the different kernel functions.
480 */ 500 */
481 memset(&ac, 0, sizeof(acct_t)); 501 memset((caddr_t)&ac, 0, sizeof(acct_t));
482 502
483 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 503 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
484 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 504 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
@@ -507,8 +527,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
507 do_div(elapsed, AHZ); 527 do_div(elapsed, AHZ);
508 ac.ac_btime = get_seconds() - elapsed; 528 ac.ac_btime = get_seconds() - elapsed;
509 /* we really need to bite the bullet and change layout */ 529 /* we really need to bite the bullet and change layout */
510 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); 530 ac.ac_uid = orig_cred->uid;
511 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); 531 ac.ac_gid = orig_cred->gid;
512#if ACCT_VERSION==2 532#if ACCT_VERSION==2
513 ac.ac_ahz = AHZ; 533 ac.ac_ahz = AHZ;
514#endif 534#endif
@@ -593,8 +613,8 @@ void acct_collect(long exitcode, int group_dead)
593 pacct->ac_flag |= ACORE; 613 pacct->ac_flag |= ACORE;
594 if (current->flags & PF_SIGNALED) 614 if (current->flags & PF_SIGNALED)
595 pacct->ac_flag |= AXSIG; 615 pacct->ac_flag |= AXSIG;
596 pacct->ac_utime += current->utime; 616 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
597 pacct->ac_stime += current->stime; 617 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
598 pacct->ac_minflt += current->min_flt; 618 pacct->ac_minflt += current->min_flt;
599 pacct->ac_majflt += current->maj_flt; 619 pacct->ac_majflt += current->maj_flt;
600 spin_unlock_irq(&current->sighand->siglock); 620 spin_unlock_irq(&current->sighand->siglock);
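
Note on the kernel/acct.c hunks above: the jiffies comparison (time_is_before_jiffies(acct->needcheck)) is replaced by the older scheme of a struct timer_list whose callback, acct_timeout(), merely flags that the free-space check is due; check_free_space() then performs the statfs and re-arms the timer for another ACCT_TIMEOUT*HZ. A minimal, self-contained sketch of that re-arming pattern follows; the names checked_thing, CHECK_INTERVAL and the helper functions are illustrative only (not from the tree), and it uses the same pre-timer_setup() API the patch itself uses:

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    #define CHECK_INTERVAL (30 * HZ)	/* stand-in for ACCT_TIMEOUT*HZ */

    struct checked_thing {
    	volatile int needcheck;		/* set from timer context */
    	struct timer_list timer;
    };

    /* Timer callback: only record that the expensive check is due. */
    static void check_timeout(unsigned long data)
    {
    	struct checked_thing *t = (struct checked_thing *)data;

    	t->needcheck = 1;
    }

    /* Arm the timer for the first interval. */
    static void checked_thing_start(struct checked_thing *t)
    {
    	t->needcheck = 0;
    	setup_timer(&t->timer, check_timeout, (unsigned long)t);
    	t->timer.expires = jiffies + CHECK_INTERVAL;
    	add_timer(&t->timer);
    }

    /* Called from process context; does the real work and re-arms. */
    static void checked_thing_maybe_check(struct checked_thing *t)
    {
    	if (!t->needcheck)
    		return;

    	/* ... expensive check goes here (vfs_statfs() in acct.c) ... */

    	del_timer(&t->timer);
    	t->needcheck = 0;
    	t->timer.expires = jiffies + CHECK_INTERVAL;
    	add_timer(&t->timer);
    }
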
diff --git a/kernel/async.c b/kernel/async.c
index 9d311838485..d5fe7af0de2 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -51,7 +51,7 @@ asynchronous and synchronous parts of the kernel.
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h> 52#include <linux/atomic.h>
53#include <linux/ktime.h> 53#include <linux/ktime.h>
54#include <linux/export.h> 54#include <linux/module.h>
55#include <linux/wait.h> 55#include <linux/wait.h>
56#include <linux/sched.h> 56#include <linux/sched.h>
57#include <linux/slab.h> 57#include <linux/slab.h>
@@ -62,10 +62,8 @@ static async_cookie_t next_cookie = 1;
62#define MAX_WORK 32768 62#define MAX_WORK 32768
63 63
64static LIST_HEAD(async_pending); 64static LIST_HEAD(async_pending);
65static ASYNC_DOMAIN(async_running); 65static LIST_HEAD(async_running);
66static LIST_HEAD(async_domains);
67static DEFINE_SPINLOCK(async_lock); 66static DEFINE_SPINLOCK(async_lock);
68static DEFINE_MUTEX(async_register_mutex);
69 67
70struct async_entry { 68struct async_entry {
71 struct list_head list; 69 struct list_head list;
@@ -73,23 +71,26 @@ struct async_entry {
73 async_cookie_t cookie; 71 async_cookie_t cookie;
74 async_func_ptr *func; 72 async_func_ptr *func;
75 void *data; 73 void *data;
76 struct async_domain *running; 74 struct list_head *running;
77}; 75};
78 76
79static DECLARE_WAIT_QUEUE_HEAD(async_done); 77static DECLARE_WAIT_QUEUE_HEAD(async_done);
80 78
81static atomic_t entry_count; 79static atomic_t entry_count;
82 80
81extern int initcall_debug;
82
83 83
84/* 84/*
85 * MUST be called with the lock held! 85 * MUST be called with the lock held!
86 */ 86 */
87static async_cookie_t __lowest_in_progress(struct async_domain *running) 87static async_cookie_t __lowest_in_progress(struct list_head *running)
88{ 88{
89 struct async_entry *entry; 89 struct async_entry *entry;
90 90
91 if (!list_empty(&running->domain)) { 91 if (!list_empty(running)) {
92 entry = list_first_entry(&running->domain, typeof(*entry), list); 92 entry = list_first_entry(running,
93 struct async_entry, list);
93 return entry->cookie; 94 return entry->cookie;
94 } 95 }
95 96
@@ -100,7 +101,7 @@ static async_cookie_t __lowest_in_progress(struct async_domain *running)
100 return next_cookie; /* "infinity" value */ 101 return next_cookie; /* "infinity" value */
101} 102}
102 103
103static async_cookie_t lowest_in_progress(struct async_domain *running) 104static async_cookie_t lowest_in_progress(struct list_head *running)
104{ 105{
105 unsigned long flags; 106 unsigned long flags;
106 async_cookie_t ret; 107 async_cookie_t ret;
@@ -119,12 +120,11 @@ static void async_run_entry_fn(struct work_struct *work)
119 struct async_entry *entry = 120 struct async_entry *entry =
120 container_of(work, struct async_entry, work); 121 container_of(work, struct async_entry, work);
121 unsigned long flags; 122 unsigned long flags;
122 ktime_t uninitialized_var(calltime), delta, rettime; 123 ktime_t calltime, delta, rettime;
123 struct async_domain *running = entry->running;
124 124
125 /* 1) move self to the running queue */ 125 /* 1) move self to the running queue */
126 spin_lock_irqsave(&async_lock, flags); 126 spin_lock_irqsave(&async_lock, flags);
127 list_move_tail(&entry->list, &running->domain); 127 list_move_tail(&entry->list, entry->running);
128 spin_unlock_irqrestore(&async_lock, flags); 128 spin_unlock_irqrestore(&async_lock, flags);
129 129
130 /* 2) run (and print duration) */ 130 /* 2) run (and print duration) */
@@ -147,8 +147,6 @@ static void async_run_entry_fn(struct work_struct *work)
147 /* 3) remove self from the running queue */ 147 /* 3) remove self from the running queue */
148 spin_lock_irqsave(&async_lock, flags); 148 spin_lock_irqsave(&async_lock, flags);
149 list_del(&entry->list); 149 list_del(&entry->list);
150 if (running->registered && --running->count == 0)
151 list_del_init(&running->node);
152 150
153 /* 4) free the entry */ 151 /* 4) free the entry */
154 kfree(entry); 152 kfree(entry);
@@ -160,7 +158,7 @@ static void async_run_entry_fn(struct work_struct *work)
160 wake_up(&async_done); 158 wake_up(&async_done);
161} 159}
162 160
163static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) 161static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
164{ 162{
165 struct async_entry *entry; 163 struct async_entry *entry;
166 unsigned long flags; 164 unsigned long flags;
@@ -191,8 +189,6 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
191 spin_lock_irqsave(&async_lock, flags); 189 spin_lock_irqsave(&async_lock, flags);
192 newcookie = entry->cookie = next_cookie++; 190 newcookie = entry->cookie = next_cookie++;
193 list_add_tail(&entry->list, &async_pending); 191 list_add_tail(&entry->list, &async_pending);
194 if (running->registered && running->count++ == 0)
195 list_add_tail(&running->node, &async_domains);
196 atomic_inc(&entry_count); 192 atomic_inc(&entry_count);
197 spin_unlock_irqrestore(&async_lock, flags); 193 spin_unlock_irqrestore(&async_lock, flags);
198 194
@@ -229,7 +225,7 @@ EXPORT_SYMBOL_GPL(async_schedule);
229 * Note: This function may be called from atomic or non-atomic contexts. 225 * Note: This function may be called from atomic or non-atomic contexts.
230 */ 226 */
231async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, 227async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
232 struct async_domain *running) 228 struct list_head *running)
233{ 229{
234 return __async_schedule(ptr, data, running); 230 return __async_schedule(ptr, data, running);
235} 231}
@@ -242,52 +238,22 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
242 */ 238 */
243void async_synchronize_full(void) 239void async_synchronize_full(void)
244{ 240{
245 mutex_lock(&async_register_mutex);
246 do { 241 do {
247 struct async_domain *domain = NULL; 242 async_synchronize_cookie(next_cookie);
248 243 } while (!list_empty(&async_running) || !list_empty(&async_pending));
249 spin_lock_irq(&async_lock);
250 if (!list_empty(&async_domains))
251 domain = list_first_entry(&async_domains, typeof(*domain), node);
252 spin_unlock_irq(&async_lock);
253
254 async_synchronize_cookie_domain(next_cookie, domain);
255 } while (!list_empty(&async_domains));
256 mutex_unlock(&async_register_mutex);
257} 244}
258EXPORT_SYMBOL_GPL(async_synchronize_full); 245EXPORT_SYMBOL_GPL(async_synchronize_full);
259 246
260/** 247/**
261 * async_unregister_domain - ensure no more anonymous waiters on this domain
262 * @domain: idle domain to flush out of any async_synchronize_full instances
263 *
264 * async_synchronize_{cookie|full}_domain() are not flushed since callers
265 * of these routines should know the lifetime of @domain
266 *
267 * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
268 */
269void async_unregister_domain(struct async_domain *domain)
270{
271 mutex_lock(&async_register_mutex);
272 spin_lock_irq(&async_lock);
273 WARN_ON(!domain->registered || !list_empty(&domain->node) ||
274 !list_empty(&domain->domain));
275 domain->registered = 0;
276 spin_unlock_irq(&async_lock);
277 mutex_unlock(&async_register_mutex);
278}
279EXPORT_SYMBOL_GPL(async_unregister_domain);
280
281/**
282 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain 248 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
283 * @domain: running list to synchronize on 249 * @list: running list to synchronize on
284 * 250 *
285 * This function waits until all asynchronous function calls for the 251 * This function waits until all asynchronous function calls for the
286 * synchronization domain specified by the running list @domain have been done. 252 * synchronization domain specified by the running list @list have been done.
287 */ 253 */
288void async_synchronize_full_domain(struct async_domain *domain) 254void async_synchronize_full_domain(struct list_head *list)
289{ 255{
290 async_synchronize_cookie_domain(next_cookie, domain); 256 async_synchronize_cookie_domain(next_cookie, list);
291} 257}
292EXPORT_SYMBOL_GPL(async_synchronize_full_domain); 258EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
293 259
@@ -297,15 +263,13 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
297 * @running: running list to synchronize on 263 * @running: running list to synchronize on
298 * 264 *
299 * This function waits until all asynchronous function calls for the 265 * This function waits until all asynchronous function calls for the
300 * synchronization domain specified by running list @running submitted 266 * synchronization domain specified by the running list @list submitted
301 * prior to @cookie have been done. 267 * prior to @cookie have been done.
302 */ 268 */
303void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) 269void async_synchronize_cookie_domain(async_cookie_t cookie,
270 struct list_head *running)
304{ 271{
305 ktime_t uninitialized_var(starttime), delta, endtime; 272 ktime_t starttime, delta, endtime;
306
307 if (!running)
308 return;
309 273
310 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
311 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
diff --git a/kernel/audit.c b/kernel/audit.c
index d596e5355f1..0a1355ca3d7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -45,7 +45,7 @@
45#include <asm/types.h> 45#include <asm/types.h>
46#include <linux/atomic.h> 46#include <linux/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/export.h> 48#include <linux/module.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
50#include <linux/err.h> 50#include <linux/err.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
@@ -61,7 +61,6 @@
61#include <linux/netlink.h> 61#include <linux/netlink.h>
62#include <linux/freezer.h> 62#include <linux/freezer.h>
63#include <linux/tty.h> 63#include <linux/tty.h>
64#include <linux/pid_namespace.h>
65 64
66#include "audit.h" 65#include "audit.h"
67 66
@@ -88,11 +87,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
88 87
89/* 88/*
90 * If audit records are to be written to the netlink socket, audit_pid 89 * If audit records are to be written to the netlink socket, audit_pid
91 * contains the pid of the auditd process and audit_nlk_portid contains 90 * contains the pid of the auditd process and audit_nlk_pid contains
92 * the portid to use to send netlink messages to that process. 91 * the pid to use to send netlink messages to that process.
93 */ 92 */
94int audit_pid; 93int audit_pid;
95static int audit_nlk_portid; 94static int audit_nlk_pid;
96 95
97/* If audit_rate_limit is non-zero, limit the rate of sending audit records 96/* If audit_rate_limit is non-zero, limit the rate of sending audit records
98 * to that number per second. This prevents DoS attacks, but results in 97 * to that number per second. This prevents DoS attacks, but results in
@@ -105,7 +104,7 @@ static int audit_backlog_wait_time = 60 * HZ;
105static int audit_backlog_wait_overflow = 0; 104static int audit_backlog_wait_overflow = 0;
106 105
107/* The identity of the user shutting down the audit system. */ 106/* The identity of the user shutting down the audit system. */
108kuid_t audit_sig_uid = INVALID_UID; 107uid_t audit_sig_uid = -1;
109pid_t audit_sig_pid = -1; 108pid_t audit_sig_pid = -1;
110u32 audit_sig_sid = 0; 109u32 audit_sig_sid = 0;
111 110
@@ -265,17 +264,15 @@ void audit_log_lost(const char *message)
265} 264}
266 265
267static int audit_log_config_change(char *function_name, int new, int old, 266static int audit_log_config_change(char *function_name, int new, int old,
268 kuid_t loginuid, u32 sessionid, u32 sid, 267 uid_t loginuid, u32 sessionid, u32 sid,
269 int allow_changes) 268 int allow_changes)
270{ 269{
271 struct audit_buffer *ab; 270 struct audit_buffer *ab;
272 int rc = 0; 271 int rc = 0;
273 272
274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 273 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
275 if (unlikely(!ab))
276 return rc;
277 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, 274 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
278 old, from_kuid(&init_user_ns, loginuid), sessionid); 275 old, loginuid, sessionid);
279 if (sid) { 276 if (sid) {
280 char *ctx = NULL; 277 char *ctx = NULL;
281 u32 len; 278 u32 len;
@@ -295,7 +292,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
295} 292}
296 293
297static int audit_do_config_change(char *function_name, int *to_change, 294static int audit_do_config_change(char *function_name, int *to_change,
298 int new, kuid_t loginuid, u32 sessionid, 295 int new, uid_t loginuid, u32 sessionid,
299 u32 sid) 296 u32 sid)
300{ 297{
301 int allow_changes, rc = 0, old = *to_change; 298 int allow_changes, rc = 0, old = *to_change;
@@ -322,21 +319,21 @@ static int audit_do_config_change(char *function_name, int *to_change,
322 return rc; 319 return rc;
323} 320}
324 321
325static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, 322static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
326 u32 sid) 323 u32 sid)
327{ 324{
328 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, 325 return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
329 limit, loginuid, sessionid, sid); 326 limit, loginuid, sessionid, sid);
330} 327}
331 328
332static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, 329static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
333 u32 sid) 330 u32 sid)
334{ 331{
335 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, 332 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
336 limit, loginuid, sessionid, sid); 333 limit, loginuid, sessionid, sid);
337} 334}
338 335
339static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) 336static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
340{ 337{
341 int rc; 338 int rc;
342 if (state < AUDIT_OFF || state > AUDIT_LOCKED) 339 if (state < AUDIT_OFF || state > AUDIT_LOCKED)
@@ -351,7 +348,7 @@ static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid)
351 return rc; 348 return rc;
352} 349}
353 350
354static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) 351static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
355{ 352{
356 if (state != AUDIT_FAIL_SILENT 353 if (state != AUDIT_FAIL_SILENT
357 && state != AUDIT_FAIL_PRINTK 354 && state != AUDIT_FAIL_PRINTK
@@ -387,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb)
387static void audit_printk_skb(struct sk_buff *skb) 384static void audit_printk_skb(struct sk_buff *skb)
388{ 385{
389 struct nlmsghdr *nlh = nlmsg_hdr(skb); 386 struct nlmsghdr *nlh = nlmsg_hdr(skb);
390 char *data = nlmsg_data(nlh); 387 char *data = NLMSG_DATA(nlh);
391 388
392 if (nlh->nlmsg_type != AUDIT_EOE) { 389 if (nlh->nlmsg_type != AUDIT_EOE) {
393 if (printk_ratelimit()) 390 if (printk_ratelimit())
@@ -404,7 +401,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
404 int err; 401 int err;
405 /* take a reference in case we can't send it and we want to hold it */ 402 /* take a reference in case we can't send it and we want to hold it */
406 skb_get(skb); 403 skb_get(skb);
407 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); 404 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
408 if (err < 0) { 405 if (err < 0) {
409 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 406 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
410 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 407 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
@@ -470,6 +467,24 @@ static int kauditd_thread(void *dummy)
470 return 0; 467 return 0;
471} 468}
472 469
470static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
471{
472 struct task_struct *tsk;
473 int err;
474
475 rcu_read_lock();
476 tsk = find_task_by_vpid(pid);
477 if (!tsk) {
478 rcu_read_unlock();
479 return -ESRCH;
480 }
481 get_task_struct(tsk);
482 rcu_read_unlock();
483 err = tty_audit_push_task(tsk, loginuid, sessionid);
484 put_task_struct(tsk);
485 return err;
486}
487
473int audit_send_list(void *_dest) 488int audit_send_list(void *_dest)
474{ 489{
475 struct audit_netlink_list *dest = _dest; 490 struct audit_netlink_list *dest = _dest;
@@ -501,15 +516,14 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
501 if (!skb) 516 if (!skb)
502 return NULL; 517 return NULL;
503 518
504 nlh = nlmsg_put(skb, pid, seq, t, size, flags); 519 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
505 if (!nlh) 520 data = NLMSG_DATA(nlh);
506 goto out_kfree_skb;
507 data = nlmsg_data(nlh);
508 memcpy(data, payload, size); 521 memcpy(data, payload, size);
509 return skb; 522 return skb;
510 523
511out_kfree_skb: 524nlmsg_failure: /* Used by NLMSG_NEW */
512 kfree_skb(skb); 525 if (skb)
526 kfree_skb(skb);
513 return NULL; 527 return NULL;
514} 528}
515 529
@@ -573,11 +587,6 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
573{ 587{
574 int err = 0; 588 int err = 0;
575 589
576 /* Only support the initial namespaces for now. */
577 if ((current_user_ns() != &init_user_ns) ||
578 (task_active_pid_ns(current) != &init_pid_ns))
579 return -EPERM;
580
581 switch (msg_type) { 590 switch (msg_type) {
582 case AUDIT_GET: 591 case AUDIT_GET:
583 case AUDIT_LIST: 592 case AUDIT_LIST:
@@ -592,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
592 case AUDIT_TTY_SET: 601 case AUDIT_TTY_SET:
593 case AUDIT_TRIM: 602 case AUDIT_TRIM:
594 case AUDIT_MAKE_EQUIV: 603 case AUDIT_MAKE_EQUIV:
595 if (!capable(CAP_AUDIT_CONTROL)) 604 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
596 err = -EPERM; 605 err = -EPERM;
597 break; 606 break;
598 case AUDIT_USER: 607 case AUDIT_USER:
599 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 608 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
600 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: 609 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
601 if (!capable(CAP_AUDIT_WRITE)) 610 if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
602 err = -EPERM; 611 err = -EPERM;
603 break; 612 break;
604 default: /* bad msg */ 613 default: /* bad msg */
@@ -609,7 +618,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
609} 618}
610 619
611static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, 620static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
612 kuid_t auid, u32 ses, u32 sid) 621 u32 pid, u32 uid, uid_t auid, u32 ses,
622 u32 sid)
613{ 623{
614 int rc = 0; 624 int rc = 0;
615 char *ctx = NULL; 625 char *ctx = NULL;
@@ -621,12 +631,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
621 } 631 }
622 632
623 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 633 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
624 if (unlikely(!*ab)) 634 audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
625 return rc; 635 pid, uid, auid, ses);
626 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
627 task_tgid_vnr(current),
628 from_kuid(&init_user_ns, current_uid()),
629 from_kuid(&init_user_ns, auid), ses);
630 if (sid) { 636 if (sid) {
631 rc = security_secid_to_secctx(sid, &ctx, &len); 637 rc = security_secid_to_secctx(sid, &ctx, &len);
632 if (rc) 638 if (rc)
@@ -642,13 +648,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
642 648
643static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 649static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
644{ 650{
645 u32 seq, sid; 651 u32 uid, pid, seq, sid;
646 void *data; 652 void *data;
647 struct audit_status *status_get, status_set; 653 struct audit_status *status_get, status_set;
648 int err; 654 int err;
649 struct audit_buffer *ab; 655 struct audit_buffer *ab;
650 u16 msg_type = nlh->nlmsg_type; 656 u16 msg_type = nlh->nlmsg_type;
651 kuid_t loginuid; /* loginuid of sender */ 657 uid_t loginuid; /* loginuid of sender */
652 u32 sessionid; 658 u32 sessionid;
653 struct audit_sig_info *sig_data; 659 struct audit_sig_info *sig_data;
654 char *ctx = NULL; 660 char *ctx = NULL;
@@ -668,11 +674,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
668 return err; 674 return err;
669 } 675 }
670 676
677 pid = NETLINK_CREDS(skb)->pid;
678 uid = NETLINK_CREDS(skb)->uid;
671 loginuid = audit_get_loginuid(current); 679 loginuid = audit_get_loginuid(current);
672 sessionid = audit_get_sessionid(current); 680 sessionid = audit_get_sessionid(current);
673 security_task_getsecid(current, &sid); 681 security_task_getsecid(current, &sid);
674 seq = nlh->nlmsg_seq; 682 seq = nlh->nlmsg_seq;
675 data = nlmsg_data(nlh); 683 data = NLMSG_DATA(nlh);
676 684
677 switch (msg_type) { 685 switch (msg_type) {
678 case AUDIT_GET: 686 case AUDIT_GET:
@@ -683,7 +691,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
683 status_set.backlog_limit = audit_backlog_limit; 691 status_set.backlog_limit = audit_backlog_limit;
684 status_set.lost = atomic_read(&audit_lost); 692 status_set.lost = atomic_read(&audit_lost);
685 status_set.backlog = skb_queue_len(&audit_skb_queue); 693 status_set.backlog = skb_queue_len(&audit_skb_queue);
686 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, 694 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
687 &status_set, sizeof(status_set)); 695 &status_set, sizeof(status_set));
688 break; 696 break;
689 case AUDIT_SET: 697 case AUDIT_SET:
@@ -711,7 +719,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
711 sessionid, sid, 1); 719 sessionid, sid, 1);
712 720
713 audit_pid = new_pid; 721 audit_pid = new_pid;
714 audit_nlk_portid = NETLINK_CB(skb).portid; 722 audit_nlk_pid = NETLINK_CB(skb).pid;
715 } 723 }
716 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { 724 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
717 err = audit_set_rate_limit(status_get->rate_limit, 725 err = audit_set_rate_limit(status_get->rate_limit,
@@ -729,16 +737,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
729 if (!audit_enabled && msg_type != AUDIT_USER_AVC) 737 if (!audit_enabled && msg_type != AUDIT_USER_AVC)
730 return 0; 738 return 0;
731 739
732 err = audit_filter_user(); 740 err = audit_filter_user(&NETLINK_CB(skb));
733 if (err == 1) { 741 if (err == 1) {
734 err = 0; 742 err = 0;
735 if (msg_type == AUDIT_USER_TTY) { 743 if (msg_type == AUDIT_USER_TTY) {
736 err = tty_audit_push_task(current, loginuid, 744 err = audit_prepare_user_tty(pid, loginuid,
737 sessionid); 745 sessionid);
738 if (err) 746 if (err)
739 break; 747 break;
740 } 748 }
741 audit_log_common_recv_msg(&ab, msg_type, 749 audit_log_common_recv_msg(&ab, msg_type, pid, uid,
742 loginuid, sessionid, sid); 750 loginuid, sessionid, sid);
743 751
744 if (msg_type != AUDIT_USER_TTY) 752 if (msg_type != AUDIT_USER_TTY)
@@ -754,7 +762,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
754 size--; 762 size--;
755 audit_log_n_untrustedstring(ab, data, size); 763 audit_log_n_untrustedstring(ab, data, size);
756 } 764 }
757 audit_set_pid(ab, NETLINK_CB(skb).portid); 765 audit_set_pid(ab, pid);
758 audit_log_end(ab); 766 audit_log_end(ab);
759 } 767 }
760 break; 768 break;
@@ -763,8 +771,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
763 if (nlmsg_len(nlh) < sizeof(struct audit_rule)) 771 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
764 return -EINVAL; 772 return -EINVAL;
765 if (audit_enabled == AUDIT_LOCKED) { 773 if (audit_enabled == AUDIT_LOCKED) {
766 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, 774 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
767 loginuid, sessionid, sid); 775 uid, loginuid, sessionid, sid);
768 776
769 audit_log_format(ab, " audit_enabled=%d res=0", 777 audit_log_format(ab, " audit_enabled=%d res=0",
770 audit_enabled); 778 audit_enabled);
@@ -773,8 +781,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
773 } 781 }
774 /* fallthrough */ 782 /* fallthrough */
775 case AUDIT_LIST: 783 case AUDIT_LIST:
776 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, 784 err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
777 seq, data, nlmsg_len(nlh), 785 uid, seq, data, nlmsg_len(nlh),
778 loginuid, sessionid, sid); 786 loginuid, sessionid, sid);
779 break; 787 break;
780 case AUDIT_ADD_RULE: 788 case AUDIT_ADD_RULE:
@@ -782,8 +790,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
782 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) 790 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
783 return -EINVAL; 791 return -EINVAL;
784 if (audit_enabled == AUDIT_LOCKED) { 792 if (audit_enabled == AUDIT_LOCKED) {
785 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, 793 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
786 loginuid, sessionid, sid); 794 uid, loginuid, sessionid, sid);
787 795
788 audit_log_format(ab, " audit_enabled=%d res=0", 796 audit_log_format(ab, " audit_enabled=%d res=0",
789 audit_enabled); 797 audit_enabled);
@@ -792,15 +800,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
792 } 800 }
793 /* fallthrough */ 801 /* fallthrough */
794 case AUDIT_LIST_RULES: 802 case AUDIT_LIST_RULES:
795 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, 803 err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
796 seq, data, nlmsg_len(nlh), 804 uid, seq, data, nlmsg_len(nlh),
797 loginuid, sessionid, sid); 805 loginuid, sessionid, sid);
798 break; 806 break;
799 case AUDIT_TRIM: 807 case AUDIT_TRIM:
800 audit_trim_trees(); 808 audit_trim_trees();
801 809
802 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, 810 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
803 loginuid, sessionid, sid); 811 uid, loginuid, sessionid, sid);
804 812
805 audit_log_format(ab, " op=trim res=1"); 813 audit_log_format(ab, " op=trim res=1");
806 audit_log_end(ab); 814 audit_log_end(ab);
@@ -831,8 +839,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
831 /* OK, here comes... */ 839 /* OK, here comes... */
832 err = audit_tag_tree(old, new); 840 err = audit_tag_tree(old, new);
833 841
834 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, 842 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
835 loginuid, sessionid, sid); 843 uid, loginuid, sessionid, sid);
836 844
837 audit_log_format(ab, " op=make_equiv old="); 845 audit_log_format(ab, " op=make_equiv old=");
838 audit_log_untrustedstring(ab, old); 846 audit_log_untrustedstring(ab, old);
@@ -857,41 +865,53 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
857 security_release_secctx(ctx, len); 865 security_release_secctx(ctx, len);
858 return -ENOMEM; 866 return -ENOMEM;
859 } 867 }
860 sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); 868 sig_data->uid = audit_sig_uid;
861 sig_data->pid = audit_sig_pid; 869 sig_data->pid = audit_sig_pid;
862 if (audit_sig_sid) { 870 if (audit_sig_sid) {
863 memcpy(sig_data->ctx, ctx, len); 871 memcpy(sig_data->ctx, ctx, len);
864 security_release_secctx(ctx, len); 872 security_release_secctx(ctx, len);
865 } 873 }
866 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, 874 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
867 0, 0, sig_data, sizeof(*sig_data) + len); 875 0, 0, sig_data, sizeof(*sig_data) + len);
868 kfree(sig_data); 876 kfree(sig_data);
869 break; 877 break;
870 case AUDIT_TTY_GET: { 878 case AUDIT_TTY_GET: {
871 struct audit_tty_status s; 879 struct audit_tty_status s;
872 struct task_struct *tsk = current; 880 struct task_struct *tsk;
873 881 unsigned long flags;
874 spin_lock_irq(&tsk->sighand->siglock); 882
875 s.enabled = tsk->signal->audit_tty != 0; 883 rcu_read_lock();
876 spin_unlock_irq(&tsk->sighand->siglock); 884 tsk = find_task_by_vpid(pid);
877 885 if (tsk && lock_task_sighand(tsk, &flags)) {
878 audit_send_reply(NETLINK_CB(skb).portid, seq, 886 s.enabled = tsk->signal->audit_tty != 0;
879 AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); 887 unlock_task_sighand(tsk, &flags);
888 } else
889 err = -ESRCH;
890 rcu_read_unlock();
891
892 if (!err)
893 audit_send_reply(NETLINK_CB(skb).pid, seq,
894 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
880 break; 895 break;
881 } 896 }
882 case AUDIT_TTY_SET: { 897 case AUDIT_TTY_SET: {
883 struct audit_tty_status *s; 898 struct audit_tty_status *s;
884 struct task_struct *tsk = current; 899 struct task_struct *tsk;
900 unsigned long flags;
885 901
886 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 902 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
887 return -EINVAL; 903 return -EINVAL;
888 s = data; 904 s = data;
889 if (s->enabled != 0 && s->enabled != 1) 905 if (s->enabled != 0 && s->enabled != 1)
890 return -EINVAL; 906 return -EINVAL;
891 907 rcu_read_lock();
892 spin_lock_irq(&tsk->sighand->siglock); 908 tsk = find_task_by_vpid(pid);
893 tsk->signal->audit_tty = s->enabled != 0; 909 if (tsk && lock_task_sighand(tsk, &flags)) {
894 spin_unlock_irq(&tsk->sighand->siglock); 910 tsk->signal->audit_tty = s->enabled != 0;
911 unlock_task_sighand(tsk, &flags);
912 } else
913 err = -ESRCH;
914 rcu_read_unlock();
895 break; 915 break;
896 } 916 }
897 default: 917 default:
@@ -941,16 +961,14 @@ static void audit_receive(struct sk_buff *skb)
941static int __init audit_init(void) 961static int __init audit_init(void)
942{ 962{
943 int i; 963 int i;
944 struct netlink_kernel_cfg cfg = {
945 .input = audit_receive,
946 };
947 964
948 if (audit_initialized == AUDIT_DISABLED) 965 if (audit_initialized == AUDIT_DISABLED)
949 return 0; 966 return 0;
950 967
951 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 968 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
952 audit_default ? "enabled" : "disabled"); 969 audit_default ? "enabled" : "disabled");
953 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); 970 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
971 audit_receive, NULL, THIS_MODULE);
954 if (!audit_sock) 972 if (!audit_sock)
955 audit_panic("cannot initialize netlink socket"); 973 audit_panic("cannot initialize netlink socket");
956 else 974 else
@@ -1042,15 +1060,13 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1042 1060
1043 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); 1061 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1044 if (!ab->skb) 1062 if (!ab->skb)
1045 goto err; 1063 goto nlmsg_failure;
1046 1064
1047 nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0); 1065 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
1048 if (!nlh)
1049 goto out_kfree_skb;
1050 1066
1051 return ab; 1067 return ab;
1052 1068
1053out_kfree_skb: 1069nlmsg_failure: /* Used by NLMSG_NEW */
1054 kfree_skb(ab->skb); 1070 kfree_skb(ab->skb);
1055 ab->skb = NULL; 1071 ab->skb = NULL;
1056err: 1072err:
@@ -1101,23 +1117,6 @@ static inline void audit_get_stamp(struct audit_context *ctx,
1101 } 1117 }
1102} 1118}
1103 1119
1104/*
1105 * Wait for auditd to drain the queue a little
1106 */
1107static void wait_for_auditd(unsigned long sleep_time)
1108{
1109 DECLARE_WAITQUEUE(wait, current);
1110 set_current_state(TASK_INTERRUPTIBLE);
1111 add_wait_queue(&audit_backlog_wait, &wait);
1112
1113 if (audit_backlog_limit &&
1114 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1115 schedule_timeout(sleep_time);
1116
1117 __set_current_state(TASK_RUNNING);
1118 remove_wait_queue(&audit_backlog_wait, &wait);
1119}
1120
1121/* Obtain an audit buffer. This routine does locking to obtain the 1120/* Obtain an audit buffer. This routine does locking to obtain the
1122 * audit buffer, but then no locking is required for calls to 1121 * audit buffer, but then no locking is required for calls to
1123 * audit_log_*format. If the tsk is a task that is currently in a 1122 * audit_log_*format. If the tsk is a task that is currently in a
@@ -1163,13 +1162,20 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1163 1162
1164 while (audit_backlog_limit 1163 while (audit_backlog_limit
1165 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { 1164 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
1166 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { 1165 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
1167 unsigned long sleep_time; 1166 && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
1167
1168 /* Wait for auditd to drain the queue a little */
1169 DECLARE_WAITQUEUE(wait, current);
1170 set_current_state(TASK_INTERRUPTIBLE);
1171 add_wait_queue(&audit_backlog_wait, &wait);
1168 1172
1169 sleep_time = timeout_start + audit_backlog_wait_time - 1173 if (audit_backlog_limit &&
1170 jiffies; 1174 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1171 if ((long)sleep_time > 0) 1175 schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
1172 wait_for_auditd(sleep_time); 1176
1177 __set_current_state(TASK_RUNNING);
1178 remove_wait_queue(&audit_backlog_wait, &wait);
1173 continue; 1179 continue;
1174 } 1180 }
1175 if (audit_rate_check() && printk_ratelimit()) 1181 if (audit_rate_check() && printk_ratelimit())
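The audit_log_start() hunk above differs only in where the backlog wait lives (a separate wait_for_auditd() helper versus open-coded in the loop), but both sides rest on the same wraparound-safe deadline arithmetic: sleep only while timeout_start + audit_backlog_wait_time is still ahead of jiffies. A minimal userspace sketch of that check, with a plain counter standing in for jiffies and hypothetical names throughout:

#include <stdio.h>

/* Mirrors the "(long)sleep_time > 0" test in the hunk above: subtract in
 * unsigned arithmetic, then reinterpret the difference as signed so the
 * comparison stays valid even after the tick counter wraps around. */
static long ticks_remaining(unsigned long now, unsigned long start, unsigned long wait)
{
	return (long)(start + wait - now);
}

int main(void)
{
	unsigned long start = 1000, wait = 60;

	printf("%ld\n", ticks_remaining(1020, start, wait));	/* 40: still worth sleeping */
	printf("%ld\n", ticks_remaining(1100, start, wait));	/* -40: deadline already passed */
	return 0;
}

The time_before() call on one side of the hunk is essentially this same signed-difference idiom packaged as a macro.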
@@ -1254,13 +1260,12 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
1254 avail = audit_expand(ab, 1260 avail = audit_expand(ab,
1255 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
1256 if (!avail) 1262 if (!avail)
1257 goto out_va_end; 1263 goto out;
1258 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); 1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
1259 } 1265 }
1266 va_end(args2);
1260 if (len > 0) 1267 if (len > 0)
1261 skb_put(skb, len); 1268 skb_put(skb, len);
1262out_va_end:
1263 va_end(args2);
1264out: 1269out:
1265 return; 1270 return;
1266} 1271}
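The audit_log_vformat() hunk above is the usual vsnprintf retry pattern: format once, and if the buffer was too small, grow it and format again from a va_copy of the argument list, making sure that copy is va_end()ed on every exit path (which is what the out_va_end label on one side is for). A self-contained userspace sketch of the same pattern, with made-up names:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Format into a freshly allocated buffer; the second va_list copy exists
 * only for the second vsnprintf() pass and is ended exactly once. */
static char *format_alloc(const char *fmt, ...)
{
	va_list args, args2;
	char *buf;
	int len;

	va_start(args, fmt);
	va_copy(args2, args);
	len = vsnprintf(NULL, 0, fmt, args);	/* first pass: measure only */
	va_end(args);

	if (len < 0) {
		va_end(args2);
		return NULL;
	}

	buf = malloc(len + 1);
	if (buf)
		vsnprintf(buf, len + 1, fmt, args2);
	va_end(args2);				/* ended even if malloc() failed */
	return buf;
}

int main(void)
{
	char *s = format_alloc("pid=%d comm=%s", 1234, "auditd");

	if (s) {
		puts(s);
		free(s);
	}
	return 0;
}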
@@ -1412,12 +1417,12 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
1412 1417
1413/* This is a helper-function to print the escaped d_path */ 1418/* This is a helper-function to print the escaped d_path */
1414void audit_log_d_path(struct audit_buffer *ab, const char *prefix, 1419void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1415 const struct path *path) 1420 struct path *path)
1416{ 1421{
1417 char *p, *pathname; 1422 char *p, *pathname;
1418 1423
1419 if (prefix) 1424 if (prefix)
1420 audit_log_format(ab, "%s", prefix); 1425 audit_log_format(ab, " %s", prefix);
1421 1426
1422 /* We will allow 11 spaces for ' (deleted)' to be appended */ 1427 /* We will allow 11 spaces for ' (deleted)' to be appended */
1423 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); 1428 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
@@ -1444,29 +1449,6 @@ void audit_log_key(struct audit_buffer *ab, char *key)
1444} 1449}
1445 1450
1446/** 1451/**
1447 * audit_log_link_denied - report a link restriction denial
1448 * @operation: specific link operation
1449 * @link: the path that triggered the restriction
1450 */
1451void audit_log_link_denied(const char *operation, struct path *link)
1452{
1453 struct audit_buffer *ab;
1454
1455 ab = audit_log_start(current->audit_context, GFP_KERNEL,
1456 AUDIT_ANOM_LINK);
1457 if (!ab)
1458 return;
1459 audit_log_format(ab, "op=%s action=denied", operation);
1460 audit_log_format(ab, " pid=%d comm=", current->pid);
1461 audit_log_untrustedstring(ab, current->comm);
1462 audit_log_d_path(ab, " path=", link);
1463 audit_log_format(ab, " dev=");
1464 audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id);
1465 audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino);
1466 audit_log_end(ab);
1467}
1468
1469/**
1470 * audit_log_end - end one audit record 1452 * audit_log_end - end one audit record
1471 * @ab: the audit_buffer 1453 * @ab: the audit_buffer
1472 * 1454 *
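The rename between NETLINK_CB(skb).portid and NETLINK_CB(skb).pid that runs through the audit.c hunks refers to the same value userspace sees as nl_pid on its own netlink socket: the kernel records the sender's netlink address so audit_send_reply() can answer it. A small userspace sketch of where that number comes from (creating and binding the socket needs no audit capability; actually issuing audit commands does):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>

int main(void)
{
	struct sockaddr_nl sa;
	socklen_t len = sizeof(sa);
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;		/* nl_pid left 0: kernel assigns a port id */

	if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0 &&
	    getsockname(fd, (struct sockaddr *)&sa, &len) == 0)
		printf("our netlink port id: %u\n", sa.nl_pid);

	close(fd);
	return 0;
}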
diff --git a/kernel/audit.h b/kernel/audit.h
index d51cba868e1..91e7071c4d2 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -36,8 +36,12 @@ enum audit_state {
36 AUDIT_DISABLED, /* Do not create per-task audit_context. 36 AUDIT_DISABLED, /* Do not create per-task audit_context.
37 * No syscall-specific audit records can 37 * No syscall-specific audit records can
38 * be generated. */ 38 * be generated. */
39 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
40 * but don't necessarily fill it in at
41 * syscall entry time (i.e., filter
42 * instead). */
39 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, 43 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
40 * and fill it in at syscall 44 * and always fill it in at syscall
41 * entry time. This makes a full 45 * entry time. This makes a full
42 * syscall record available if some 46 * syscall record available if some
43 * other part of the kernel decides it 47 * other part of the kernel decides it
@@ -74,15 +78,10 @@ static inline int audit_hash_ino(u32 ino)
74 return (ino & (AUDIT_INODE_BUCKETS-1)); 78 return (ino & (AUDIT_INODE_BUCKETS-1));
75} 79}
76 80
77/* Indicates that audit should log the full pathname. */
78#define AUDIT_NAME_FULL -1
79
80extern int audit_match_class(int class, unsigned syscall); 81extern int audit_match_class(int class, unsigned syscall);
81extern int audit_comparator(const u32 left, const u32 op, const u32 right); 82extern int audit_comparator(const u32 left, const u32 op, const u32 right);
82extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); 83extern int audit_compare_dname_path(const char *dname, const char *path,
83extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); 84 int *dirlen);
84extern int parent_len(const char *path);
85extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
86extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 85extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
87 int done, int multi, 86 int done, int multi,
88 const void *payload, int size); 87 const void *payload, int size);
@@ -149,7 +148,7 @@ extern void audit_kill_trees(struct list_head *);
149extern char *audit_unpack_string(void **, size_t *, size_t); 148extern char *audit_unpack_string(void **, size_t *, size_t);
150 149
151extern pid_t audit_sig_pid; 150extern pid_t audit_sig_pid;
152extern kuid_t audit_sig_uid; 151extern uid_t audit_sig_uid;
153extern u32 audit_sig_sid; 152extern u32 audit_sig_sid;
154 153
155#ifdef CONFIG_AUDITSYSCALL 154#ifdef CONFIG_AUDITSYSCALL
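The audit.h hunk is where the two sides disagree on user-namespace-aware types: kuid_t for audit_sig_uid plus dedicated audit_uid_comparator()/audit_gid_comparator() helpers on one side, plain uid_t on the other. The point of the wrapper type, visible in the make_kuid()/from_kuid()/uid_eq() calls elsewhere in this diff, is that raw integers can no longer be assigned or compared by accident. A rough userspace illustration of the idea; k_uid, make_k() and from_k() are invented stand-ins, not kernel API:

#include <stdbool.h>
#include <stdio.h>

typedef struct { unsigned int val; } k_uid;	/* stand-in for kuid_t */

static k_uid make_k(unsigned int userspace_uid) { return (k_uid){ userspace_uid }; }
static unsigned int from_k(k_uid id)            { return id.val; }
static bool k_eq(k_uid a, k_uid b)              { return a.val == b.val; }

int main(void)
{
	k_uid a = make_k(1000), b = make_k(1000);

	printf("equal=%d raw=%u\n", k_eq(a, b), from_k(a));
	/* a == 1000 or a + 1 would not compile: the struct blocks raw integer
	 * comparisons, forcing every conversion through the helpers. */
	return 0;
}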
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 642a89c4f3d..5bf0790497e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -249,7 +249,8 @@ static void untag_chunk(struct node *p)
249 list_del_rcu(&chunk->hash); 249 list_del_rcu(&chunk->hash);
250 spin_unlock(&hash_lock); 250 spin_unlock(&hash_lock);
251 spin_unlock(&entry->lock); 251 spin_unlock(&entry->lock);
252 fsnotify_destroy_mark(entry, audit_tree_group); 252 fsnotify_destroy_mark(entry);
253 fsnotify_put_mark(entry);
253 goto out; 254 goto out;
254 } 255 }
255 256
@@ -258,7 +259,7 @@ static void untag_chunk(struct node *p)
258 259
259 fsnotify_duplicate_mark(&new->mark, entry); 260 fsnotify_duplicate_mark(&new->mark, entry);
260 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
261 fsnotify_put_mark(&new->mark); 262 free_chunk(new);
262 goto Fallback; 263 goto Fallback;
263 } 264 }
264 265
@@ -291,8 +292,8 @@ static void untag_chunk(struct node *p)
291 owner->root = new; 292 owner->root = new;
292 spin_unlock(&hash_lock); 293 spin_unlock(&hash_lock);
293 spin_unlock(&entry->lock); 294 spin_unlock(&entry->lock);
294 fsnotify_destroy_mark(entry, audit_tree_group); 295 fsnotify_destroy_mark(entry);
295 fsnotify_put_mark(&new->mark); /* drop initial reference */ 296 fsnotify_put_mark(entry);
296 goto out; 297 goto out;
297 298
298Fallback: 299Fallback:
@@ -321,7 +322,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
321 322
322 entry = &chunk->mark; 323 entry = &chunk->mark;
323 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { 324 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
324 fsnotify_put_mark(entry); 325 free_chunk(chunk);
325 return -ENOSPC; 326 return -ENOSPC;
326 } 327 }
327 328
@@ -331,7 +332,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
331 spin_unlock(&hash_lock); 332 spin_unlock(&hash_lock);
332 chunk->dead = 1; 333 chunk->dead = 1;
333 spin_unlock(&entry->lock); 334 spin_unlock(&entry->lock);
334 fsnotify_destroy_mark(entry, audit_tree_group); 335 fsnotify_destroy_mark(entry);
335 fsnotify_put_mark(entry); 336 fsnotify_put_mark(entry);
336 return 0; 337 return 0;
337 } 338 }
@@ -346,7 +347,6 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
346 insert_hash(chunk); 347 insert_hash(chunk);
347 spin_unlock(&hash_lock); 348 spin_unlock(&hash_lock);
348 spin_unlock(&entry->lock); 349 spin_unlock(&entry->lock);
349 fsnotify_put_mark(entry); /* drop initial reference */
350 return 0; 350 return 0;
351} 351}
352 352
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
396 fsnotify_duplicate_mark(chunk_entry, old_entry); 396 fsnotify_duplicate_mark(chunk_entry, old_entry);
397 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { 397 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
398 spin_unlock(&old_entry->lock); 398 spin_unlock(&old_entry->lock);
399 fsnotify_put_mark(chunk_entry); 399 free_chunk(chunk);
400 fsnotify_put_mark(old_entry); 400 fsnotify_put_mark(old_entry);
401 return -ENOSPC; 401 return -ENOSPC;
402 } 402 }
@@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
412 spin_unlock(&chunk_entry->lock); 412 spin_unlock(&chunk_entry->lock);
413 spin_unlock(&old_entry->lock); 413 spin_unlock(&old_entry->lock);
414 414
415 fsnotify_destroy_mark(chunk_entry, audit_tree_group); 415 fsnotify_destroy_mark(chunk_entry);
416 416
417 fsnotify_put_mark(chunk_entry); 417 fsnotify_put_mark(chunk_entry);
418 fsnotify_put_mark(old_entry); 418 fsnotify_put_mark(old_entry);
@@ -443,32 +443,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
443 spin_unlock(&hash_lock); 443 spin_unlock(&hash_lock);
444 spin_unlock(&chunk_entry->lock); 444 spin_unlock(&chunk_entry->lock);
445 spin_unlock(&old_entry->lock); 445 spin_unlock(&old_entry->lock);
446 fsnotify_destroy_mark(old_entry, audit_tree_group); 446 fsnotify_destroy_mark(old_entry);
447 fsnotify_put_mark(chunk_entry); /* drop initial reference */
448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ 447 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
448 fsnotify_put_mark(old_entry); /* and kill it */
449 return 0; 449 return 0;
450} 450}
451 451
452static void audit_log_remove_rule(struct audit_krule *rule)
453{
454 struct audit_buffer *ab;
455
456 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
457 if (unlikely(!ab))
458 return;
459 audit_log_format(ab, "op=");
460 audit_log_string(ab, "remove rule");
461 audit_log_format(ab, " dir=");
462 audit_log_untrustedstring(ab, rule->tree->pathname);
463 audit_log_key(ab, rule->filterkey);
464 audit_log_format(ab, " list=%d res=1", rule->listnr);
465 audit_log_end(ab);
466}
467
468static void kill_rules(struct audit_tree *tree) 452static void kill_rules(struct audit_tree *tree)
469{ 453{
470 struct audit_krule *rule, *next; 454 struct audit_krule *rule, *next;
471 struct audit_entry *entry; 455 struct audit_entry *entry;
456 struct audit_buffer *ab;
472 457
473 list_for_each_entry_safe(rule, next, &tree->rules, rlist) { 458 list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
474 entry = container_of(rule, struct audit_entry, rule); 459 entry = container_of(rule, struct audit_entry, rule);
@@ -476,7 +461,14 @@ static void kill_rules(struct audit_tree *tree)
476 list_del_init(&rule->rlist); 461 list_del_init(&rule->rlist);
477 if (rule->tree) { 462 if (rule->tree) {
478 /* not a half-baked one */ 463 /* not a half-baked one */
479 audit_log_remove_rule(rule); 464 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
465 audit_log_format(ab, "op=");
466 audit_log_string(ab, "remove rule");
467 audit_log_format(ab, " dir=");
468 audit_log_untrustedstring(ab, rule->tree->pathname);
469 audit_log_key(ab, rule->filterkey);
470 audit_log_format(ab, " list=%d res=1", rule->listnr);
471 audit_log_end(ab);
480 rule->tree = NULL; 472 rule->tree = NULL;
481 list_del_rcu(&entry->list); 473 list_del_rcu(&entry->list);
482 list_del(&entry->rule.list); 474 list_del(&entry->rule.list);
@@ -603,7 +595,7 @@ void audit_trim_trees(void)
603 595
604 root_mnt = collect_mounts(&path); 596 root_mnt = collect_mounts(&path);
605 path_put(&path); 597 path_put(&path);
606 if (IS_ERR(root_mnt)) 598 if (!root_mnt)
607 goto skip_it; 599 goto skip_it;
608 600
609 spin_lock(&hash_lock); 601 spin_lock(&hash_lock);
@@ -677,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
677 goto Err; 669 goto Err;
678 mnt = collect_mounts(&path); 670 mnt = collect_mounts(&path);
679 path_put(&path); 671 path_put(&path);
680 if (IS_ERR(mnt)) { 672 if (!mnt) {
681 err = PTR_ERR(mnt); 673 err = -ENOMEM;
682 goto Err; 674 goto Err;
683 } 675 }
684 676
@@ -727,8 +719,8 @@ int audit_tag_tree(char *old, char *new)
727 return err; 719 return err;
728 tagged = collect_mounts(&path2); 720 tagged = collect_mounts(&path2);
729 path_put(&path2); 721 path_put(&path2);
730 if (IS_ERR(tagged)) 722 if (!tagged)
731 return PTR_ERR(tagged); 723 return -ENOMEM;
732 724
733 err = kern_path(old, 0, &path1); 725 err = kern_path(old, 0, &path1);
734 if (err) { 726 if (err) {
@@ -924,12 +916,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
924 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); 916 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
925 917
926 evict_chunk(chunk); 918 evict_chunk(chunk);
927 919 fsnotify_put_mark(entry);
928 /*
929 * We are guaranteed to have at least one reference to the mark from
930 * either the inode or the caller of fsnotify_destroy_mark().
931 */
932 BUG_ON(atomic_read(&entry->refcnt) < 1);
933} 920}
934 921
935static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, 922static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
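Most of the audit_tree.c delta is fsnotify mark reference counting, but the collect_mounts() hunks also switch between two error conventions: a plain NULL on failure versus an errno encoded in the returned pointer and tested with IS_ERR()/PTR_ERR(). A compact userspace rendering of the encoded-pointer convention, assuming (as the kernel does on its supported targets) that a pointer can hold a long:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long err)    { return (void *)err; }
static inline long ptr_err(const void *p) { return (long)p; }
static inline int is_err(const void *p)
{
	/* Errors occupy the top MAX_ERRNO values of the address space,
	 * which no valid object pointer ever uses. */
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *collect(int simulate_failure)
{
	static int mount_object;
	return simulate_failure ? err_ptr(-ENOMEM) : (void *)&mount_object;
}

int main(void)
{
	void *m = collect(1);

	if (is_err(m))
		printf("collect failed: %ld\n", ptr_err(m));	/* prints -12 (ENOMEM) */
	return 0;
}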
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 22831c4d369..e683869365d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -240,10 +240,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
240 if (audit_enabled) { 240 if (audit_enabled) {
241 struct audit_buffer *ab; 241 struct audit_buffer *ab;
242 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); 242 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
243 if (unlikely(!ab))
244 return;
245 audit_log_format(ab, "auid=%u ses=%u op=", 243 audit_log_format(ab, "auid=%u ses=%u op=",
246 from_kuid(&init_user_ns, audit_get_loginuid(current)), 244 audit_get_loginuid(current),
247 audit_get_sessionid(current)); 245 audit_get_sessionid(current));
248 audit_log_string(ab, op); 246 audit_log_string(ab, op);
249 audit_log_format(ab, " path="); 247 audit_log_format(ab, " path=");
@@ -267,8 +265,7 @@ static void audit_update_watch(struct audit_parent *parent,
267 /* Run all of the watches on this parent looking for the one that 265 /* Run all of the watches on this parent looking for the one that
268 * matches the given dname */ 266 * matches the given dname */
269 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { 267 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
270 if (audit_compare_dname_path(dname, owatch->path, 268 if (audit_compare_dname_path(dname, owatch->path, NULL))
271 AUDIT_NAME_FULL))
272 continue; 269 continue;
273 270
274 /* If the update involves invalidating rules, do the inode-based 271 /* If the update involves invalidating rules, do the inode-based
@@ -352,21 +349,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
352 } 349 }
353 mutex_unlock(&audit_filter_mutex); 350 mutex_unlock(&audit_filter_mutex);
354 351
355 fsnotify_destroy_mark(&parent->mark, audit_watch_group); 352 fsnotify_destroy_mark(&parent->mark);
356} 353}
357 354
358/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
359static int audit_get_nd(struct audit_watch *watch, struct path *parent) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
360{ 357{
361 struct dentry *d = kern_path_locked(watch->path, parent); 358 struct nameidata nd;
362 if (IS_ERR(d)) 359 struct dentry *d;
360 int err;
361
362 err = kern_path_parent(watch->path, &nd);
363 if (err)
364 return err;
365
366 if (nd.last_type != LAST_NORM) {
367 path_put(&nd.path);
368 return -EINVAL;
369 }
370
371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 if (IS_ERR(d)) {
374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 path_put(&nd.path);
363 return PTR_ERR(d); 376 return PTR_ERR(d);
364 mutex_unlock(&parent->dentry->d_inode->i_mutex); 377 }
365 if (d->d_inode) { 378 if (d->d_inode) {
366 /* update watch filter fields */ 379 /* update watch filter fields */
367 watch->dev = d->d_inode->i_sb->s_dev; 380 watch->dev = d->d_inode->i_sb->s_dev;
368 watch->ino = d->d_inode->i_ino; 381 watch->ino = d->d_inode->i_ino;
369 } 382 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
384
385 *parent = nd.path;
370 dput(d); 386 dput(d);
371 return 0; 387 return 0;
372} 388}
@@ -459,7 +475,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
459 475
460 if (list_empty(&parent->watches)) { 476 if (list_empty(&parent->watches)) {
461 audit_get_parent(parent); 477 audit_get_parent(parent);
462 fsnotify_destroy_mark(&parent->mark, audit_watch_group); 478 fsnotify_destroy_mark(&parent->mark);
463 audit_put_parent(parent); 479 audit_put_parent(parent);
464 } 480 }
465 } 481 }
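Both audit_get_nd() variants above do the same conceptual job: turn the watch path into a held parent directory plus the final component to watch, either via kern_path_locked() or via kern_path_parent() and lookup_one_len(). Stripped of the VFS and locking, the split is just "everything before the last slash" and "what follows it". A simplified userspace sketch (no trailing-slash handling; the helper name is made up):

#include <stdio.h>
#include <string.h>

static void split_watch_path(const char *path, char *parent, size_t plen,
			     const char **name)
{
	const char *slash = strrchr(path, '/');

	if (!slash) {				/* no directory part at all */
		snprintf(parent, plen, ".");
		*name = path;
		return;
	}
	snprintf(parent, plen, "%.*s",
		 (int)(slash == path ? 1 : slash - path), path);
	*name = slash + 1;
}

int main(void)
{
	char parent[256];
	const char *name;

	split_watch_path("/etc/passwd", parent, sizeof(parent), &name);
	printf("parent=%s name=%s\n", parent, name);	/* parent=/etc name=passwd */
	return 0;
}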
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f9fc54bbe06..f8277c80d67 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -235,15 +235,13 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
235 switch(listnr) { 235 switch(listnr) {
236 default: 236 default:
237 goto exit_err; 237 goto exit_err;
238 case AUDIT_FILTER_USER:
239 case AUDIT_FILTER_TYPE:
238#ifdef CONFIG_AUDITSYSCALL 240#ifdef CONFIG_AUDITSYSCALL
239 case AUDIT_FILTER_ENTRY: 241 case AUDIT_FILTER_ENTRY:
240 if (rule->action == AUDIT_ALWAYS)
241 goto exit_err;
242 case AUDIT_FILTER_EXIT: 242 case AUDIT_FILTER_EXIT:
243 case AUDIT_FILTER_TASK: 243 case AUDIT_FILTER_TASK:
244#endif 244#endif
245 case AUDIT_FILTER_USER:
246 case AUDIT_FILTER_TYPE:
247 ; 245 ;
248 } 246 }
249 if (unlikely(rule->action == AUDIT_POSSIBLE)) { 247 if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -342,8 +340,6 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
342 340
343 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 341 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
344 f->val = rule->values[i]; 342 f->val = rule->values[i];
345 f->uid = INVALID_UID;
346 f->gid = INVALID_GID;
347 343
348 err = -EINVAL; 344 err = -EINVAL;
349 if (f->op == Audit_bad) 345 if (f->op == Audit_bad)
@@ -352,32 +348,16 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
352 switch(f->type) { 348 switch(f->type) {
353 default: 349 default:
354 goto exit_free; 350 goto exit_free;
351 case AUDIT_PID:
355 case AUDIT_UID: 352 case AUDIT_UID:
356 case AUDIT_EUID: 353 case AUDIT_EUID:
357 case AUDIT_SUID: 354 case AUDIT_SUID:
358 case AUDIT_FSUID: 355 case AUDIT_FSUID:
359 case AUDIT_LOGINUID:
360 /* bit ops not implemented for uid comparisons */
361 if (f->op == Audit_bitmask || f->op == Audit_bittest)
362 goto exit_free;
363
364 f->uid = make_kuid(current_user_ns(), f->val);
365 if (!uid_valid(f->uid))
366 goto exit_free;
367 break;
368 case AUDIT_GID: 356 case AUDIT_GID:
369 case AUDIT_EGID: 357 case AUDIT_EGID:
370 case AUDIT_SGID: 358 case AUDIT_SGID:
371 case AUDIT_FSGID: 359 case AUDIT_FSGID:
372 /* bit ops not implemented for gid comparisons */ 360 case AUDIT_LOGINUID:
373 if (f->op == Audit_bitmask || f->op == Audit_bittest)
374 goto exit_free;
375
376 f->gid = make_kgid(current_user_ns(), f->val);
377 if (!gid_valid(f->gid))
378 goto exit_free;
379 break;
380 case AUDIT_PID:
381 case AUDIT_PERS: 361 case AUDIT_PERS:
382 case AUDIT_MSGTYPE: 362 case AUDIT_MSGTYPE:
383 case AUDIT_PPID: 363 case AUDIT_PPID:
@@ -405,7 +385,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
405 goto exit_free; 385 goto exit_free;
406 break; 386 break;
407 case AUDIT_FILETYPE: 387 case AUDIT_FILETYPE:
408 if (f->val & ~S_IFMT) 388 if ((f->val & ~S_IFMT) > S_IFMT)
409 goto exit_free; 389 goto exit_free;
410 break; 390 break;
411 case AUDIT_INODE: 391 case AUDIT_INODE:
@@ -455,39 +435,19 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
455 435
456 f->type = data->fields[i]; 436 f->type = data->fields[i];
457 f->val = data->values[i]; 437 f->val = data->values[i];
458 f->uid = INVALID_UID;
459 f->gid = INVALID_GID;
460 f->lsm_str = NULL; 438 f->lsm_str = NULL;
461 f->lsm_rule = NULL; 439 f->lsm_rule = NULL;
462 switch(f->type) { 440 switch(f->type) {
441 case AUDIT_PID:
463 case AUDIT_UID: 442 case AUDIT_UID:
464 case AUDIT_EUID: 443 case AUDIT_EUID:
465 case AUDIT_SUID: 444 case AUDIT_SUID:
466 case AUDIT_FSUID: 445 case AUDIT_FSUID:
467 case AUDIT_LOGINUID:
468 case AUDIT_OBJ_UID:
469 /* bit ops not implemented for uid comparisons */
470 if (f->op == Audit_bitmask || f->op == Audit_bittest)
471 goto exit_free;
472
473 f->uid = make_kuid(current_user_ns(), f->val);
474 if (!uid_valid(f->uid))
475 goto exit_free;
476 break;
477 case AUDIT_GID: 446 case AUDIT_GID:
478 case AUDIT_EGID: 447 case AUDIT_EGID:
479 case AUDIT_SGID: 448 case AUDIT_SGID:
480 case AUDIT_FSGID: 449 case AUDIT_FSGID:
481 case AUDIT_OBJ_GID: 450 case AUDIT_LOGINUID:
482 /* bit ops not implemented for gid comparisons */
483 if (f->op == Audit_bitmask || f->op == Audit_bittest)
484 goto exit_free;
485
486 f->gid = make_kgid(current_user_ns(), f->val);
487 if (!gid_valid(f->gid))
488 goto exit_free;
489 break;
490 case AUDIT_PID:
491 case AUDIT_PERS: 451 case AUDIT_PERS:
492 case AUDIT_MSGTYPE: 452 case AUDIT_MSGTYPE:
493 case AUDIT_PPID: 453 case AUDIT_PPID:
@@ -562,6 +522,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
562 goto exit_free; 522 goto exit_free;
563 break; 523 break;
564 case AUDIT_FILTERKEY: 524 case AUDIT_FILTERKEY:
525 err = -EINVAL;
565 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) 526 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
566 goto exit_free; 527 goto exit_free;
567 str = audit_unpack_string(&bufp, &remain, f->val); 528 str = audit_unpack_string(&bufp, &remain, f->val);
@@ -575,11 +536,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
575 goto exit_free; 536 goto exit_free;
576 break; 537 break;
577 case AUDIT_FILETYPE: 538 case AUDIT_FILETYPE:
578 if (f->val & ~S_IFMT) 539 if ((f->val & ~S_IFMT) > S_IFMT)
579 goto exit_free;
580 break;
581 case AUDIT_FIELD_COMPARE:
582 if (f->val > AUDIT_MAX_FIELD_COMPARE)
583 goto exit_free; 540 goto exit_free;
584 break; 541 break;
585 default: 542 default:
@@ -743,23 +700,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
743 if (strcmp(a->filterkey, b->filterkey)) 700 if (strcmp(a->filterkey, b->filterkey))
744 return 1; 701 return 1;
745 break; 702 break;
746 case AUDIT_UID:
747 case AUDIT_EUID:
748 case AUDIT_SUID:
749 case AUDIT_FSUID:
750 case AUDIT_LOGINUID:
751 case AUDIT_OBJ_UID:
752 if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
753 return 1;
754 break;
755 case AUDIT_GID:
756 case AUDIT_EGID:
757 case AUDIT_SGID:
758 case AUDIT_FSGID:
759 case AUDIT_OBJ_GID:
760 if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
761 return 1;
762 break;
763 default: 703 default:
764 if (a->fields[i].val != b->fields[i].val) 704 if (a->fields[i].val != b->fields[i].val)
765 return 1; 705 return 1;
@@ -1109,7 +1049,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1109} 1049}
1110 1050
1111/* Log rule additions and removals */ 1051/* Log rule additions and removals */
1112static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, 1052static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1113 char *action, struct audit_krule *rule, 1053 char *action, struct audit_krule *rule,
1114 int res) 1054 int res)
1115{ 1055{
@@ -1121,8 +1061,7 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1121 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 1061 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1122 if (!ab) 1062 if (!ab)
1123 return; 1063 return;
1124 audit_log_format(ab, "auid=%u ses=%u", 1064 audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
1125 from_kuid(&init_user_ns, loginuid), sessionid);
1126 if (sid) { 1065 if (sid) {
1127 char *ctx = NULL; 1066 char *ctx = NULL;
1128 u32 len; 1067 u32 len;
@@ -1144,6 +1083,7 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1144 * audit_receive_filter - apply all rules to the specified message type 1083 * audit_receive_filter - apply all rules to the specified message type
1145 * @type: audit message type 1084 * @type: audit message type
1146 * @pid: target pid for netlink audit messages 1085 * @pid: target pid for netlink audit messages
1086 * @uid: target uid for netlink audit messages
1147 * @seq: netlink audit message sequence (serial) number 1087 * @seq: netlink audit message sequence (serial) number
1148 * @data: payload data 1088 * @data: payload data
1149 * @datasz: size of payload data 1089 * @datasz: size of payload data
@@ -1151,8 +1091,8 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1151 * @sessionid: sessionid for netlink audit message 1091 * @sessionid: sessionid for netlink audit message
1152 * @sid: SE Linux Security ID of sender 1092 * @sid: SE Linux Security ID of sender
1153 */ 1093 */
1154int audit_receive_filter(int type, int pid, int seq, void *data, 1094int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1155 size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) 1095 size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
1156{ 1096{
1157 struct task_struct *tsk; 1097 struct task_struct *tsk;
1158 struct audit_netlink_list *dest; 1098 struct audit_netlink_list *dest;
@@ -1251,110 +1191,46 @@ int audit_comparator(u32 left, u32 op, u32 right)
1251 } 1191 }
1252} 1192}
1253 1193
1254int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) 1194/* Compare given dentry name with last component in given path,
1195 * return of 0 indicates a match. */
1196int audit_compare_dname_path(const char *dname, const char *path,
1197 int *dirlen)
1255{ 1198{
1256 switch (op) { 1199 int dlen, plen;
1257 case Audit_equal:
1258 return uid_eq(left, right);
1259 case Audit_not_equal:
1260 return !uid_eq(left, right);
1261 case Audit_lt:
1262 return uid_lt(left, right);
1263 case Audit_le:
1264 return uid_lte(left, right);
1265 case Audit_gt:
1266 return uid_gt(left, right);
1267 case Audit_ge:
1268 return uid_gte(left, right);
1269 case Audit_bitmask:
1270 case Audit_bittest:
1271 default:
1272 BUG();
1273 return 0;
1274 }
1275}
1276
1277int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
1278{
1279 switch (op) {
1280 case Audit_equal:
1281 return gid_eq(left, right);
1282 case Audit_not_equal:
1283 return !gid_eq(left, right);
1284 case Audit_lt:
1285 return gid_lt(left, right);
1286 case Audit_le:
1287 return gid_lte(left, right);
1288 case Audit_gt:
1289 return gid_gt(left, right);
1290 case Audit_ge:
1291 return gid_gte(left, right);
1292 case Audit_bitmask:
1293 case Audit_bittest:
1294 default:
1295 BUG();
1296 return 0;
1297 }
1298}
1299
1300/**
1301 * parent_len - find the length of the parent portion of a pathname
1302 * @path: pathname of which to determine length
1303 */
1304int parent_len(const char *path)
1305{
1306 int plen;
1307 const char *p; 1200 const char *p;
1308 1201
1309 plen = strlen(path); 1202 if (!dname || !path)
1203 return 1;
1310 1204
1311 if (plen == 0) 1205 dlen = strlen(dname);
1312 return plen; 1206 plen = strlen(path);
1207 if (plen < dlen)
1208 return 1;
1313 1209
1314 /* disregard trailing slashes */ 1210 /* disregard trailing slashes */
1315 p = path + plen - 1; 1211 p = path + plen - 1;
1316 while ((*p == '/') && (p > path)) 1212 while ((*p == '/') && (p > path))
1317 p--; 1213 p--;
1318 1214
1319 /* walk backward until we find the next slash or hit beginning */ 1215 /* find last path component */
1320 while ((*p != '/') && (p > path)) 1216 p = p - dlen + 1;
1321 p--; 1217 if (p < path)
1322
1323 /* did we find a slash? Then increment to include it in path */
1324 if (*p == '/')
1325 p++;
1326
1327 return p - path;
1328}
1329
1330/**
1331 * audit_compare_dname_path - compare given dentry name with last component in
1332 * given path. Return of 0 indicates a match.
1333 * @dname: dentry name that we're comparing
1334 * @path: full pathname that we're comparing
1335 * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
1336 * here indicates that we must compute this value.
1337 */
1338int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
1339{
1340 int dlen, pathlen;
1341 const char *p;
1342
1343 dlen = strlen(dname);
1344 pathlen = strlen(path);
1345 if (pathlen < dlen)
1346 return 1; 1218 return 1;
1219 else if (p > path) {
1220 if (*--p != '/')
1221 return 1;
1222 else
1223 p++;
1224 }
1347 1225
1348 parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen; 1226 /* return length of path's directory component */
1349 if (pathlen - parentlen != dlen) 1227 if (dirlen)
1350 return 1; 1228 *dirlen = p - path;
1351
1352 p = path + parentlen;
1353
1354 return strncmp(p, dname, dlen); 1229 return strncmp(p, dname, dlen);
1355} 1230}
1356 1231
1357static int audit_filter_user_rules(struct audit_krule *rule, 1232static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1233 struct audit_krule *rule,
1358 enum audit_state *state) 1234 enum audit_state *state)
1359{ 1235{
1360 int i; 1236 int i;
@@ -1366,17 +1242,17 @@ static int audit_filter_user_rules(struct audit_krule *rule,
1366 1242
1367 switch (f->type) { 1243 switch (f->type) {
1368 case AUDIT_PID: 1244 case AUDIT_PID:
1369 result = audit_comparator(task_pid_vnr(current), f->op, f->val); 1245 result = audit_comparator(cb->creds.pid, f->op, f->val);
1370 break; 1246 break;
1371 case AUDIT_UID: 1247 case AUDIT_UID:
1372 result = audit_uid_comparator(current_uid(), f->op, f->uid); 1248 result = audit_comparator(cb->creds.uid, f->op, f->val);
1373 break; 1249 break;
1374 case AUDIT_GID: 1250 case AUDIT_GID:
1375 result = audit_gid_comparator(current_gid(), f->op, f->gid); 1251 result = audit_comparator(cb->creds.gid, f->op, f->val);
1376 break; 1252 break;
1377 case AUDIT_LOGINUID: 1253 case AUDIT_LOGINUID:
1378 result = audit_uid_comparator(audit_get_loginuid(current), 1254 result = audit_comparator(audit_get_loginuid(current),
1379 f->op, f->uid); 1255 f->op, f->val);
1380 break; 1256 break;
1381 case AUDIT_SUBJ_USER: 1257 case AUDIT_SUBJ_USER:
1382 case AUDIT_SUBJ_ROLE: 1258 case AUDIT_SUBJ_ROLE:
@@ -1404,7 +1280,7 @@ static int audit_filter_user_rules(struct audit_krule *rule,
1404 return 1; 1280 return 1;
1405} 1281}
1406 1282
1407int audit_filter_user(void) 1283int audit_filter_user(struct netlink_skb_parms *cb)
1408{ 1284{
1409 enum audit_state state = AUDIT_DISABLED; 1285 enum audit_state state = AUDIT_DISABLED;
1410 struct audit_entry *e; 1286 struct audit_entry *e;
@@ -1412,7 +1288,7 @@ int audit_filter_user(void)
1412 1288
1413 rcu_read_lock(); 1289 rcu_read_lock();
1414 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { 1290 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
1415 if (audit_filter_user_rules(&e->rule, &state)) { 1291 if (audit_filter_user_rules(cb, &e->rule, &state)) {
1416 if (state == AUDIT_DISABLED) 1292 if (state == AUDIT_DISABLED)
1417 ret = 0; 1293 ret = 0;
1418 break; 1294 break;
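The largest auditfilter.c hunk carries both shapes of the dname-versus-last-path-component helper: parent_len() plus audit_compare_dname_path(..., parentlen) on one side, the single audit_compare_dname_path(dname, path, &dirlen) on the other. Since the full algorithm is visible above, here is a self-contained userspace rendering of the parent_len() flavour that can be compiled and exercised directly; it follows the logic shown in the diff rather than any exported kernel interface:

#include <stdio.h>
#include <string.h>

/* Length of the parent portion of a pathname (trailing slashes ignored). */
static int parent_len(const char *path)
{
	int plen = strlen(path);
	const char *p;

	if (plen == 0)
		return plen;

	p = path + plen - 1;
	while ((*p == '/') && (p > path))	/* disregard trailing slashes */
		p--;
	while ((*p != '/') && (p > path))	/* back up to the previous slash */
		p--;
	if (*p == '/')				/* include the slash itself */
		p++;
	return p - path;
}

/* 0 means dname matches the last component of path. */
static int compare_dname_path(const char *dname, const char *path)
{
	int dlen = strlen(dname), plen = strlen(path);
	int parentlen = parent_len(path);

	if (plen < dlen || plen - parentlen != dlen)
		return 1;
	return strncmp(path + parentlen, dname, dlen);
}

int main(void)
{
	printf("%d\n", compare_dname_path("passwd", "/etc/passwd"));	/* 0: match */
	printf("%d\n", compare_dname_path("shadow", "/etc/passwd"));	/* non-zero */
	return 0;
}

A zero return is what audit_update_watch() in the audit_watch.c hunk relies on when deciding which watch on a parent matches the changed dentry.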
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a371f857a0a..ce4b054acee 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -48,7 +48,7 @@
48#include <linux/fs.h> 48#include <linux/fs.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/export.h> 51#include <linux/module.h>
52#include <linux/slab.h> 52#include <linux/slab.h>
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/socket.h> 54#include <linux/socket.h>
@@ -67,19 +67,15 @@
67#include <linux/syscalls.h> 67#include <linux/syscalls.h>
68#include <linux/capability.h> 68#include <linux/capability.h>
69#include <linux/fs_struct.h> 69#include <linux/fs_struct.h>
70#include <linux/compat.h>
71 70
72#include "audit.h" 71#include "audit.h"
73 72
74/* flags stating the success for a syscall */
75#define AUDITSC_INVALID 0
76#define AUDITSC_SUCCESS 1
77#define AUDITSC_FAILURE 2
78
79/* AUDIT_NAMES is the number of slots we reserve in the audit_context 73/* AUDIT_NAMES is the number of slots we reserve in the audit_context
80 * for saving names from getname(). If we get more names we will allocate 74 * for saving names from getname(). */
81 * a name dynamically and also add those to the list anchored by names_list. */ 75#define AUDIT_NAMES 20
82#define AUDIT_NAMES 5 76
77/* Indicates that audit should log the full pathname. */
78#define AUDIT_NAME_FULL -1
83 79
84/* no execve audit message should be longer than this (userspace limits) */ 80/* no execve audit message should be longer than this (userspace limits) */
85#define MAX_EXECVE_AUDIT_LEN 7500 81#define MAX_EXECVE_AUDIT_LEN 7500
@@ -103,29 +99,20 @@ struct audit_cap_data {
103 * we don't let putname() free it (instead we free all of the saved 99 * we don't let putname() free it (instead we free all of the saved
104 * pointers at syscall exit time). 100 * pointers at syscall exit time).
105 * 101 *
106 * Further, in fs/namei.c:path_lookup() we store the inode and device. 102 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
107 */
108struct audit_names { 103struct audit_names {
109 struct list_head list; /* audit_context->names_list */ 104 const char *name;
110 struct filename *name; 105 int name_len; /* number of name's characters to log */
111 unsigned long ino; 106 unsigned name_put; /* call __putname() for this name */
112 dev_t dev; 107 unsigned long ino;
113 umode_t mode; 108 dev_t dev;
114 kuid_t uid; 109 umode_t mode;
115 kgid_t gid; 110 uid_t uid;
116 dev_t rdev; 111 gid_t gid;
117 u32 osid; 112 dev_t rdev;
118 struct audit_cap_data fcap; 113 u32 osid;
119 unsigned int fcap_ver; 114 struct audit_cap_data fcap;
120 int name_len; /* number of name's characters to log */ 115 unsigned int fcap_ver;
121 unsigned char type; /* record type */
122 bool name_put; /* call __putname() for this name */
123 /*
124 * This was an allocated audit_names and not from the array of
125 * names allocated in the task audit context. Thus this name
126 * should be freed on syscall exit
127 */
128 bool should_free;
129}; 116};
130 117
131struct audit_aux_data { 118struct audit_aux_data {
@@ -148,8 +135,8 @@ struct audit_aux_data_execve {
148struct audit_aux_data_pids { 135struct audit_aux_data_pids {
149 struct audit_aux_data d; 136 struct audit_aux_data d;
150 pid_t target_pid[AUDIT_AUX_PIDS]; 137 pid_t target_pid[AUDIT_AUX_PIDS];
151 kuid_t target_auid[AUDIT_AUX_PIDS]; 138 uid_t target_auid[AUDIT_AUX_PIDS];
152 kuid_t target_uid[AUDIT_AUX_PIDS]; 139 uid_t target_uid[AUDIT_AUX_PIDS];
153 unsigned int target_sessionid[AUDIT_AUX_PIDS]; 140 unsigned int target_sessionid[AUDIT_AUX_PIDS];
154 u32 target_sid[AUDIT_AUX_PIDS]; 141 u32 target_sid[AUDIT_AUX_PIDS];
155 char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; 142 char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
@@ -187,33 +174,25 @@ struct audit_context {
187 long return_code;/* syscall return code */ 174 long return_code;/* syscall return code */
188 u64 prio; 175 u64 prio;
189 int return_valid; /* return code is valid */ 176 int return_valid; /* return code is valid */
190 /* 177 int name_count;
191 * The names_list is the list of all audit_names collected during this 178 struct audit_names names[AUDIT_NAMES];
192 * syscall. The first AUDIT_NAMES entries in the names_list will
193 * actually be from the preallocated_names array for performance
194 * reasons. Except during allocation they should never be referenced
195 * through the preallocated_names array and should only be found/used
196 * by running the names_list.
197 */
198 struct audit_names preallocated_names[AUDIT_NAMES];
199 int name_count; /* total records in names_list */
200 struct list_head names_list; /* anchor for struct audit_names->list */
201 char * filterkey; /* key for rule that triggered record */ 179 char * filterkey; /* key for rule that triggered record */
202 struct path pwd; 180 struct path pwd;
181 struct audit_context *previous; /* For nested syscalls */
203 struct audit_aux_data *aux; 182 struct audit_aux_data *aux;
204 struct audit_aux_data *aux_pids; 183 struct audit_aux_data *aux_pids;
205 struct sockaddr_storage *sockaddr; 184 struct sockaddr_storage *sockaddr;
206 size_t sockaddr_len; 185 size_t sockaddr_len;
207 /* Save things to print about task_struct */ 186 /* Save things to print about task_struct */
208 pid_t pid, ppid; 187 pid_t pid, ppid;
209 kuid_t uid, euid, suid, fsuid; 188 uid_t uid, euid, suid, fsuid;
210 kgid_t gid, egid, sgid, fsgid; 189 gid_t gid, egid, sgid, fsgid;
211 unsigned long personality; 190 unsigned long personality;
212 int arch; 191 int arch;
213 192
214 pid_t target_pid; 193 pid_t target_pid;
215 kuid_t target_auid; 194 uid_t target_auid;
216 kuid_t target_uid; 195 uid_t target_uid;
217 unsigned int target_sessionid; 196 unsigned int target_sessionid;
218 u32 target_sid; 197 u32 target_sid;
219 char target_comm[TASK_COMM_LEN]; 198 char target_comm[TASK_COMM_LEN];
@@ -229,14 +208,14 @@ struct audit_context {
229 long args[6]; 208 long args[6];
230 } socketcall; 209 } socketcall;
231 struct { 210 struct {
232 kuid_t uid; 211 uid_t uid;
233 kgid_t gid; 212 gid_t gid;
234 umode_t mode; 213 mode_t mode;
235 u32 osid; 214 u32 osid;
236 int has_perm; 215 int has_perm;
237 uid_t perm_uid; 216 uid_t perm_uid;
238 gid_t perm_gid; 217 gid_t perm_gid;
239 umode_t perm_mode; 218 mode_t perm_mode;
240 unsigned long qbytes; 219 unsigned long qbytes;
241 } ipc; 220 } ipc;
242 struct { 221 struct {
@@ -255,7 +234,7 @@ struct audit_context {
255 } mq_sendrecv; 234 } mq_sendrecv;
256 struct { 235 struct {
257 int oflag; 236 int oflag;
258 umode_t mode; 237 mode_t mode;
259 struct mq_attr attr; 238 struct mq_attr attr;
260 } mq_open; 239 } mq_open;
261 struct { 240 struct {
@@ -326,21 +305,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
326 } 305 }
327} 306}
328 307
329static int audit_match_filetype(struct audit_context *ctx, int val) 308static int audit_match_filetype(struct audit_context *ctx, int which)
330{ 309{
331 struct audit_names *n; 310 unsigned index = which & ~S_IFMT;
332 umode_t mode = (umode_t)val; 311 mode_t mode = which & S_IFMT;
333 312
334 if (unlikely(!ctx)) 313 if (unlikely(!ctx))
335 return 0; 314 return 0;
336 315
337 list_for_each_entry(n, &ctx->names_list, list) { 316 if (index >= ctx->name_count)
338 if ((n->ino != -1) && 317 return 0;
339 ((n->mode & S_IFMT) == mode)) 318 if (ctx->names[index].ino == -1)
340 return 1; 319 return 0;
341 } 320 if ((ctx->names[index].mode ^ mode) & S_IFMT)
342 321 return 0;
343 return 0; 322 return 1;
344} 323}
345 324
346/* 325/*
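Both audit_match_filetype() variants above boil down to one test: mask the recorded inode mode to its S_IFMT bits and compare with the value carried by the rule. The same comparison in plain userspace terms (stat() is used here only as a convenient source of a mode; the path is an arbitrary example):

#include <stdio.h>
#include <sys/stat.h>

static int filetype_matches(mode_t recorded_mode, mode_t wanted)
{
	/* Only the file-type bits take part in the comparison. */
	return (recorded_mode & S_IFMT) == (wanted & S_IFMT);
}

int main(void)
{
	struct stat st;

	if (stat("/etc/passwd", &st) == 0)
		printf("regular file? %d\n", filetype_matches(st.st_mode, S_IFREG));
	return 0;
}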
@@ -462,126 +441,6 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
462 return 0; 441 return 0;
463} 442}
464 443
465static int audit_compare_uid(kuid_t uid,
466 struct audit_names *name,
467 struct audit_field *f,
468 struct audit_context *ctx)
469{
470 struct audit_names *n;
471 int rc;
472
473 if (name) {
474 rc = audit_uid_comparator(uid, f->op, name->uid);
475 if (rc)
476 return rc;
477 }
478
479 if (ctx) {
480 list_for_each_entry(n, &ctx->names_list, list) {
481 rc = audit_uid_comparator(uid, f->op, n->uid);
482 if (rc)
483 return rc;
484 }
485 }
486 return 0;
487}
488
489static int audit_compare_gid(kgid_t gid,
490 struct audit_names *name,
491 struct audit_field *f,
492 struct audit_context *ctx)
493{
494 struct audit_names *n;
495 int rc;
496
497 if (name) {
498 rc = audit_gid_comparator(gid, f->op, name->gid);
499 if (rc)
500 return rc;
501 }
502
503 if (ctx) {
504 list_for_each_entry(n, &ctx->names_list, list) {
505 rc = audit_gid_comparator(gid, f->op, n->gid);
506 if (rc)
507 return rc;
508 }
509 }
510 return 0;
511}
512
513static int audit_field_compare(struct task_struct *tsk,
514 const struct cred *cred,
515 struct audit_field *f,
516 struct audit_context *ctx,
517 struct audit_names *name)
518{
519 switch (f->val) {
520 /* process to file object comparisons */
521 case AUDIT_COMPARE_UID_TO_OBJ_UID:
522 return audit_compare_uid(cred->uid, name, f, ctx);
523 case AUDIT_COMPARE_GID_TO_OBJ_GID:
524 return audit_compare_gid(cred->gid, name, f, ctx);
525 case AUDIT_COMPARE_EUID_TO_OBJ_UID:
526 return audit_compare_uid(cred->euid, name, f, ctx);
527 case AUDIT_COMPARE_EGID_TO_OBJ_GID:
528 return audit_compare_gid(cred->egid, name, f, ctx);
529 case AUDIT_COMPARE_AUID_TO_OBJ_UID:
530 return audit_compare_uid(tsk->loginuid, name, f, ctx);
531 case AUDIT_COMPARE_SUID_TO_OBJ_UID:
532 return audit_compare_uid(cred->suid, name, f, ctx);
533 case AUDIT_COMPARE_SGID_TO_OBJ_GID:
534 return audit_compare_gid(cred->sgid, name, f, ctx);
535 case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
536 return audit_compare_uid(cred->fsuid, name, f, ctx);
537 case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
538 return audit_compare_gid(cred->fsgid, name, f, ctx);
539 /* uid comparisons */
540 case AUDIT_COMPARE_UID_TO_AUID:
541 return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
542 case AUDIT_COMPARE_UID_TO_EUID:
543 return audit_uid_comparator(cred->uid, f->op, cred->euid);
544 case AUDIT_COMPARE_UID_TO_SUID:
545 return audit_uid_comparator(cred->uid, f->op, cred->suid);
546 case AUDIT_COMPARE_UID_TO_FSUID:
547 return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
548 /* auid comparisons */
549 case AUDIT_COMPARE_AUID_TO_EUID:
550 return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
551 case AUDIT_COMPARE_AUID_TO_SUID:
552 return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
553 case AUDIT_COMPARE_AUID_TO_FSUID:
554 return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
555 /* euid comparisons */
556 case AUDIT_COMPARE_EUID_TO_SUID:
557 return audit_uid_comparator(cred->euid, f->op, cred->suid);
558 case AUDIT_COMPARE_EUID_TO_FSUID:
559 return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
560 /* suid comparisons */
561 case AUDIT_COMPARE_SUID_TO_FSUID:
562 return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
563 /* gid comparisons */
564 case AUDIT_COMPARE_GID_TO_EGID:
565 return audit_gid_comparator(cred->gid, f->op, cred->egid);
566 case AUDIT_COMPARE_GID_TO_SGID:
567 return audit_gid_comparator(cred->gid, f->op, cred->sgid);
568 case AUDIT_COMPARE_GID_TO_FSGID:
569 return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
570 /* egid comparisons */
571 case AUDIT_COMPARE_EGID_TO_SGID:
572 return audit_gid_comparator(cred->egid, f->op, cred->sgid);
573 case AUDIT_COMPARE_EGID_TO_FSGID:
574 return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
575 /* sgid comparison */
576 case AUDIT_COMPARE_SGID_TO_FSGID:
577 return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
578 default:
579 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
580 return 0;
581 }
582 return 0;
583}
584
585/* Determine if any context name data matches a rule's watch data */ 444/* Determine if any context name data matches a rule's watch data */
586/* Compare a task_struct with an audit_rule. Return 1 on match, 0 445/* Compare a task_struct with an audit_rule. Return 1 on match, 0
587 * otherwise. 446 * otherwise.
@@ -598,14 +457,13 @@ static int audit_filter_rules(struct task_struct *tsk,
598 bool task_creation) 457 bool task_creation)
599{ 458{
600 const struct cred *cred; 459 const struct cred *cred;
601 int i, need_sid = 1; 460 int i, j, need_sid = 1;
602 u32 sid; 461 u32 sid;
603 462
604 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); 463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
605 464
606 for (i = 0; i < rule->field_count; i++) { 465 for (i = 0; i < rule->field_count; i++) {
607 struct audit_field *f = &rule->fields[i]; 466 struct audit_field *f = &rule->fields[i];
608 struct audit_names *n;
609 int result = 0; 467 int result = 0;
610 468
611 switch (f->type) { 469 switch (f->type) {
@@ -620,28 +478,28 @@ static int audit_filter_rules(struct task_struct *tsk,
620 } 478 }
621 break; 479 break;
622 case AUDIT_UID: 480 case AUDIT_UID:
623 result = audit_uid_comparator(cred->uid, f->op, f->uid); 481 result = audit_comparator(cred->uid, f->op, f->val);
624 break; 482 break;
625 case AUDIT_EUID: 483 case AUDIT_EUID:
626 result = audit_uid_comparator(cred->euid, f->op, f->uid); 484 result = audit_comparator(cred->euid, f->op, f->val);
627 break; 485 break;
628 case AUDIT_SUID: 486 case AUDIT_SUID:
629 result = audit_uid_comparator(cred->suid, f->op, f->uid); 487 result = audit_comparator(cred->suid, f->op, f->val);
630 break; 488 break;
631 case AUDIT_FSUID: 489 case AUDIT_FSUID:
632 result = audit_uid_comparator(cred->fsuid, f->op, f->uid); 490 result = audit_comparator(cred->fsuid, f->op, f->val);
633 break; 491 break;
634 case AUDIT_GID: 492 case AUDIT_GID:
635 result = audit_gid_comparator(cred->gid, f->op, f->gid); 493 result = audit_comparator(cred->gid, f->op, f->val);
636 break; 494 break;
637 case AUDIT_EGID: 495 case AUDIT_EGID:
638 result = audit_gid_comparator(cred->egid, f->op, f->gid); 496 result = audit_comparator(cred->egid, f->op, f->val);
639 break; 497 break;
640 case AUDIT_SGID: 498 case AUDIT_SGID:
641 result = audit_gid_comparator(cred->sgid, f->op, f->gid); 499 result = audit_comparator(cred->sgid, f->op, f->val);
642 break; 500 break;
643 case AUDIT_FSGID: 501 case AUDIT_FSGID:
644 result = audit_gid_comparator(cred->fsgid, f->op, f->gid); 502 result = audit_comparator(cred->fsgid, f->op, f->val);
645 break; 503 break;
646 case AUDIT_PERS: 504 case AUDIT_PERS:
647 result = audit_comparator(tsk->personality, f->op, f->val); 505 result = audit_comparator(tsk->personality, f->op, f->val);
@@ -664,14 +522,12 @@ static int audit_filter_rules(struct task_struct *tsk,
664 } 522 }
665 break; 523 break;
666 case AUDIT_DEVMAJOR: 524 case AUDIT_DEVMAJOR:
667 if (name) { 525 if (name)
668 if (audit_comparator(MAJOR(name->dev), f->op, f->val) || 526 result = audit_comparator(MAJOR(name->dev),
669 audit_comparator(MAJOR(name->rdev), f->op, f->val)) 527 f->op, f->val);
670 ++result; 528 else if (ctx) {
671 } else if (ctx) { 529 for (j = 0; j < ctx->name_count; j++) {
672 list_for_each_entry(n, &ctx->names_list, list) { 530 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
673 if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
674 audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
675 ++result; 531 ++result;
676 break; 532 break;
677 } 533 }
@@ -679,14 +535,12 @@ static int audit_filter_rules(struct task_struct *tsk,
679 } 535 }
680 break; 536 break;
681 case AUDIT_DEVMINOR: 537 case AUDIT_DEVMINOR:
682 if (name) { 538 if (name)
683 if (audit_comparator(MINOR(name->dev), f->op, f->val) || 539 result = audit_comparator(MINOR(name->dev),
684 audit_comparator(MINOR(name->rdev), f->op, f->val)) 540 f->op, f->val);
685 ++result; 541 else if (ctx) {
686 } else if (ctx) { 542 for (j = 0; j < ctx->name_count; j++) {
687 list_for_each_entry(n, &ctx->names_list, list) { 543 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
688 if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
689 audit_comparator(MINOR(n->rdev), f->op, f->val)) {
690 ++result; 544 ++result;
691 break; 545 break;
692 } 546 }
@@ -697,32 +551,8 @@ static int audit_filter_rules(struct task_struct *tsk,
697 if (name) 551 if (name)
698 result = (name->ino == f->val); 552 result = (name->ino == f->val);
699 else if (ctx) { 553 else if (ctx) {
700 list_for_each_entry(n, &ctx->names_list, list) { 554 for (j = 0; j < ctx->name_count; j++) {
701 if (audit_comparator(n->ino, f->op, f->val)) { 555 if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
702 ++result;
703 break;
704 }
705 }
706 }
707 break;
708 case AUDIT_OBJ_UID:
709 if (name) {
710 result = audit_uid_comparator(name->uid, f->op, f->uid);
711 } else if (ctx) {
712 list_for_each_entry(n, &ctx->names_list, list) {
713 if (audit_uid_comparator(n->uid, f->op, f->uid)) {
714 ++result;
715 break;
716 }
717 }
718 }
719 break;
720 case AUDIT_OBJ_GID:
721 if (name) {
722 result = audit_gid_comparator(name->gid, f->op, f->gid);
723 } else if (ctx) {
724 list_for_each_entry(n, &ctx->names_list, list) {
725 if (audit_gid_comparator(n->gid, f->op, f->gid)) {
726 ++result; 556 ++result;
727 break; 557 break;
728 } 558 }
@@ -740,7 +570,7 @@ static int audit_filter_rules(struct task_struct *tsk,
740 case AUDIT_LOGINUID: 570 case AUDIT_LOGINUID:
741 result = 0; 571 result = 0;
742 if (ctx) 572 if (ctx)
743 result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); 573 result = audit_comparator(tsk->loginuid, f->op, f->val);
744 break; 574 break;
745 case AUDIT_SUBJ_USER: 575 case AUDIT_SUBJ_USER:
746 case AUDIT_SUBJ_ROLE: 576 case AUDIT_SUBJ_ROLE:
@@ -777,10 +607,11 @@ static int audit_filter_rules(struct task_struct *tsk,
777 name->osid, f->type, f->op, 607 name->osid, f->type, f->op,
778 f->lsm_rule, ctx); 608 f->lsm_rule, ctx);
779 } else if (ctx) { 609 } else if (ctx) {
780 list_for_each_entry(n, &ctx->names_list, list) { 610 for (j = 0; j < ctx->name_count; j++) {
781 if (security_audit_rule_match(n->osid, f->type, 611 if (security_audit_rule_match(
782 f->op, f->lsm_rule, 612 ctx->names[j].osid,
783 ctx)) { 613 f->type, f->op,
614 f->lsm_rule, ctx)) {
784 ++result; 615 ++result;
785 break; 616 break;
786 } 617 }
@@ -812,10 +643,8 @@ static int audit_filter_rules(struct task_struct *tsk,
812 case AUDIT_FILETYPE: 643 case AUDIT_FILETYPE:
813 result = audit_match_filetype(ctx, f->val); 644 result = audit_match_filetype(ctx, f->val);
814 break; 645 break;
815 case AUDIT_FIELD_COMPARE:
816 result = audit_field_compare(tsk, cred, f, ctx, name);
817 break;
818 } 646 }
647
819 if (!result) 648 if (!result)
820 return 0; 649 return 0;
821 } 650 }
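
The hunk above keeps the AND semantics of audit_filter_rules(): every field of a rule is run through its comparator, and a single non-match (the "if (!result) return 0;" at the bottom of the loop) rejects the whole rule. A minimal userspace sketch of that evaluation loop follows; the field layout and the comparator are simplified stand-ins, not the kernel's types.

/*
 * Toy model of the per-field AND evaluation in audit_filter_rules().
 * Every field must match; the first comparator that returns 0 rejects
 * the rule.  Build with: cc -std=c99 rule_and.c
 */
#include <stdio.h>

enum audit_op { OP_EQ, OP_GT, OP_LT };

struct field {
	long value;        /* value taken from the task/context */
	enum audit_op op;  /* comparison requested by the rule  */
	long rule_val;     /* value stored in the rule          */
};

static int audit_comparator(long left, enum audit_op op, long right)
{
	switch (op) {
	case OP_EQ: return left == right;
	case OP_GT: return left >  right;
	case OP_LT: return left <  right;
	}
	return 0;
}

/* Returns 1 only if every field of the rule matches. */
static int rule_matches(const struct field *f, int nfields)
{
	int i;

	for (i = 0; i < nfields; i++) {
		int result = audit_comparator(f[i].value, f[i].op,
					      f[i].rule_val);
		if (!result)
			return 0;	/* one miss rejects the whole rule */
	}
	return 1;
}

int main(void)
{
	struct field rule[] = {
		{ .value = 1000, .op = OP_EQ, .rule_val = 1000 }, /* uid   */
		{ .value = 42,   .op = OP_EQ, .rule_val = 42   }, /* major */
	};

	printf("rule matches: %d\n", rule_matches(rule, 2));
	return 0;
}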
@@ -893,53 +722,40 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
893 return AUDIT_BUILD_CONTEXT; 722 return AUDIT_BUILD_CONTEXT;
894} 723}
895 724
896/* 725/* At syscall exit time, this filter is called if any audit_names[] have been
897 * Given an audit_name, check the inode hash table to see if it matches.
898 * Called holding the rcu read lock to protect the use of audit_inode_hash
899 */
900static int audit_filter_inode_name(struct task_struct *tsk,
901 struct audit_names *n,
902 struct audit_context *ctx) {
903 int word, bit;
904 int h = audit_hash_ino((u32)n->ino);
905 struct list_head *list = &audit_inode_hash[h];
906 struct audit_entry *e;
907 enum audit_state state;
908
909 word = AUDIT_WORD(ctx->major);
910 bit = AUDIT_BIT(ctx->major);
911
912 if (list_empty(list))
913 return 0;
914
915 list_for_each_entry_rcu(e, list, list) {
916 if ((e->rule.mask[word] & bit) == bit &&
917 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
918 ctx->current_state = state;
919 return 1;
920 }
921 }
922
923 return 0;
924}
925
926/* At syscall exit time, this filter is called if any audit_names have been
927 * collected during syscall processing. We only check rules in sublists at hash 726 * collected during syscall processing. We only check rules in sublists at hash
928 * buckets applicable to the inode numbers in audit_names. 727 * buckets applicable to the inode numbers in audit_names[].
929 * Regarding audit_state, same rules apply as for audit_filter_syscall(). 728 * Regarding audit_state, same rules apply as for audit_filter_syscall().
930 */ 729 */
931void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) 730void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
932{ 731{
933 struct audit_names *n; 732 int i;
733 struct audit_entry *e;
734 enum audit_state state;
934 735
935 if (audit_pid && tsk->tgid == audit_pid) 736 if (audit_pid && tsk->tgid == audit_pid)
936 return; 737 return;
937 738
938 rcu_read_lock(); 739 rcu_read_lock();
740 for (i = 0; i < ctx->name_count; i++) {
741 int word = AUDIT_WORD(ctx->major);
742 int bit = AUDIT_BIT(ctx->major);
743 struct audit_names *n = &ctx->names[i];
744 int h = audit_hash_ino((u32)n->ino);
745 struct list_head *list = &audit_inode_hash[h];
939 746
940 list_for_each_entry(n, &ctx->names_list, list) { 747 if (list_empty(list))
941 if (audit_filter_inode_name(tsk, n, ctx)) 748 continue;
942 break; 749
750 list_for_each_entry_rcu(e, list, list) {
751 if ((e->rule.mask[word] & bit) == bit &&
752 audit_filter_rules(tsk, &e->rule, ctx, n,
753 &state, false)) {
754 rcu_read_unlock();
755 ctx->current_state = state;
756 return;
757 }
758 }
943 } 759 }
944 rcu_read_unlock(); 760 rcu_read_unlock();
945} 761}
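
audit_filter_inodes() above only walks the rule sublist whose hash bucket corresponds to each collected inode number, skipping empty buckets entirely. A small standalone sketch of that bucketing idea; audit_hash_ino() and the bucket count below are simplified assumptions, not the kernel's definitions.

/*
 * Toy model of inode-number bucketing: each inode selects one of a
 * power-of-two number of buckets, and only that bucket's rules would
 * be scanned.
 */
#include <stdio.h>

#define AUDIT_INODE_BUCKETS 32	/* assumed bucket count */

static unsigned int audit_hash_ino(unsigned long ino)
{
	return (unsigned int)(ino & (AUDIT_INODE_BUCKETS - 1));
}

int main(void)
{
	unsigned long inodes[] = { 12345, 67890, 4096 };
	unsigned long i;

	for (i = 0; i < sizeof(inodes) / sizeof(inodes[0]); i++)
		printf("inode %lu -> bucket %u\n",
		       inodes[i], audit_hash_ino(inodes[i]));
	return 0;
}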
@@ -950,7 +766,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
950{ 766{
951 struct audit_context *context = tsk->audit_context; 767 struct audit_context *context = tsk->audit_context;
952 768
953 if (!context) 769 if (likely(!context))
954 return NULL; 770 return NULL;
955 context->return_valid = return_valid; 771 context->return_valid = return_valid;
956 772
@@ -983,7 +799,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
983 799
984static inline void audit_free_names(struct audit_context *context) 800static inline void audit_free_names(struct audit_context *context)
985{ 801{
986 struct audit_names *n, *next; 802 int i;
987 803
988#if AUDIT_DEBUG == 2 804#if AUDIT_DEBUG == 2
989 if (context->put_count + context->ino_count != context->name_count) { 805 if (context->put_count + context->ino_count != context->name_count) {
@@ -994,9 +810,10 @@ static inline void audit_free_names(struct audit_context *context)
994 context->serial, context->major, context->in_syscall, 810 context->serial, context->major, context->in_syscall,
995 context->name_count, context->put_count, 811 context->name_count, context->put_count,
996 context->ino_count); 812 context->ino_count);
997 list_for_each_entry(n, &context->names_list, list) { 813 for (i = 0; i < context->name_count; i++) {
998 printk(KERN_ERR "names[%d] = %p = %s\n", i, 814 printk(KERN_ERR "names[%d] = %p = %s\n", i,
999 n->name, n->name->name ?: "(null)"); 815 context->names[i].name,
816 context->names[i].name ?: "(null)");
1000 } 817 }
1001 dump_stack(); 818 dump_stack();
1002 return; 819 return;
@@ -1007,12 +824,9 @@ static inline void audit_free_names(struct audit_context *context)
1007 context->ino_count = 0; 824 context->ino_count = 0;
1008#endif 825#endif
1009 826
1010 list_for_each_entry_safe(n, next, &context->names_list, list) { 827 for (i = 0; i < context->name_count; i++) {
1011 list_del(&n->list); 828 if (context->names[i].name && context->names[i].name_put)
1012 if (n->name && n->name_put) 829 __putname(context->names[i].name);
1013 __putname(n->name);
1014 if (n->should_free)
1015 kfree(n);
1016 } 830 }
1017 context->name_count = 0; 831 context->name_count = 0;
1018 path_put(&context->pwd); 832 path_put(&context->pwd);
@@ -1050,7 +864,6 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
1050 return NULL; 864 return NULL;
1051 audit_zero_context(context, state); 865 audit_zero_context(context, state);
1052 INIT_LIST_HEAD(&context->killed_trees); 866 INIT_LIST_HEAD(&context->killed_trees);
1053 INIT_LIST_HEAD(&context->names_list);
1054 return context; 867 return context;
1055} 868}
1056 869
@@ -1073,7 +886,7 @@ int audit_alloc(struct task_struct *tsk)
1073 return 0; /* Return if not auditing. */ 886 return 0; /* Return if not auditing. */
1074 887
1075 state = audit_filter_task(tsk, &key); 888 state = audit_filter_task(tsk, &key);
1076 if (state == AUDIT_DISABLED) 889 if (likely(state == AUDIT_DISABLED))
1077 return 0; 890 return 0;
1078 891
1079 if (!(context = audit_alloc_context(state))) { 892 if (!(context = audit_alloc_context(state))) {
@@ -1090,13 +903,29 @@ int audit_alloc(struct task_struct *tsk)
1090 903
1091static inline void audit_free_context(struct audit_context *context) 904static inline void audit_free_context(struct audit_context *context)
1092{ 905{
1093 audit_free_names(context); 906 struct audit_context *previous;
1094 unroll_tree_refs(context, NULL, 0); 907 int count = 0;
1095 free_tree_refs(context); 908
1096 audit_free_aux(context); 909 do {
1097 kfree(context->filterkey); 910 previous = context->previous;
1098 kfree(context->sockaddr); 911 if (previous || (count && count < 10)) {
1099 kfree(context); 912 ++count;
913 printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
914 " freeing multiple contexts (%d)\n",
915 context->serial, context->major,
916 context->name_count, count);
917 }
918 audit_free_names(context);
919 unroll_tree_refs(context, NULL, 0);
920 free_tree_refs(context);
921 audit_free_aux(context);
922 kfree(context->filterkey);
923 kfree(context->sockaddr);
924 kfree(context);
925 context = previous;
926 } while (context);
927 if (count >= 10)
928 printk(KERN_ERR "audit: freed %d contexts\n", count);
1100} 929}
1101 930
1102void audit_log_task_context(struct audit_buffer *ab) 931void audit_log_task_context(struct audit_buffer *ab)
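
In the older code on the right-hand side, audit_free_context() walks a chain of contexts linked through ->previous and warns when more than one is freed. A toy userspace model of that chain walk; struct ctx and the message format are invented for illustration.

/*
 * Free a singly linked chain of contexts, complaining when nesting is
 * detected (more than one context on the chain).
 */
#include <stdio.h>
#include <stdlib.h>

struct ctx {
	int serial;
	struct ctx *previous;
};

static void free_context_chain(struct ctx *c)
{
	int count = 0;

	while (c) {
		struct ctx *prev = c->previous;

		if (prev || count)	/* more than one context is unusual */
			fprintf(stderr, "freeing nested context %d (#%d)\n",
				c->serial, ++count);
		free(c);
		c = prev;
	}
}

int main(void)
{
	/* Build a small chain: inner -> outer. */
	struct ctx *outer = calloc(1, sizeof(*outer));
	struct ctx *inner = calloc(1, sizeof(*inner));

	if (!outer || !inner)
		return 1;
	outer->serial = 1;
	inner->serial = 2;
	inner->previous = outer;

	free_context_chain(inner);
	return 0;
}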
@@ -1128,43 +957,13 @@ error_path:
1128 957
1129EXPORT_SYMBOL(audit_log_task_context); 958EXPORT_SYMBOL(audit_log_task_context);
1130 959
1131void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 960static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1132{ 961{
1133 const struct cred *cred;
1134 char name[sizeof(tsk->comm)]; 962 char name[sizeof(tsk->comm)];
1135 struct mm_struct *mm = tsk->mm; 963 struct mm_struct *mm = tsk->mm;
1136 char *tty; 964 struct vm_area_struct *vma;
1137
1138 if (!ab)
1139 return;
1140 965
1141 /* tsk == current */ 966 /* tsk == current */
1142 cred = current_cred();
1143
1144 spin_lock_irq(&tsk->sighand->siglock);
1145 if (tsk->signal && tsk->signal->tty)
1146 tty = tsk->signal->tty->name;
1147 else
1148 tty = "(none)";
1149 spin_unlock_irq(&tsk->sighand->siglock);
1150
1151
1152 audit_log_format(ab,
1153 " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
1154 " euid=%u suid=%u fsuid=%u"
1155 " egid=%u sgid=%u fsgid=%u ses=%u tty=%s",
1156 sys_getppid(),
1157 tsk->pid,
1158 from_kuid(&init_user_ns, tsk->loginuid),
1159 from_kuid(&init_user_ns, cred->uid),
1160 from_kgid(&init_user_ns, cred->gid),
1161 from_kuid(&init_user_ns, cred->euid),
1162 from_kuid(&init_user_ns, cred->suid),
1163 from_kuid(&init_user_ns, cred->fsuid),
1164 from_kgid(&init_user_ns, cred->egid),
1165 from_kgid(&init_user_ns, cred->sgid),
1166 from_kgid(&init_user_ns, cred->fsgid),
1167 tsk->sessionid, tty);
1168 967
1169 get_task_comm(name, tsk); 968 get_task_comm(name, tsk);
1170 audit_log_format(ab, " comm="); 969 audit_log_format(ab, " comm=");
@@ -1172,17 +971,23 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1172 971
1173 if (mm) { 972 if (mm) {
1174 down_read(&mm->mmap_sem); 973 down_read(&mm->mmap_sem);
1175 if (mm->exe_file) 974 vma = mm->mmap;
1176 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); 975 while (vma) {
976 if ((vma->vm_flags & VM_EXECUTABLE) &&
977 vma->vm_file) {
978 audit_log_d_path(ab, "exe=",
979 &vma->vm_file->f_path);
980 break;
981 }
982 vma = vma->vm_next;
983 }
1177 up_read(&mm->mmap_sem); 984 up_read(&mm->mmap_sem);
1178 } 985 }
1179 audit_log_task_context(ab); 986 audit_log_task_context(ab);
1180} 987}
1181 988
1182EXPORT_SYMBOL(audit_log_task_info);
1183
1184static int audit_log_pid_context(struct audit_context *context, pid_t pid, 989static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1185 kuid_t auid, kuid_t uid, unsigned int sessionid, 990 uid_t auid, uid_t uid, unsigned int sessionid,
1186 u32 sid, char *comm) 991 u32 sid, char *comm)
1187{ 992{
1188 struct audit_buffer *ab; 993 struct audit_buffer *ab;
@@ -1194,9 +999,8 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1194 if (!ab) 999 if (!ab)
1195 return rc; 1000 return rc;
1196 1001
1197 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, 1002 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid,
1198 from_kuid(&init_user_ns, auid), 1003 uid, sessionid);
1199 from_kuid(&init_user_ns, uid), sessionid);
1200 if (security_secid_to_secctx(sid, &ctx, &len)) { 1004 if (security_secid_to_secctx(sid, &ctx, &len)) {
1201 audit_log_format(ab, " obj=(none)"); 1005 audit_log_format(ab, " obj=(none)");
1202 rc = 1; 1006 rc = 1;
@@ -1362,8 +1166,8 @@ static void audit_log_execve_info(struct audit_context *context,
1362 struct audit_buffer **ab, 1166 struct audit_buffer **ab,
1363 struct audit_aux_data_execve *axi) 1167 struct audit_aux_data_execve *axi)
1364{ 1168{
1365 int i, len; 1169 int i;
1366 size_t len_sent = 0; 1170 size_t len, len_sent = 0;
1367 const char __user *p; 1171 const char __user *p;
1368 char *buf; 1172 char *buf;
1369 1173
@@ -1445,10 +1249,8 @@ static void show_special(struct audit_context *context, int *call_panic)
1445 case AUDIT_IPC: { 1249 case AUDIT_IPC: {
1446 u32 osid = context->ipc.osid; 1250 u32 osid = context->ipc.osid;
1447 1251
1448 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", 1252 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
1449 from_kuid(&init_user_ns, context->ipc.uid), 1253 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1450 from_kgid(&init_user_ns, context->ipc.gid),
1451 context->ipc.mode);
1452 if (osid) { 1254 if (osid) {
1453 char *ctx = NULL; 1255 char *ctx = NULL;
1454 u32 len; 1256 u32 len;
@@ -1464,19 +1266,19 @@ static void show_special(struct audit_context *context, int *call_panic)
1464 audit_log_end(ab); 1266 audit_log_end(ab);
1465 ab = audit_log_start(context, GFP_KERNEL, 1267 ab = audit_log_start(context, GFP_KERNEL,
1466 AUDIT_IPC_SET_PERM); 1268 AUDIT_IPC_SET_PERM);
1467 if (unlikely(!ab))
1468 return;
1469 audit_log_format(ab, 1269 audit_log_format(ab,
1470 "qbytes=%lx ouid=%u ogid=%u mode=%#ho", 1270 "qbytes=%lx ouid=%u ogid=%u mode=%#o",
1471 context->ipc.qbytes, 1271 context->ipc.qbytes,
1472 context->ipc.perm_uid, 1272 context->ipc.perm_uid,
1473 context->ipc.perm_gid, 1273 context->ipc.perm_gid,
1474 context->ipc.perm_mode); 1274 context->ipc.perm_mode);
1275 if (!ab)
1276 return;
1475 } 1277 }
1476 break; } 1278 break; }
1477 case AUDIT_MQ_OPEN: { 1279 case AUDIT_MQ_OPEN: {
1478 audit_log_format(ab, 1280 audit_log_format(ab,
1479 "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " 1281 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
1480 "mq_msgsize=%ld mq_curmsgs=%ld", 1282 "mq_msgsize=%ld mq_curmsgs=%ld",
1481 context->mq_open.oflag, context->mq_open.mode, 1283 context->mq_open.oflag, context->mq_open.mode,
1482 context->mq_open.attr.mq_flags, 1284 context->mq_open.attr.mq_flags,
@@ -1522,76 +1324,27 @@ static void show_special(struct audit_context *context, int *call_panic)
1522 audit_log_end(ab); 1324 audit_log_end(ab);
1523} 1325}
1524 1326
1525static void audit_log_name(struct audit_context *context, struct audit_names *n,
1526 int record_num, int *call_panic)
1527{
1528 struct audit_buffer *ab;
1529 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
1530 if (!ab)
1531 return; /* audit_panic has been called */
1532
1533 audit_log_format(ab, "item=%d", record_num);
1534
1535 if (n->name) {
1536 switch (n->name_len) {
1537 case AUDIT_NAME_FULL:
1538 /* log the full path */
1539 audit_log_format(ab, " name=");
1540 audit_log_untrustedstring(ab, n->name->name);
1541 break;
1542 case 0:
1543 /* name was specified as a relative path and the
1544 * directory component is the cwd */
1545 audit_log_d_path(ab, " name=", &context->pwd);
1546 break;
1547 default:
1548 /* log the name's directory component */
1549 audit_log_format(ab, " name=");
1550 audit_log_n_untrustedstring(ab, n->name->name,
1551 n->name_len);
1552 }
1553 } else
1554 audit_log_format(ab, " name=(null)");
1555
1556 if (n->ino != (unsigned long)-1) {
1557 audit_log_format(ab, " inode=%lu"
1558 " dev=%02x:%02x mode=%#ho"
1559 " ouid=%u ogid=%u rdev=%02x:%02x",
1560 n->ino,
1561 MAJOR(n->dev),
1562 MINOR(n->dev),
1563 n->mode,
1564 from_kuid(&init_user_ns, n->uid),
1565 from_kgid(&init_user_ns, n->gid),
1566 MAJOR(n->rdev),
1567 MINOR(n->rdev));
1568 }
1569 if (n->osid != 0) {
1570 char *ctx = NULL;
1571 u32 len;
1572 if (security_secid_to_secctx(
1573 n->osid, &ctx, &len)) {
1574 audit_log_format(ab, " osid=%u", n->osid);
1575 *call_panic = 2;
1576 } else {
1577 audit_log_format(ab, " obj=%s", ctx);
1578 security_release_secctx(ctx, len);
1579 }
1580 }
1581
1582 audit_log_fcaps(ab, n);
1583
1584 audit_log_end(ab);
1585}
1586
1587static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1327static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1588{ 1328{
1329 const struct cred *cred;
1589 int i, call_panic = 0; 1330 int i, call_panic = 0;
1590 struct audit_buffer *ab; 1331 struct audit_buffer *ab;
1591 struct audit_aux_data *aux; 1332 struct audit_aux_data *aux;
1592 struct audit_names *n; 1333 const char *tty;
1593 1334
1594 /* tsk == current */ 1335 /* tsk == current */
1336 context->pid = tsk->pid;
1337 if (!context->ppid)
1338 context->ppid = sys_getppid();
1339 cred = current_cred();
1340 context->uid = cred->uid;
1341 context->gid = cred->gid;
1342 context->euid = cred->euid;
1343 context->suid = cred->suid;
1344 context->fsuid = cred->fsuid;
1345 context->egid = cred->egid;
1346 context->sgid = cred->sgid;
1347 context->fsgid = cred->fsgid;
1595 context->personality = tsk->personality; 1348 context->personality = tsk->personality;
1596 1349
1597 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); 1350 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1606,13 +1359,32 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1606 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 1359 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
1607 context->return_code); 1360 context->return_code);
1608 1361
1362 spin_lock_irq(&tsk->sighand->siglock);
1363 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
1364 tty = tsk->signal->tty->name;
1365 else
1366 tty = "(none)";
1367 spin_unlock_irq(&tsk->sighand->siglock);
1368
1609 audit_log_format(ab, 1369 audit_log_format(ab,
1610 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", 1370 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
1611 context->argv[0], 1371 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
1612 context->argv[1], 1372 " euid=%u suid=%u fsuid=%u"
1613 context->argv[2], 1373 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
1614 context->argv[3], 1374 context->argv[0],
1615 context->name_count); 1375 context->argv[1],
1376 context->argv[2],
1377 context->argv[3],
1378 context->name_count,
1379 context->ppid,
1380 context->pid,
1381 tsk->loginuid,
1382 context->uid,
1383 context->gid,
1384 context->euid, context->suid, context->fsuid,
1385 context->egid, context->sgid, context->fsgid, tty,
1386 tsk->sessionid);
1387
1616 1388
1617 audit_log_task_info(ab, tsk); 1389 audit_log_task_info(ab, tsk);
1618 audit_log_key(ab, context->filterkey); 1390 audit_log_key(ab, context->filterkey);
@@ -1694,14 +1466,70 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1694 if (context->pwd.dentry && context->pwd.mnt) { 1466 if (context->pwd.dentry && context->pwd.mnt) {
1695 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); 1467 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
1696 if (ab) { 1468 if (ab) {
1697 audit_log_d_path(ab, " cwd=", &context->pwd); 1469 audit_log_d_path(ab, "cwd=", &context->pwd);
1698 audit_log_end(ab); 1470 audit_log_end(ab);
1699 } 1471 }
1700 } 1472 }
1473 for (i = 0; i < context->name_count; i++) {
1474 struct audit_names *n = &context->names[i];
1475
1476 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
1477 if (!ab)
1478 continue; /* audit_panic has been called */
1479
1480 audit_log_format(ab, "item=%d", i);
1481
1482 if (n->name) {
1483 switch(n->name_len) {
1484 case AUDIT_NAME_FULL:
1485 /* log the full path */
1486 audit_log_format(ab, " name=");
1487 audit_log_untrustedstring(ab, n->name);
1488 break;
1489 case 0:
1490 /* name was specified as a relative path and the
1491 * directory component is the cwd */
1492 audit_log_d_path(ab, "name=", &context->pwd);
1493 break;
1494 default:
1495 /* log the name's directory component */
1496 audit_log_format(ab, " name=");
1497 audit_log_n_untrustedstring(ab, n->name,
1498 n->name_len);
1499 }
1500 } else
1501 audit_log_format(ab, " name=(null)");
1502
1503 if (n->ino != (unsigned long)-1) {
1504 audit_log_format(ab, " inode=%lu"
1505 " dev=%02x:%02x mode=%#o"
1506 " ouid=%u ogid=%u rdev=%02x:%02x",
1507 n->ino,
1508 MAJOR(n->dev),
1509 MINOR(n->dev),
1510 n->mode,
1511 n->uid,
1512 n->gid,
1513 MAJOR(n->rdev),
1514 MINOR(n->rdev));
1515 }
1516 if (n->osid != 0) {
1517 char *ctx = NULL;
1518 u32 len;
1519 if (security_secid_to_secctx(
1520 n->osid, &ctx, &len)) {
1521 audit_log_format(ab, " osid=%u", n->osid);
1522 call_panic = 2;
1523 } else {
1524 audit_log_format(ab, " obj=%s", ctx);
1525 security_release_secctx(ctx, len);
1526 }
1527 }
1528
1529 audit_log_fcaps(ab, n);
1701 1530
1702 i = 0; 1531 audit_log_end(ab);
1703 list_for_each_entry(n, &context->names_list, list) 1532 }
1704 audit_log_name(context, n, i++, &call_panic);
1705 1533
1706 /* Send end of event record to help user space know we are finished */ 1534 /* Send end of event record to help user space know we are finished */
1707 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); 1535 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
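
The loop restored in this hunk emits one PATH record per collected name, tagged item=N, falling back to name=(null) for anonymous entries. A rough userspace approximation of that per-name output; the struct and the printf format are simplified assumptions, not real audit_log_format() output.

/* Emit one "item=N" line per recorded name, as the PATH loop does. */
#include <stdio.h>

struct name_entry {
	const char   *name;	/* may be NULL for anonymous entries */
	unsigned long ino;
};

int main(void)
{
	struct name_entry names[] = {
		{ "/etc/passwd", 1234 },
		{ NULL,          5678 },	/* inode with no name */
	};
	int i, count = sizeof(names) / sizeof(names[0]);

	for (i = 0; i < count; i++)
		printf("type=PATH item=%d name=%s inode=%lu\n",
		       i,
		       names[i].name ? names[i].name : "(null)",
		       names[i].ino);
	return 0;
}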
@@ -1717,12 +1545,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1717 * 1545 *
1718 * Called from copy_process and do_exit 1546 * Called from copy_process and do_exit
1719 */ 1547 */
1720void __audit_free(struct task_struct *tsk) 1548void audit_free(struct task_struct *tsk)
1721{ 1549{
1722 struct audit_context *context; 1550 struct audit_context *context;
1723 1551
1724 context = audit_get_context(tsk, 0, 0); 1552 context = audit_get_context(tsk, 0, 0);
1725 if (!context) 1553 if (likely(!context))
1726 return; 1554 return;
1727 1555
1728 /* Check for system calls that do not go through the exit 1556 /* Check for system calls that do not go through the exit
@@ -1755,7 +1583,7 @@ void __audit_free(struct task_struct *tsk)
1755 * will only be written if another part of the kernel requests that it 1583 * will only be written if another part of the kernel requests that it
1756 * be written). 1584 * be written).
1757 */ 1585 */
1758void __audit_syscall_entry(int arch, int major, 1586void audit_syscall_entry(int arch, int major,
1759 unsigned long a1, unsigned long a2, 1587 unsigned long a1, unsigned long a2,
1760 unsigned long a3, unsigned long a4) 1588 unsigned long a3, unsigned long a4)
1761{ 1589{
@@ -1763,9 +1591,45 @@ void __audit_syscall_entry(int arch, int major,
1763 struct audit_context *context = tsk->audit_context; 1591 struct audit_context *context = tsk->audit_context;
1764 enum audit_state state; 1592 enum audit_state state;
1765 1593
1766 if (!context) 1594 if (unlikely(!context))
1767 return; 1595 return;
1768 1596
1597 /*
1598 * This happens only on certain architectures that make system
1599 * calls in kernel_thread via the entry.S interface, instead of
1600 * with direct calls. (If you are porting to a new
1601 * architecture, hitting this condition can indicate that you
1602 * got the _exit/_leave calls backward in entry.S.)
1603 *
1604 * i386 no
1605 * x86_64 no
1606 * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
1607 *
1608 * This also happens with vm86 emulation in a non-nested manner
1609 * (entries without exits), so this case must be caught.
1610 */
1611 if (context->in_syscall) {
1612 struct audit_context *newctx;
1613
1614#if AUDIT_DEBUG
1615 printk(KERN_ERR
1616 "audit(:%d) pid=%d in syscall=%d;"
1617 " entering syscall=%d\n",
1618 context->serial, tsk->pid, context->major, major);
1619#endif
1620 newctx = audit_alloc_context(context->state);
1621 if (newctx) {
1622 newctx->previous = context;
1623 context = newctx;
1624 tsk->audit_context = newctx;
1625 } else {
1626 /* If we can't alloc a new context, the best we
1627 * can do is to leak memory (any pending putname
1628 * will be lost). The only other alternative is
1629 * to abandon auditing. */
1630 audit_zero_context(context, context->state);
1631 }
1632 }
1769 BUG_ON(context->in_syscall || context->name_count); 1633 BUG_ON(context->in_syscall || context->name_count);
1770 1634
1771 if (!audit_enabled) 1635 if (!audit_enabled)
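
The re-entry handling restored above pushes a fresh context and links the live one through ->previous instead of overwriting it, falling back to reusing the current context if the allocation fails. A simplified userspace model of that push; the types and helper name are invented for illustration.

/* Push a new context when a syscall entry arrives while one is live. */
#include <stdio.h>
#include <stdlib.h>

struct ctx {
	int in_syscall;
	int major;
	struct ctx *previous;
};

static struct ctx *enter_syscall(struct ctx *cur, int major)
{
	if (cur->in_syscall) {
		struct ctx *fresh = calloc(1, sizeof(*fresh));

		if (!fresh) {
			/* Allocation failed: reuse the current context. */
			cur->major = major;
			return cur;
		}
		fresh->previous = cur;	/* stack the old context */
		cur = fresh;
	}
	cur->in_syscall = 1;
	cur->major = major;
	return cur;
}

int main(void)
{
	struct ctx *c = calloc(1, sizeof(*c));

	if (!c)
		return 1;
	c = enter_syscall(c, 2);	/* first syscall */
	c = enter_syscall(c, 11);	/* nested entry  */
	printf("current major=%d, previous major=%d\n",
	       c->major, c->previous ? c->previous->major : -1);

	while (c) {			/* free the whole chain */
		struct ctx *prev = c->previous;
		free(c);
		c = prev;
	}
	return 0;
}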
@@ -1784,7 +1648,7 @@ void __audit_syscall_entry(int arch, int major,
1784 context->prio = 0; 1648 context->prio = 0;
1785 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); 1649 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
1786 } 1650 }
1787 if (state == AUDIT_DISABLED) 1651 if (likely(state == AUDIT_DISABLED))
1788 return; 1652 return;
1789 1653
1790 context->serial = 0; 1654 context->serial = 0;
@@ -1794,29 +1658,45 @@ void __audit_syscall_entry(int arch, int major,
1794 context->ppid = 0; 1658 context->ppid = 0;
1795} 1659}
1796 1660
1661void audit_finish_fork(struct task_struct *child)
1662{
1663 struct audit_context *ctx = current->audit_context;
1664 struct audit_context *p = child->audit_context;
1665 if (!p || !ctx)
1666 return;
1667 if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
1668 return;
1669 p->arch = ctx->arch;
1670 p->major = ctx->major;
1671 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1672 p->ctime = ctx->ctime;
1673 p->dummy = ctx->dummy;
1674 p->in_syscall = ctx->in_syscall;
1675 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1676 p->ppid = current->pid;
1677 p->prio = ctx->prio;
1678 p->current_state = ctx->current_state;
1679}
1680
1797/** 1681/**
1798 * audit_syscall_exit - deallocate audit context after a system call 1682 * audit_syscall_exit - deallocate audit context after a system call
1799 * @success: success value of the syscall 1683 * @valid: success/failure flag
1800 * @return_code: return value of the syscall 1684 * @return_code: syscall return value
1801 * 1685 *
1802 * Tear down after system call. If the audit context has been marked as 1686 * Tear down after system call. If the audit context has been marked as
1803 * auditable (either because of the AUDIT_RECORD_CONTEXT state from 1687 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
1804 * filtering, or because some other part of the kernel wrote an audit 1688 * filtering, or because some other part of the kernel write an audit
1805 * message), then write out the syscall information. In call cases, 1689 * message), then write out the syscall information. In call cases,
1806 * free the names stored from getname(). 1690 * free the names stored from getname().
1807 */ 1691 */
1808void __audit_syscall_exit(int success, long return_code) 1692void audit_syscall_exit(int valid, long return_code)
1809{ 1693{
1810 struct task_struct *tsk = current; 1694 struct task_struct *tsk = current;
1811 struct audit_context *context; 1695 struct audit_context *context;
1812 1696
1813 if (success) 1697 context = audit_get_context(tsk, valid, return_code);
1814 success = AUDITSC_SUCCESS;
1815 else
1816 success = AUDITSC_FAILURE;
1817 1698
1818 context = audit_get_context(tsk, success, return_code); 1699 if (likely(!context))
1819 if (!context)
1820 return; 1700 return;
1821 1701
1822 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1702 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1828,21 +1708,28 @@ void __audit_syscall_exit(int success, long return_code)
1828 if (!list_empty(&context->killed_trees)) 1708 if (!list_empty(&context->killed_trees))
1829 audit_kill_trees(&context->killed_trees); 1709 audit_kill_trees(&context->killed_trees);
1830 1710
1831 audit_free_names(context); 1711 if (context->previous) {
1832 unroll_tree_refs(context, NULL, 0); 1712 struct audit_context *new_context = context->previous;
1833 audit_free_aux(context); 1713 context->previous = NULL;
1834 context->aux = NULL; 1714 audit_free_context(context);
1835 context->aux_pids = NULL; 1715 tsk->audit_context = new_context;
1836 context->target_pid = 0; 1716 } else {
1837 context->target_sid = 0; 1717 audit_free_names(context);
1838 context->sockaddr_len = 0; 1718 unroll_tree_refs(context, NULL, 0);
1839 context->type = 0; 1719 audit_free_aux(context);
1840 context->fds[0] = -1; 1720 context->aux = NULL;
1841 if (context->state != AUDIT_RECORD_CONTEXT) { 1721 context->aux_pids = NULL;
1842 kfree(context->filterkey); 1722 context->target_pid = 0;
1843 context->filterkey = NULL; 1723 context->target_sid = 0;
1724 context->sockaddr_len = 0;
1725 context->type = 0;
1726 context->fds[0] = -1;
1727 if (context->state != AUDIT_RECORD_CONTEXT) {
1728 kfree(context->filterkey);
1729 context->filterkey = NULL;
1730 }
1731 tsk->audit_context = context;
1844 } 1732 }
1845 tsk->audit_context = context;
1846} 1733}
1847 1734
1848static inline void handle_one(const struct inode *inode) 1735static inline void handle_one(const struct inode *inode)
@@ -1934,55 +1821,6 @@ retry:
1934#endif 1821#endif
1935} 1822}
1936 1823
1937static struct audit_names *audit_alloc_name(struct audit_context *context,
1938 unsigned char type)
1939{
1940 struct audit_names *aname;
1941
1942 if (context->name_count < AUDIT_NAMES) {
1943 aname = &context->preallocated_names[context->name_count];
1944 memset(aname, 0, sizeof(*aname));
1945 } else {
1946 aname = kzalloc(sizeof(*aname), GFP_NOFS);
1947 if (!aname)
1948 return NULL;
1949 aname->should_free = true;
1950 }
1951
1952 aname->ino = (unsigned long)-1;
1953 aname->type = type;
1954 list_add_tail(&aname->list, &context->names_list);
1955
1956 context->name_count++;
1957#if AUDIT_DEBUG
1958 context->ino_count++;
1959#endif
1960 return aname;
1961}
1962
1963/**
1964 * audit_reusename - fill out filename with info from existing entry
1965 * @uptr: userland ptr to pathname
1966 *
1967 * Search the audit_names list for the current audit context. If there is an
1968 * existing entry with a matching "uptr" then return the filename
1969 * associated with that audit_name. If not, return NULL.
1970 */
1971struct filename *
1972__audit_reusename(const __user char *uptr)
1973{
1974 struct audit_context *context = current->audit_context;
1975 struct audit_names *n;
1976
1977 list_for_each_entry(n, &context->names_list, list) {
1978 if (!n->name)
1979 continue;
1980 if (n->name->uptr == uptr)
1981 return n->name;
1982 }
1983 return NULL;
1984}
1985
1986/** 1824/**
1987 * audit_getname - add a name to the list 1825 * audit_getname - add a name to the list
1988 * @name: name to add 1826 * @name: name to add
@@ -1990,10 +1828,12 @@ __audit_reusename(const __user char *uptr)
1990 * Add a name to the list of audit names for this context. 1828 * Add a name to the list of audit names for this context.
1991 * Called from fs/namei.c:getname(). 1829 * Called from fs/namei.c:getname().
1992 */ 1830 */
1993void __audit_getname(struct filename *name) 1831void __audit_getname(const char *name)
1994{ 1832{
1995 struct audit_context *context = current->audit_context; 1833 struct audit_context *context = current->audit_context;
1996 struct audit_names *n; 1834
1835 if (IS_ERR(name) || !name)
1836 return;
1997 1837
1998 if (!context->in_syscall) { 1838 if (!context->in_syscall) {
1999#if AUDIT_DEBUG == 2 1839#if AUDIT_DEBUG == 2
@@ -2003,21 +1843,13 @@ void __audit_getname(struct filename *name)
2003#endif 1843#endif
2004 return; 1844 return;
2005 } 1845 }
2006 1846 BUG_ON(context->name_count >= AUDIT_NAMES);
2007#if AUDIT_DEBUG 1847 context->names[context->name_count].name = name;
2008 /* The filename _must_ have a populated ->name */ 1848 context->names[context->name_count].name_len = AUDIT_NAME_FULL;
2009 BUG_ON(!name->name); 1849 context->names[context->name_count].name_put = 1;
2010#endif 1850 context->names[context->name_count].ino = (unsigned long)-1;
2011 1851 context->names[context->name_count].osid = 0;
2012 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); 1852 ++context->name_count;
2013 if (!n)
2014 return;
2015
2016 n->name = name;
2017 n->name_len = AUDIT_NAME_FULL;
2018 n->name_put = true;
2019 name->aname = n;
2020
2021 if (!context->pwd.dentry) 1853 if (!context->pwd.dentry)
2022 get_fs_pwd(current->fs, &context->pwd); 1854 get_fs_pwd(current->fs, &context->pwd);
2023} 1855}
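
The right-hand side of this hunk returns to a bounded names[] table: each getname() result occupies one of AUDIT_NAMES slots together with a name_put flag that tells the free path whether to release the string. A self-contained sketch of that scheme, using strdup()/free() in place of the kernel's getname()/__putname(); all names and sizes here are assumptions.

/* Bounded name table with a per-slot "put" flag. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define AUDIT_NAMES 20

struct name_slot {
	char *name;
	int   name_put;		/* free this name at syscall exit? */
};

struct toy_context {
	struct name_slot names[AUDIT_NAMES];
	int name_count;
};

static int record_name(struct toy_context *ctx, const char *name)
{
	if (ctx->name_count >= AUDIT_NAMES)
		return -1;	/* table full: the kernel warns and drops */

	ctx->names[ctx->name_count].name = strdup(name);
	ctx->names[ctx->name_count].name_put = 1;
	ctx->name_count++;
	return 0;
}

static void free_names(struct toy_context *ctx)
{
	int i;

	for (i = 0; i < ctx->name_count; i++)
		if (ctx->names[i].name && ctx->names[i].name_put)
			free(ctx->names[i].name);
	ctx->name_count = 0;
}

int main(void)
{
	struct toy_context ctx = { .name_count = 0 };

	record_name(&ctx, "/etc/hosts");
	printf("recorded %d name(s)\n", ctx.name_count);
	free_names(&ctx);
	return 0;
}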
@@ -2029,7 +1861,7 @@ void __audit_getname(struct filename *name)
2029 * then we delay the putname until syscall exit. 1861 * then we delay the putname until syscall exit.
2030 * Called from include/linux/fs.h:putname(). 1862 * Called from include/linux/fs.h:putname().
2031 */ 1863 */
2032void audit_putname(struct filename *name) 1864void audit_putname(const char *name)
2033{ 1865{
2034 struct audit_context *context = current->audit_context; 1866 struct audit_context *context = current->audit_context;
2035 1867
@@ -2039,13 +1871,12 @@ void audit_putname(struct filename *name)
2039 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", 1871 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
2040 __FILE__, __LINE__, context->serial, name); 1872 __FILE__, __LINE__, context->serial, name);
2041 if (context->name_count) { 1873 if (context->name_count) {
2042 struct audit_names *n;
2043 int i; 1874 int i;
2044 1875 for (i = 0; i < context->name_count; i++)
2045 list_for_each_entry(n, &context->names_list, list)
2046 printk(KERN_ERR "name[%d] = %p = %s\n", i, 1876 printk(KERN_ERR "name[%d] = %p = %s\n", i,
2047 n->name, n->name->name ?: "(null)"); 1877 context->names[i].name,
2048 } 1878 context->names[i].name ?: "(null)");
1879 }
2049#endif 1880#endif
2050 __putname(name); 1881 __putname(name);
2051 } 1882 }
@@ -2058,19 +1889,47 @@ void audit_putname(struct filename *name)
2058 " put_count=%d\n", 1889 " put_count=%d\n",
2059 __FILE__, __LINE__, 1890 __FILE__, __LINE__,
2060 context->serial, context->major, 1891 context->serial, context->major,
2061 context->in_syscall, name->name, 1892 context->in_syscall, name, context->name_count,
2062 context->name_count, context->put_count); 1893 context->put_count);
2063 dump_stack(); 1894 dump_stack();
2064 } 1895 }
2065 } 1896 }
2066#endif 1897#endif
2067} 1898}
2068 1899
1900static int audit_inc_name_count(struct audit_context *context,
1901 const struct inode *inode)
1902{
1903 if (context->name_count >= AUDIT_NAMES) {
1904 if (inode)
1905 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1906 "dev=%02x:%02x, inode=%lu\n",
1907 MAJOR(inode->i_sb->s_dev),
1908 MINOR(inode->i_sb->s_dev),
1909 inode->i_ino);
1910
1911 else
1912 printk(KERN_DEBUG "name_count maxed, losing inode data\n");
1913 return 1;
1914 }
1915 context->name_count++;
1916#if AUDIT_DEBUG
1917 context->ino_count++;
1918#endif
1919 return 0;
1920}
1921
1922
2069static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) 1923static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
2070{ 1924{
2071 struct cpu_vfs_cap_data caps; 1925 struct cpu_vfs_cap_data caps;
2072 int rc; 1926 int rc;
2073 1927
1928 memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
1929 memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
1930 name->fcap.fE = 0;
1931 name->fcap_ver = 0;
1932
2074 if (!dentry) 1933 if (!dentry)
2075 return 0; 1934 return 0;
2076 1935
@@ -2102,84 +1961,44 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
2102} 1961}
2103 1962
2104/** 1963/**
2105 * __audit_inode - store the inode and device from a lookup 1964 * audit_inode - store the inode and device from a lookup
2106 * @name: name being audited 1965 * @name: name being audited
2107 * @dentry: dentry being audited 1966 * @dentry: dentry being audited
2108 * @parent: does this dentry represent the parent? 1967 *
1968 * Called from fs/namei.c:path_lookup().
2109 */ 1969 */
2110void __audit_inode(struct filename *name, const struct dentry *dentry, 1970void __audit_inode(const char *name, const struct dentry *dentry)
2111 unsigned int parent)
2112{ 1971{
1972 int idx;
2113 struct audit_context *context = current->audit_context; 1973 struct audit_context *context = current->audit_context;
2114 const struct inode *inode = dentry->d_inode; 1974 const struct inode *inode = dentry->d_inode;
2115 struct audit_names *n;
2116 1975
2117 if (!context->in_syscall) 1976 if (!context->in_syscall)
2118 return; 1977 return;
2119 1978 if (context->name_count
2120 if (!name) 1979 && context->names[context->name_count-1].name
2121 goto out_alloc; 1980 && context->names[context->name_count-1].name == name)
2122 1981 idx = context->name_count - 1;
2123#if AUDIT_DEBUG 1982 else if (context->name_count > 1
2124 /* The struct filename _must_ have a populated ->name */ 1983 && context->names[context->name_count-2].name
2125 BUG_ON(!name->name); 1984 && context->names[context->name_count-2].name == name)
2126#endif 1985 idx = context->name_count - 2;
2127 /* 1986 else {
2128 * If we have a pointer to an audit_names entry already, then we can 1987 /* FIXME: how much do we care about inodes that have no
2129 * just use it directly if the type is correct. 1988 * associated name? */
2130 */ 1989 if (audit_inc_name_count(context, inode))
2131 n = name->aname; 1990 return;
2132 if (n) { 1991 idx = context->name_count - 1;
2133 if (parent) { 1992 context->names[idx].name = NULL;
2134 if (n->type == AUDIT_TYPE_PARENT ||
2135 n->type == AUDIT_TYPE_UNKNOWN)
2136 goto out;
2137 } else {
2138 if (n->type != AUDIT_TYPE_PARENT)
2139 goto out;
2140 }
2141 }
2142
2143 list_for_each_entry_reverse(n, &context->names_list, list) {
2144 /* does the name pointer match? */
2145 if (!n->name || n->name->name != name->name)
2146 continue;
2147
2148 /* match the correct record type */
2149 if (parent) {
2150 if (n->type == AUDIT_TYPE_PARENT ||
2151 n->type == AUDIT_TYPE_UNKNOWN)
2152 goto out;
2153 } else {
2154 if (n->type != AUDIT_TYPE_PARENT)
2155 goto out;
2156 }
2157 }
2158
2159out_alloc:
2160 /* unable to find the name from a previous getname(). Allocate a new
2161 * anonymous entry.
2162 */
2163 n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
2164 if (!n)
2165 return;
2166out:
2167 if (parent) {
2168 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
2169 n->type = AUDIT_TYPE_PARENT;
2170 } else {
2171 n->name_len = AUDIT_NAME_FULL;
2172 n->type = AUDIT_TYPE_NORMAL;
2173 } 1993 }
2174 handle_path(dentry); 1994 handle_path(dentry);
2175 audit_copy_inode(n, dentry, inode); 1995 audit_copy_inode(&context->names[idx], dentry, inode);
2176} 1996}
2177 1997
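
In the older __audit_inode() shown here, the inode data is attached to the most recently recorded name slot whose name pointer matches (only the last two entries are checked), with a fresh anonymous slot as the fallback. A simplified model of that slot-reuse check; find_name_slot() is a made-up helper, not a kernel function.

/* Attach inode info to a matching recent slot, or a new anonymous one. */
#include <stdio.h>

#define MAX_NAMES 4

struct slot {
	const char   *name;
	unsigned long ino;
};

struct toy_ctx {
	struct slot names[MAX_NAMES];
	int count;
};

static int find_name_slot(struct toy_ctx *ctx, const char *name)
{
	/* Only the last two entries are checked, as in the old code. */
	if (ctx->count &&
	    ctx->names[ctx->count - 1].name == name)
		return ctx->count - 1;
	if (ctx->count > 1 &&
	    ctx->names[ctx->count - 2].name == name)
		return ctx->count - 2;
	if (ctx->count >= MAX_NAMES)
		return -1;			/* table full */
	ctx->names[ctx->count].name = NULL;	/* anonymous slot */
	return ctx->count++;
}

int main(void)
{
	struct toy_ctx ctx = { .count = 0 };
	const char *path = "/tmp/file";
	int idx;

	ctx.names[ctx.count].name = path;	/* as getname recording would */
	ctx.count++;

	idx = find_name_slot(&ctx, path);
	if (idx >= 0) {
		ctx.names[idx].ino = 42;
		printf("inode stored in slot %d\n", idx);
	}
	return 0;
}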
2178/** 1998/**
2179 * __audit_inode_child - collect inode info for created/removed objects 1999 * audit_inode_child - collect inode info for created/removed objects
2180 * @parent: inode of dentry parent
2181 * @dentry: dentry being audited 2000 * @dentry: dentry being audited
2182 * @type: AUDIT_TYPE_* value that we're looking for 2001 * @parent: inode of dentry parent
2183 * 2002 *
2184 * For syscalls that create or remove filesystem objects, audit_inode 2003 * For syscalls that create or remove filesystem objects, audit_inode
2185 * can only collect information for the filesystem object's parent. 2004 * can only collect information for the filesystem object's parent.
@@ -2189,14 +2008,15 @@ out:
2189 * must be hooked prior, in order to capture the target inode during 2008 * must be hooked prior, in order to capture the target inode during
2190 * unsuccessful attempts. 2009 * unsuccessful attempts.
2191 */ 2010 */
2192void __audit_inode_child(const struct inode *parent, 2011void __audit_inode_child(const struct dentry *dentry,
2193 const struct dentry *dentry, 2012 const struct inode *parent)
2194 const unsigned char type)
2195{ 2013{
2014 int idx;
2196 struct audit_context *context = current->audit_context; 2015 struct audit_context *context = current->audit_context;
2016 const char *found_parent = NULL, *found_child = NULL;
2197 const struct inode *inode = dentry->d_inode; 2017 const struct inode *inode = dentry->d_inode;
2198 const char *dname = dentry->d_name.name; 2018 const char *dname = dentry->d_name.name;
2199 struct audit_names *n, *found_parent = NULL, *found_child = NULL; 2019 int dirlen = 0;
2200 2020
2201 if (!context->in_syscall) 2021 if (!context->in_syscall)
2202 return; 2022 return;
@@ -2204,65 +2024,71 @@ void __audit_inode_child(const struct inode *parent,
2204 if (inode) 2024 if (inode)
2205 handle_one(inode); 2025 handle_one(inode);
2206 2026
2207 /* look for a parent entry first */ 2027 /* parent is more likely, look for it first */
2208 list_for_each_entry(n, &context->names_list, list) { 2028 for (idx = 0; idx < context->name_count; idx++) {
2209 if (!n->name || n->type != AUDIT_TYPE_PARENT) 2029 struct audit_names *n = &context->names[idx];
2030
2031 if (!n->name)
2210 continue; 2032 continue;
2211 2033
2212 if (n->ino == parent->i_ino && 2034 if (n->ino == parent->i_ino &&
2213 !audit_compare_dname_path(dname, n->name->name, n->name_len)) { 2035 !audit_compare_dname_path(dname, n->name, &dirlen)) {
2214 found_parent = n; 2036 n->name_len = dirlen; /* update parent data in place */
2215 break; 2037 found_parent = n->name;
2038 goto add_names;
2216 } 2039 }
2217 } 2040 }
2218 2041
2219 /* is there a matching child entry? */ 2042 /* no matching parent, look for matching child */
2220 list_for_each_entry(n, &context->names_list, list) { 2043 for (idx = 0; idx < context->name_count; idx++) {
2221 /* can only match entries that have a name */ 2044 struct audit_names *n = &context->names[idx];
2222 if (!n->name || n->type != type)
2223 continue;
2224 2045
2225 /* if we found a parent, make sure this one is a child of it */ 2046 if (!n->name)
2226 if (found_parent && (n->name != found_parent->name))
2227 continue; 2047 continue;
2228 2048
2229 if (!strcmp(dname, n->name->name) || 2049 /* strcmp() is the more likely scenario */
2230 !audit_compare_dname_path(dname, n->name->name, 2050 if (!strcmp(dname, n->name) ||
2231 found_parent ? 2051 !audit_compare_dname_path(dname, n->name, &dirlen)) {
2232 found_parent->name_len : 2052 if (inode)
2233 AUDIT_NAME_FULL)) { 2053 audit_copy_inode(n, NULL, inode);
2234 found_child = n; 2054 else
2235 break; 2055 n->ino = (unsigned long)-1;
2056 found_child = n->name;
2057 goto add_names;
2236 } 2058 }
2237 } 2059 }
2238 2060
2061add_names:
2239 if (!found_parent) { 2062 if (!found_parent) {
2240 /* create a new, "anonymous" parent record */ 2063 if (audit_inc_name_count(context, parent))
2241 n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
2242 if (!n)
2243 return; 2064 return;
2244 audit_copy_inode(n, NULL, parent); 2065 idx = context->name_count - 1;
2066 context->names[idx].name = NULL;
2067 audit_copy_inode(&context->names[idx], NULL, parent);
2245 } 2068 }
2246 2069
2247 if (!found_child) { 2070 if (!found_child) {
2248 found_child = audit_alloc_name(context, type); 2071 if (audit_inc_name_count(context, inode))
2249 if (!found_child)
2250 return; 2072 return;
2073 idx = context->name_count - 1;
2251 2074
2252 /* Re-use the name belonging to the slot for a matching parent 2075 /* Re-use the name belonging to the slot for a matching parent
2253 * directory. All names for this context are relinquished in 2076 * directory. All names for this context are relinquished in
2254 * audit_free_names() */ 2077 * audit_free_names() */
2255 if (found_parent) { 2078 if (found_parent) {
2256 found_child->name = found_parent->name; 2079 context->names[idx].name = found_parent;
2257 found_child->name_len = AUDIT_NAME_FULL; 2080 context->names[idx].name_len = AUDIT_NAME_FULL;
2258 /* don't call __putname() */ 2081 /* don't call __putname() */
2259 found_child->name_put = false; 2082 context->names[idx].name_put = 0;
2083 } else {
2084 context->names[idx].name = NULL;
2260 } 2085 }
2086
2087 if (inode)
2088 audit_copy_inode(&context->names[idx], NULL, inode);
2089 else
2090 context->names[idx].ino = (unsigned long)-1;
2261 } 2091 }
2262 if (inode)
2263 audit_copy_inode(found_child, dentry, inode);
2264 else
2265 found_child->ino = (unsigned long)-1;
2266} 2092}
2267EXPORT_SYMBOL_GPL(__audit_inode_child); 2093EXPORT_SYMBOL_GPL(__audit_inode_child);
2268 2094
@@ -2295,28 +2121,19 @@ int auditsc_get_stamp(struct audit_context *ctx,
2295static atomic_t session_id = ATOMIC_INIT(0); 2121static atomic_t session_id = ATOMIC_INIT(0);
2296 2122
2297/** 2123/**
2298 * audit_set_loginuid - set current task's audit_context loginuid 2124 * audit_set_loginuid - set a task's audit_context loginuid
2125 * @task: task whose audit context is being modified
2299 * @loginuid: loginuid value 2126 * @loginuid: loginuid value
2300 * 2127 *
2301 * Returns 0. 2128 * Returns 0.
2302 * 2129 *
2303 * Called (set) from fs/proc/base.c::proc_loginuid_write(). 2130 * Called (set) from fs/proc/base.c::proc_loginuid_write().
2304 */ 2131 */
2305int audit_set_loginuid(kuid_t loginuid) 2132int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2306{ 2133{
2307 struct task_struct *task = current; 2134 unsigned int sessionid = atomic_inc_return(&session_id);
2308 struct audit_context *context = task->audit_context; 2135 struct audit_context *context = task->audit_context;
2309 unsigned int sessionid;
2310 2136
2311#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
2312 if (uid_valid(task->loginuid))
2313 return -EPERM;
2314#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2315 if (!capable(CAP_AUDIT_CONTROL))
2316 return -EPERM;
2317#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2318
2319 sessionid = atomic_inc_return(&session_id);
2320 if (context && context->in_syscall) { 2137 if (context && context->in_syscall) {
2321 struct audit_buffer *ab; 2138 struct audit_buffer *ab;
2322 2139
@@ -2325,10 +2142,8 @@ int audit_set_loginuid(kuid_t loginuid)
2325 audit_log_format(ab, "login pid=%d uid=%u " 2142 audit_log_format(ab, "login pid=%d uid=%u "
2326 "old auid=%u new auid=%u" 2143 "old auid=%u new auid=%u"
2327 " old ses=%u new ses=%u", 2144 " old ses=%u new ses=%u",
2328 task->pid, 2145 task->pid, task_uid(task),
2329 from_kuid(&init_user_ns, task_uid(task)), 2146 task->loginuid, loginuid,
2330 from_kuid(&init_user_ns, task->loginuid),
2331 from_kuid(&init_user_ns, loginuid),
2332 task->sessionid, sessionid); 2147 task->sessionid, sessionid);
2333 audit_log_end(ab); 2148 audit_log_end(ab);
2334 } 2149 }
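
audit_set_loginuid() draws a new session id from a global counter with atomic_inc_return(). The same idea expressed with C11 atomics in userspace; the counter name is reused here only for illustration.

/* Allocate monotonically increasing session ids from an atomic counter. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint session_id;

static unsigned int new_session(void)
{
	return atomic_fetch_add(&session_id, 1) + 1;	/* inc-and-return */
}

int main(void)
{
	printf("ses=%u\n", new_session());
	printf("ses=%u\n", new_session());
	return 0;
}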
@@ -2345,7 +2160,7 @@ int audit_set_loginuid(kuid_t loginuid)
2345 * @attr: queue attributes 2160 * @attr: queue attributes
2346 * 2161 *
2347 */ 2162 */
2348void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) 2163void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
2349{ 2164{
2350 struct audit_context *context = current->audit_context; 2165 struct audit_context *context = current->audit_context;
2351 2166
@@ -2445,7 +2260,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2445 * 2260 *
2446 * Called only after audit_ipc_obj(). 2261 * Called only after audit_ipc_obj().
2447 */ 2262 */
2448void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) 2263void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
2449{ 2264{
2450 struct audit_context *context = current->audit_context; 2265 struct audit_context *context = current->audit_context;
2451 2266
@@ -2456,11 +2271,14 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
2456 context->ipc.has_perm = 1; 2271 context->ipc.has_perm = 1;
2457} 2272}
2458 2273
2459int __audit_bprm(struct linux_binprm *bprm) 2274int audit_bprm(struct linux_binprm *bprm)
2460{ 2275{
2461 struct audit_aux_data_execve *ax; 2276 struct audit_aux_data_execve *ax;
2462 struct audit_context *context = current->audit_context; 2277 struct audit_context *context = current->audit_context;
2463 2278
2279 if (likely(!audit_enabled || !context || context->dummy))
2280 return 0;
2281
2464 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2282 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2465 if (!ax) 2283 if (!ax)
2466 return -ENOMEM; 2284 return -ENOMEM;
@@ -2481,10 +2299,13 @@ int __audit_bprm(struct linux_binprm *bprm)
2481 * @args: args array 2299 * @args: args array
2482 * 2300 *
2483 */ 2301 */
2484void __audit_socketcall(int nargs, unsigned long *args) 2302void audit_socketcall(int nargs, unsigned long *args)
2485{ 2303{
2486 struct audit_context *context = current->audit_context; 2304 struct audit_context *context = current->audit_context;
2487 2305
2306 if (likely(!context || context->dummy))
2307 return;
2308
2488 context->type = AUDIT_SOCKETCALL; 2309 context->type = AUDIT_SOCKETCALL;
2489 context->socketcall.nargs = nargs; 2310 context->socketcall.nargs = nargs;
2490 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); 2311 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
@@ -2510,10 +2331,13 @@ void __audit_fd_pair(int fd1, int fd2)
2510 * 2331 *
2511 * Returns 0 for success or NULL context or < 0 on error. 2332 * Returns 0 for success or NULL context or < 0 on error.
2512 */ 2333 */
2513int __audit_sockaddr(int len, void *a) 2334int audit_sockaddr(int len, void *a)
2514{ 2335{
2515 struct audit_context *context = current->audit_context; 2336 struct audit_context *context = current->audit_context;
2516 2337
2338 if (likely(!context || context->dummy))
2339 return 0;
2340
2517 if (!context->sockaddr) { 2341 if (!context->sockaddr) {
2518 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); 2342 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
2519 if (!p) 2343 if (!p)
@@ -2551,12 +2375,12 @@ int __audit_signal_info(int sig, struct task_struct *t)
2551 struct audit_aux_data_pids *axp; 2375 struct audit_aux_data_pids *axp;
2552 struct task_struct *tsk = current; 2376 struct task_struct *tsk = current;
2553 struct audit_context *ctx = tsk->audit_context; 2377 struct audit_context *ctx = tsk->audit_context;
2554 kuid_t uid = current_uid(), t_uid = task_uid(t); 2378 uid_t uid = current_uid(), t_uid = task_uid(t);
2555 2379
2556 if (audit_pid && t->tgid == audit_pid) { 2380 if (audit_pid && t->tgid == audit_pid) {
2557 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { 2381 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
2558 audit_sig_pid = tsk->pid; 2382 audit_sig_pid = tsk->pid;
2559 if (uid_valid(tsk->loginuid)) 2383 if (tsk->loginuid != -1)
2560 audit_sig_uid = tsk->loginuid; 2384 audit_sig_uid = tsk->loginuid;
2561 else 2385 else
2562 audit_sig_uid = uid; 2386 audit_sig_uid = uid;
@@ -2675,33 +2499,6 @@ void __audit_mmap_fd(int fd, int flags)
2675 context->type = AUDIT_MMAP; 2499 context->type = AUDIT_MMAP;
2676} 2500}
2677 2501
2678static void audit_log_task(struct audit_buffer *ab)
2679{
2680 kuid_t auid, uid;
2681 kgid_t gid;
2682 unsigned int sessionid;
2683
2684 auid = audit_get_loginuid(current);
2685 sessionid = audit_get_sessionid(current);
2686 current_uid_gid(&uid, &gid);
2687
2688 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2689 from_kuid(&init_user_ns, auid),
2690 from_kuid(&init_user_ns, uid),
2691 from_kgid(&init_user_ns, gid),
2692 sessionid);
2693 audit_log_task_context(ab);
2694 audit_log_format(ab, " pid=%d comm=", current->pid);
2695 audit_log_untrustedstring(ab, current->comm);
2696}
2697
2698static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2699{
2700 audit_log_task(ab);
2701 audit_log_format(ab, " reason=");
2702 audit_log_string(ab, reason);
2703 audit_log_format(ab, " sig=%ld", signr);
2704}
2705/** 2502/**
2706 * audit_core_dumps - record information about processes that end abnormally 2503 * audit_core_dumps - record information about processes that end abnormally
2707 * @signr: signal value 2504 * @signr: signal value
@@ -2712,6 +2509,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2712void audit_core_dumps(long signr) 2509void audit_core_dumps(long signr)
2713{ 2510{
2714 struct audit_buffer *ab; 2511 struct audit_buffer *ab;
2512 u32 sid;
2513 uid_t auid = audit_get_loginuid(current), uid;
2514 gid_t gid;
2515 unsigned int sessionid = audit_get_sessionid(current);
2715 2516
2716 if (!audit_enabled) 2517 if (!audit_enabled)
2717 return; 2518 return;
@@ -2720,25 +2521,24 @@ void audit_core_dumps(long signr)
2720 return; 2521 return;
2721 2522
2722 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2523 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2723 if (unlikely(!ab)) 2524 current_uid_gid(&uid, &gid);
2724 return; 2525 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2725 audit_log_abend(ab, "memory violation", signr); 2526 auid, uid, gid, sessionid);
2726 audit_log_end(ab); 2527 security_task_getsecid(current, &sid);
2727} 2528 if (sid) {
2728 2529 char *ctx = NULL;
2729void __audit_seccomp(unsigned long syscall, long signr, int code) 2530 u32 len;
2730{
2731 struct audit_buffer *ab;
2732 2531
2733 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP); 2532 if (security_secid_to_secctx(sid, &ctx, &len))
2734 if (unlikely(!ab)) 2533 audit_log_format(ab, " ssid=%u", sid);
2735 return; 2534 else {
2736 audit_log_task(ab); 2535 audit_log_format(ab, " subj=%s", ctx);
2536 security_release_secctx(ctx, len);
2537 }
2538 }
2539 audit_log_format(ab, " pid=%d comm=", current->pid);
2540 audit_log_untrustedstring(ab, current->comm);
2737 audit_log_format(ab, " sig=%ld", signr); 2541 audit_log_format(ab, " sig=%ld", signr);
2738 audit_log_format(ab, " syscall=%ld", syscall);
2739 audit_log_format(ab, " compat=%d", is_compat_task());
2740 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
2741 audit_log_format(ab, " code=0x%x", code);
2742 audit_log_end(ab); 2542 audit_log_end(ab);
2743} 2543}
2744 2544
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d9725948..283c529f8b1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -10,7 +10,7 @@
10#include <linux/audit.h> 10#include <linux/audit.h>
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/export.h> 13#include <linux/module.h>
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
@@ -287,84 +287,74 @@ error:
287} 287}
288 288
289/** 289/**
290 * has_ns_capability - Does a task have a capability in a specific user ns 290 * has_capability - Does a task have a capability in init_user_ns
291 * @t: The task in question 291 * @t: The task in question
292 * @ns: target user namespace
293 * @cap: The capability to be tested for 292 * @cap: The capability to be tested for
294 * 293 *
295 * Return true if the specified task has the given superior capability 294 * Return true if the specified task has the given superior capability
296 * currently in effect to the specified user namespace, false if not. 295 * currently in effect to the initial user namespace, false if not.
297 * 296 *
298 * Note that this does not set PF_SUPERPRIV on the task. 297 * Note that this does not set PF_SUPERPRIV on the task.
299 */ 298 */
300bool has_ns_capability(struct task_struct *t, 299bool has_capability(struct task_struct *t, int cap)
301 struct user_namespace *ns, int cap)
302{ 300{
303 int ret; 301 int ret = security_real_capable(t, &init_user_ns, cap);
304
305 rcu_read_lock();
306 ret = security_capable(__task_cred(t), ns, cap);
307 rcu_read_unlock();
308 302
309 return (ret == 0); 303 return (ret == 0);
310} 304}
311 305
312/** 306/**
313 * has_capability - Does a task have a capability in init_user_ns 307 * has_capability - Does a task have a capability in a specific user ns
314 * @t: The task in question 308 * @t: The task in question
309 * @ns: target user namespace
315 * @cap: The capability to be tested for 310 * @cap: The capability to be tested for
316 * 311 *
317 * Return true if the specified task has the given superior capability 312 * Return true if the specified task has the given superior capability
318 * currently in effect to the initial user namespace, false if not. 313 * currently in effect to the specified user namespace, false if not.
319 * 314 *
320 * Note that this does not set PF_SUPERPRIV on the task. 315 * Note that this does not set PF_SUPERPRIV on the task.
321 */ 316 */
322bool has_capability(struct task_struct *t, int cap) 317bool has_ns_capability(struct task_struct *t,
318 struct user_namespace *ns, int cap)
323{ 319{
324 return has_ns_capability(t, &init_user_ns, cap); 320 int ret = security_real_capable(t, ns, cap);
321
322 return (ret == 0);
325} 323}
326 324
327/** 325/**
328 * has_ns_capability_noaudit - Does a task have a capability (unaudited) 326 * has_capability_noaudit - Does a task have a capability (unaudited)
329 * in a specific user ns.
330 * @t: The task in question 327 * @t: The task in question
331 * @ns: target user namespace
332 * @cap: The capability to be tested for 328 * @cap: The capability to be tested for
333 * 329 *
334 * Return true if the specified task has the given superior capability 330 * Return true if the specified task has the given superior capability
335 * currently in effect to the specified user namespace, false if not. 331 * currently in effect to init_user_ns, false if not. Don't write an
336 * Do not write an audit message for the check. 332 * audit message for the check.
337 * 333 *
338 * Note that this does not set PF_SUPERPRIV on the task. 334 * Note that this does not set PF_SUPERPRIV on the task.
339 */ 335 */
340bool has_ns_capability_noaudit(struct task_struct *t, 336bool has_capability_noaudit(struct task_struct *t, int cap)
341 struct user_namespace *ns, int cap)
342{ 337{
343 int ret; 338 int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
344
345 rcu_read_lock();
346 ret = security_capable_noaudit(__task_cred(t), ns, cap);
347 rcu_read_unlock();
348 339
349 return (ret == 0); 340 return (ret == 0);
350} 341}
351 342
352/** 343/**
353 * has_capability_noaudit - Does a task have a capability (unaudited) in the 344 * capable - Determine if the current task has a superior capability in effect
354 * initial user ns
355 * @t: The task in question
356 * @cap: The capability to be tested for 345 * @cap: The capability to be tested for
357 * 346 *
358 * Return true if the specified task has the given superior capability 347 * Return true if the current task has the given superior capability currently
359 * currently in effect to init_user_ns, false if not. Don't write an 348 * available for use, false if not.
360 * audit message for the check.
361 * 349 *
362 * Note that this does not set PF_SUPERPRIV on the task. 350 * This sets PF_SUPERPRIV on the task if the capability is available on the
351 * assumption that it's about to be used.
363 */ 352 */
364bool has_capability_noaudit(struct task_struct *t, int cap) 353bool capable(int cap)
365{ 354{
366 return has_ns_capability_noaudit(t, &init_user_ns, cap); 355 return ns_capable(&init_user_ns, cap);
367} 356}
357EXPORT_SYMBOL(capable);
368 358
369/** 359/**
370 * ns_capable - Determine if the current task has a superior capability in effect 360 * ns_capable - Determine if the current task has a superior capability in effect
@@ -384,7 +374,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
384 BUG(); 374 BUG();
385 } 375 }
386 376
387 if (security_capable(current_cred(), ns, cap) == 0) { 377 if (security_capable(ns, current_cred(), cap) == 0) {
388 current->flags |= PF_SUPERPRIV; 378 current->flags |= PF_SUPERPRIV;
389 return true; 379 return true;
390 } 380 }
@@ -393,20 +383,18 @@ bool ns_capable(struct user_namespace *ns, int cap)
393EXPORT_SYMBOL(ns_capable); 383EXPORT_SYMBOL(ns_capable);
394 384
395/** 385/**
396 * capable - Determine if the current task has a superior capability in effect 386 * task_ns_capable - Determine whether current task has a superior
397 * @cap: The capability to be tested for 387 * capability targeted at a specific task's user namespace.
388 * @t: The task whose user namespace is targeted.
389 * @cap: The capability in question.
398 * 390 *
399 * Return true if the current task has the given superior capability currently 391 * Return true if it does, false otherwise.
400 * available for use, false if not.
401 *
402 * This sets PF_SUPERPRIV on the task if the capability is available on the
403 * assumption that it's about to be used.
404 */ 392 */
405bool capable(int cap) 393bool task_ns_capable(struct task_struct *t, int cap)
406{ 394{
407 return ns_capable(&init_user_ns, cap); 395 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
408} 396}
409EXPORT_SYMBOL(capable); 397EXPORT_SYMBOL(task_ns_capable);
410 398
411/** 399/**
412 * nsown_capable - Check superior capability to one's own user_ns 400 * nsown_capable - Check superior capability to one's own user_ns
@@ -419,24 +407,3 @@ bool nsown_capable(int cap)
419{ 407{
420 return ns_capable(current_user_ns(), cap); 408 return ns_capable(current_user_ns(), cap);
421} 409}
422
423/**
424 * inode_capable - Check superior capability over inode
425 * @inode: The inode in question
426 * @cap: The capability in question
427 *
428 * Return true if the current task has the given superior capability
429 * targeted at it's own user namespace and that the given inode is owned
430 * by the current user namespace or a child namespace.
431 *
432 * Currently we check to see if an inode is owned by the current
433 * user namespace by seeing if the inode's owner maps into the
434 * current user namespace.
435 *
436 */
437bool inode_capable(const struct inode *inode, int cap)
438{
439 struct user_namespace *ns = current_user_ns();
440
441 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
442}
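The kernel-doc comments in this hunk describe a strictly layered API: capable() checks the current task in the initial user namespace, ns_capable() takes an explicit namespace, and both sides of the diff ultimately defer to a security hook. A minimal userspace model of that layering follows, with the LSM call stubbed out and the PF_SUPERPRIV bookkeeping omitted; the struct and helper names are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified model of the layering shown above. The stub stands in
 * for security_capable()/security_real_capable(); 0 means "granted",
 * matching the (ret == 0) convention in the diff.
 */
struct user_namespace { const char *name; };
static struct user_namespace init_user_ns = { "init_user_ns" };

static int security_capable_stub(struct user_namespace *ns, int cap)
{
        (void)ns;
        return cap == 21 ? 0 : -1;   /* pretend only CAP_SYS_ADMIN (21) is granted */
}

static bool ns_capable_model(struct user_namespace *ns, int cap)
{
        return security_capable_stub(ns, cap) == 0;
}

static bool capable_model(int cap)
{
        /* capable() is just ns_capable() pinned to the initial namespace */
        return ns_capable_model(&init_user_ns, cap);
}

int main(void)
{
        printf("capable(21) -> %d\n", capable_model(21));
        printf("capable(7)  -> %d\n", capable_model(7));
        return 0;
}

The has_capability()/has_ns_capability() pair in the hunk follows the same shape, only for an arbitrary task instead of current.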
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798f..54a36fe288f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,40 +60,18 @@
60#include <linux/eventfd.h> 60#include <linux/eventfd.h>
61#include <linux/poll.h> 61#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 62#include <linux/flex_array.h> /* used in cgroup_attach_proc */
63#include <linux/kthread.h>
64 63
65#include <linux/atomic.h> 64#include <linux/atomic.h>
66 65
67/* css deactivation bias, makes css->refcnt negative to deny new trygets */
68#define CSS_DEACT_BIAS INT_MIN
69
70/*
71 * cgroup_mutex is the master lock. Any modification to cgroup or its
72 * hierarchy must be performed while holding it.
73 *
74 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
75 * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
76 * release_agent_path and so on. Modifying requires both cgroup_mutex and
77 * cgroup_root_mutex. Readers can acquire either of the two. This is to
78 * break the following locking order cycle.
79 *
80 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
81 * B. namespace_sem -> cgroup_mutex
82 *
83 * B happens only through cgroup_show_options() and using cgroup_root_mutex
84 * breaks it.
85 */
86static DEFINE_MUTEX(cgroup_mutex); 66static DEFINE_MUTEX(cgroup_mutex);
87static DEFINE_MUTEX(cgroup_root_mutex);
88 67
89/* 68/*
90 * Generate an array of cgroup subsystem pointers. At boot time, this is 69 * Generate an array of cgroup subsystem pointers. At boot time, this is
91 * populated with the built in subsystems, and modular subsystems are 70 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
92 * registered after that. The mutable section of this array is protected by 71 * registered after that. The mutable section of this array is protected by
93 * cgroup_mutex. 72 * cgroup_mutex.
94 */ 73 */
95#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, 74#define SUBSYS(_x) &_x ## _subsys,
96#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
97static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 75static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
98#include <linux/cgroup_subsys.h> 76#include <linux/cgroup_subsys.h>
99}; 77};
@@ -112,13 +90,13 @@ struct cgroupfs_root {
112 * The bitmask of subsystems intended to be attached to this 90 * The bitmask of subsystems intended to be attached to this
113 * hierarchy 91 * hierarchy
114 */ 92 */
115 unsigned long subsys_mask; 93 unsigned long subsys_bits;
116 94
117 /* Unique id for this hierarchy. */ 95 /* Unique id for this hierarchy. */
118 int hierarchy_id; 96 int hierarchy_id;
119 97
120 /* The bitmask of subsystems currently attached to this hierarchy */ 98 /* The bitmask of subsystems currently attached to this hierarchy */
121 unsigned long actual_subsys_mask; 99 unsigned long actual_subsys_bits;
122 100
123 /* A list running through the attached subsystems */ 101 /* A list running through the attached subsystems */
124 struct list_head subsys_list; 102 struct list_head subsys_list;
@@ -132,15 +110,9 @@ struct cgroupfs_root {
132 /* A list running through the active hierarchies */ 110 /* A list running through the active hierarchies */
133 struct list_head root_list; 111 struct list_head root_list;
134 112
135 /* All cgroups on this root, cgroup_mutex protected */
136 struct list_head allcg_list;
137
138 /* Hierarchy-specific flags */ 113 /* Hierarchy-specific flags */
139 unsigned long flags; 114 unsigned long flags;
140 115
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
144 /* The path to use for release notifications. */ 116 /* The path to use for release notifications. */
145 char release_agent_path[PATH_MAX]; 117 char release_agent_path[PATH_MAX];
146 118
@@ -156,15 +128,6 @@ struct cgroupfs_root {
156static struct cgroupfs_root rootnode; 128static struct cgroupfs_root rootnode;
157 129
158/* 130/*
159 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
160 */
161struct cfent {
162 struct list_head node;
163 struct dentry *dentry;
164 struct cftype *type;
165};
166
167/*
168 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when 131 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
169 * cgroup_subsys->use_id != 0. 132 * cgroup_subsys->use_id != 0.
170 */ 133 */
@@ -174,8 +137,8 @@ struct css_id {
174 * The css to which this ID points. This pointer is set to valid value 137 * The css to which this ID points. This pointer is set to valid value
175 * after cgroup is populated. If cgroup is removed, this will be NULL. 138 * after cgroup is populated. If cgroup is removed, this will be NULL.
176 * This pointer is expected to be RCU-safe because destroy() 139 * This pointer is expected to be RCU-safe because destroy()
177 * is called after synchronize_rcu(). But for safe use, css_tryget() 140 * is called after synchronize_rcu(). But for safe use, css_is_removed()
178 * should be used for avoiding race. 141 * css_tryget() should be used for avoiding race.
179 */ 142 */
180 struct cgroup_subsys_state __rcu *css; 143 struct cgroup_subsys_state __rcu *css;
181 /* 144 /*
@@ -245,10 +208,6 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
245 */ 208 */
246static int need_forkexit_callback __read_mostly; 209static int need_forkexit_callback __read_mostly;
247 210
248static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add);
251
252#ifdef CONFIG_PROVE_LOCKING 211#ifdef CONFIG_PROVE_LOCKING
253int cgroup_lock_is_held(void) 212int cgroup_lock_is_held(void)
254{ 213{
@@ -263,19 +222,6 @@ int cgroup_lock_is_held(void)
263 222
264EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 223EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
265 224
266static int css_unbias_refcnt(int refcnt)
267{
268 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
269}
270
271/* the current nr of refs, always >= 0 whether @css is deactivated or not */
272static int css_refcnt(struct cgroup_subsys_state *css)
273{
274 int v = atomic_read(&css->refcnt);
275
276 return css_unbias_refcnt(v);
277}
278
279/* convenient tests for these bits */ 225/* convenient tests for these bits */
280inline int cgroup_is_removed(const struct cgroup *cgrp) 226inline int cgroup_is_removed(const struct cgroup *cgrp)
281{ 227{
@@ -284,8 +230,7 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
284 230
285/* bits in struct cgroupfs_root flags field */ 231/* bits in struct cgroupfs_root flags field */
286enum { 232enum {
287 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 233 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
288 ROOT_XATTR, /* supports extended attributes */
289}; 234};
290 235
291static int cgroup_is_releasable(const struct cgroup *cgrp) 236static int cgroup_is_releasable(const struct cgroup *cgrp)
@@ -301,6 +246,11 @@ static int notify_on_release(const struct cgroup *cgrp)
301 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 246 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
302} 247}
303 248
249static int clone_children(const struct cgroup *cgrp)
250{
251 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
252}
253
304/* 254/*
305 * for_each_subsys() allows you to iterate on each subsystem attached to 255 * for_each_subsys() allows you to iterate on each subsystem attached to
306 * an active hierarchy 256 * an active hierarchy
@@ -312,29 +262,41 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
312#define for_each_active_root(_root) \ 262#define for_each_active_root(_root) \
313list_for_each_entry(_root, &roots, root_list) 263list_for_each_entry(_root, &roots, root_list)
314 264
315static inline struct cgroup *__d_cgrp(struct dentry *dentry) 265/* the list of cgroups eligible for automatic release. Protected by
266 * release_list_lock */
267static LIST_HEAD(release_list);
268static DEFINE_SPINLOCK(release_list_lock);
269static void cgroup_release_agent(struct work_struct *work);
270static DECLARE_WORK(release_agent_work, cgroup_release_agent);
271static void check_for_release(struct cgroup *cgrp);
272
273/*
274 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
275 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
276 * reference to css->refcnt. In general, this refcnt is expected to goes down
277 * to zero, soon.
278 *
279 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
280 */
281DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
282
283static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
316{ 284{
317 return dentry->d_fsdata; 285 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
286 wake_up_all(&cgroup_rmdir_waitq);
318} 287}
319 288
320static inline struct cfent *__d_cfe(struct dentry *dentry) 289void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
321{ 290{
322 return dentry->d_fsdata; 291 css_get(css);
323} 292}
324 293
325static inline struct cftype *__d_cft(struct dentry *dentry) 294void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
326{ 295{
327 return __d_cfe(dentry)->type; 296 cgroup_wakeup_rmdir_waiter(css->cgroup);
297 css_put(css);
328} 298}
329 299
330/* the list of cgroups eligible for automatic release. Protected by
331 * release_list_lock */
332static LIST_HEAD(release_list);
333static DEFINE_RAW_SPINLOCK(release_list_lock);
334static void cgroup_release_agent(struct work_struct *work);
335static DECLARE_WORK(release_agent_work, cgroup_release_agent);
336static void check_for_release(struct cgroup *cgrp);
337
338/* Link structure for associating css_set objects with cgroups */ 300/* Link structure for associating css_set objects with cgroups */
339struct cg_cgroup_link { 301struct cg_cgroup_link {
340 /* 302 /*
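The block restored here documents a small handshake: an rmdir() caller that finds the cgroup still pinned sets CGRP_WAIT_ON_RMDIR and sleeps on cgroup_rmdir_waitq, and whoever drops the last pinning reference clears the bit and wakes all waiters. A hedged userspace analog of that handshake, with a condition variable standing in for the waitqueue; the flag, counter and function names are invented for the example.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t rmdir_waitq = PTHREAD_COND_INITIALIZER;
static bool wait_on_rmdir;      /* plays the role of CGRP_WAIT_ON_RMDIR */
static int busy_refs = 1;       /* the reference that keeps rmdir() waiting */

static void *release_side(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        busy_refs--;                            /* last reference goes away */
        if (wait_on_rmdir) {                    /* test_and_clear_bit(...) */
                wait_on_rmdir = false;
                pthread_cond_broadcast(&rmdir_waitq);   /* wake_up_all() */
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_mutex_lock(&lock);
        wait_on_rmdir = true;                   /* rmdir() decides it must wait */
        pthread_create(&t, NULL, release_side, NULL);
        while (busy_refs > 0)
                pthread_cond_wait(&rmdir_waitq, &lock);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        printf("rmdir can proceed\n");
        return 0;
}

Broadcasting rather than signalling mirrors wake_up_all(): every parked rmdir() attempt gets to re-check its condition.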
@@ -394,52 +356,43 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
394 return &css_set_table[index]; 356 return &css_set_table[index];
395} 357}
396 358
397/* We don't maintain the lists running through each css_set to its 359static void free_css_set_work(struct work_struct *work)
398 * task until after the first call to cgroup_iter_start(). This
399 * reduces the fork()/exit() overhead for people who have cgroups
400 * compiled into their kernel but not actually in use */
401static int use_task_css_set_links __read_mostly;
402
403static void __put_css_set(struct css_set *cg, int taskexit)
404{ 360{
361 struct css_set *cg = container_of(work, struct css_set, work);
405 struct cg_cgroup_link *link; 362 struct cg_cgroup_link *link;
406 struct cg_cgroup_link *saved_link; 363 struct cg_cgroup_link *saved_link;
407 /*
408 * Ensure that the refcount doesn't hit zero while any readers
409 * can see it. Similar to atomic_dec_and_lock(), but for an
410 * rwlock
411 */
412 if (atomic_add_unless(&cg->refcount, -1, 1))
413 return;
414 write_lock(&css_set_lock);
415 if (!atomic_dec_and_test(&cg->refcount)) {
416 write_unlock(&css_set_lock);
417 return;
418 }
419
420 /* This css_set is dead. unlink it and release cgroup refcounts */
421 hlist_del(&cg->hlist);
422 css_set_count--;
423 364
365 write_lock(&css_set_lock);
424 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 366 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
425 cg_link_list) { 367 cg_link_list) {
426 struct cgroup *cgrp = link->cgrp; 368 struct cgroup *cgrp = link->cgrp;
427 list_del(&link->cg_link_list); 369 list_del(&link->cg_link_list);
428 list_del(&link->cgrp_link_list); 370 list_del(&link->cgrp_link_list);
429 if (atomic_dec_and_test(&cgrp->count) && 371 if (atomic_dec_and_test(&cgrp->count)) {
430 notify_on_release(cgrp)) {
431 if (taskexit)
432 set_bit(CGRP_RELEASABLE, &cgrp->flags);
433 check_for_release(cgrp); 372 check_for_release(cgrp);
373 cgroup_wakeup_rmdir_waiter(cgrp);
434 } 374 }
435
436 kfree(link); 375 kfree(link);
437 } 376 }
438
439 write_unlock(&css_set_lock); 377 write_unlock(&css_set_lock);
440 kfree_rcu(cg, rcu_head); 378
379 kfree(cg);
380}
381
382static void free_css_set_rcu(struct rcu_head *obj)
383{
384 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
385
386 INIT_WORK(&cg->work, free_css_set_work);
387 schedule_work(&cg->work);
441} 388}
442 389
390/* We don't maintain the lists running through each css_set to its
391 * task until after the first call to cgroup_iter_start(). This
392 * reduces the fork()/exit() overhead for people who have cgroups
393 * compiled into their kernel but not actually in use */
394static int use_task_css_set_links __read_mostly;
395
443/* 396/*
444 * refcounted get/put for css_set objects 397 * refcounted get/put for css_set objects
445 */ 398 */
@@ -448,14 +401,26 @@ static inline void get_css_set(struct css_set *cg)
448 atomic_inc(&cg->refcount); 401 atomic_inc(&cg->refcount);
449} 402}
450 403
451static inline void put_css_set(struct css_set *cg) 404static void put_css_set(struct css_set *cg)
452{ 405{
453 __put_css_set(cg, 0); 406 /*
454} 407 * Ensure that the refcount doesn't hit zero while any readers
408 * can see it. Similar to atomic_dec_and_lock(), but for an
409 * rwlock
410 */
411 if (atomic_add_unless(&cg->refcount, -1, 1))
412 return;
413 write_lock(&css_set_lock);
414 if (!atomic_dec_and_test(&cg->refcount)) {
415 write_unlock(&css_set_lock);
416 return;
417 }
455 418
456static inline void put_css_set_taskexit(struct css_set *cg) 419 hlist_del(&cg->hlist);
457{ 420 css_set_count--;
458 __put_css_set(cg, 1); 421
422 write_unlock(&css_set_lock);
423 call_rcu(&cg->rcu_head, free_css_set_rcu);
459} 424}
460 425
461/* 426/*
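put_css_set() spells the idiom out in its comment: drop references without css_set_lock as long as the count cannot reach zero, and take the writer lock only for the decrement that might, so a reader holding the lock never sees a linked css_set with a zero count. Below is a runnable userspace sketch of that dec-and-lock pattern, assuming C11 atomics and a pthread rwlock; the struct and its fields are invented for the example.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        atomic_uint refcount;
        pthread_rwlock_t lock;  /* stands in for css_set_lock */
        bool linked;            /* stands in for hash-list membership */
};

/* returns true if this call dropped the last reference */
static bool put_obj(struct obj *o)
{
        unsigned int old = atomic_load(&o->refcount);

        /* fast path, analogous to atomic_add_unless(&cg->refcount, -1, 1) */
        while (old > 1) {
                if (atomic_compare_exchange_weak(&o->refcount, &old, old - 1))
                        return false;           /* not the last reference */
        }

        /* slow path: the final decrement happens under the writer lock */
        pthread_rwlock_wrlock(&o->lock);
        if (atomic_fetch_sub(&o->refcount, 1) != 1) {
                /* someone re-took a reference in the meantime */
                pthread_rwlock_unlock(&o->lock);
                return false;
        }
        o->linked = false;                      /* hlist_del() equivalent */
        pthread_rwlock_unlock(&o->lock);
        return true;                            /* caller may now free o */
}

int main(void)
{
        struct obj *o = calloc(1, sizeof(*o));

        atomic_init(&o->refcount, 2);
        pthread_rwlock_init(&o->lock, NULL);
        o->linked = true;

        printf("first put frees?  %d\n", put_obj(o));    /* 0 */
        printf("second put frees? %d\n", put_obj(o));    /* 1 */
        free(o);
        return 0;
}

The fast path plays the role of atomic_add_unless(&cg->refcount, -1, 1); the slow path mirrors the write_lock()/atomic_dec_and_test() pair in the hunk.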
@@ -560,7 +525,7 @@ static struct css_set *find_existing_css_set(
560 * won't change, so no need for locking. 525 * won't change, so no need for locking.
561 */ 526 */
562 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 527 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
563 if (root->subsys_mask & (1UL << i)) { 528 if (root->subsys_bits & (1UL << i)) {
564 /* Subsystem is in this hierarchy. So we want 529 /* Subsystem is in this hierarchy. So we want
565 * the subsystem state from the new 530 * the subsystem state from the new
566 * cgroup */ 531 * cgroup */
@@ -784,12 +749,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
784 * The task_lock() exception 749 * The task_lock() exception
785 * 750 *
786 * The need for this exception arises from the action of 751 * The need for this exception arises from the action of
787 * cgroup_attach_task(), which overwrites one task's cgroup pointer with 752 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
788 * another. It does so using cgroup_mutex, however there are 753 * another. It does so using cgroup_mutex, however there are
789 * several performance critical places that need to reference 754 * several performance critical places that need to reference
790 * task->cgroup without the expense of grabbing a system global 755 * task->cgroups without the expense of grabbing a system global
791 * mutex. Therefore except as noted below, when dereferencing or, as 756 * mutex. Therefore except as noted below, when dereferencing or, as
792 * in cgroup_attach_task(), modifying a task's cgroup pointer we use 757 * in cgroup_attach_task(), modifying a task's cgroups pointer we use
793 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 758 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
794 * the task_struct routinely used for such matters. 759 * the task_struct routinely used for such matters.
795 * 760 *
@@ -825,11 +790,10 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
825 * -> cgroup_mkdir. 790 * -> cgroup_mkdir.
826 */ 791 */
827 792
828static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 793static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
829static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); 794static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
830static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 795static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
831static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 796static int cgroup_populate_dir(struct cgroup *cgrp);
832 unsigned long subsys_mask);
833static const struct inode_operations cgroup_dir_inode_operations; 797static const struct inode_operations cgroup_dir_inode_operations;
834static const struct file_operations proc_cgroupstats_operations; 798static const struct file_operations proc_cgroupstats_operations;
835 799
@@ -841,7 +805,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
841static int alloc_css_id(struct cgroup_subsys *ss, 805static int alloc_css_id(struct cgroup_subsys *ss,
842 struct cgroup *parent, struct cgroup *child); 806 struct cgroup *parent, struct cgroup *child);
843 807
844static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 808static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
845{ 809{
846 struct inode *inode = new_inode(sb); 810 struct inode *inode = new_inode(sb);
847 811
@@ -856,6 +820,25 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
856 return inode; 820 return inode;
857} 821}
858 822
823/*
824 * Call subsys's pre_destroy handler.
825 * This is called before css refcnt check.
826 */
827static int cgroup_call_pre_destroy(struct cgroup *cgrp)
828{
829 struct cgroup_subsys *ss;
830 int ret = 0;
831
832 for_each_subsys(cgrp->root, ss)
833 if (ss->pre_destroy) {
834 ret = ss->pre_destroy(ss, cgrp);
835 if (ret)
836 break;
837 }
838
839 return ret;
840}
841
859static void cgroup_diput(struct dentry *dentry, struct inode *inode) 842static void cgroup_diput(struct dentry *dentry, struct inode *inode)
860{ 843{
861 /* is dentry a directory ? if so, kfree() associated cgroup */ 844 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -876,7 +859,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
876 * Release the subsystem state objects. 859 * Release the subsystem state objects.
877 */ 860 */
878 for_each_subsys(cgrp->root, ss) 861 for_each_subsys(cgrp->root, ss)
879 ss->css_free(cgrp); 862 ss->destroy(ss, cgrp);
880 863
881 cgrp->root->number_of_cgroups--; 864 cgrp->root->number_of_cgroups--;
882 mutex_unlock(&cgroup_mutex); 865 mutex_unlock(&cgroup_mutex);
@@ -893,20 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
893 */ 876 */
894 BUG_ON(!list_empty(&cgrp->pidlists)); 877 BUG_ON(!list_empty(&cgrp->pidlists));
895 878
896 simple_xattrs_free(&cgrp->xattrs);
897
898 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
899 kfree_rcu(cgrp, rcu_head); 879 kfree_rcu(cgrp, rcu_head);
900 } else {
901 struct cfent *cfe = __d_cfe(dentry);
902 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
903 struct cftype *cft = cfe->type;
904
905 WARN_ONCE(!list_empty(&cfe->node) &&
906 cgrp != &cgrp->root->top_cgroup,
907 "cfe still linked for %s\n", cfe->type->name);
908 kfree(cfe);
909 simple_xattrs_free(&cft->xattrs);
910 } 880 }
911 iput(inode); 881 iput(inode);
912} 882}
@@ -925,53 +895,34 @@ static void remove_dir(struct dentry *d)
925 dput(parent); 895 dput(parent);
926} 896}
927 897
928static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 898static void cgroup_clear_directory(struct dentry *dentry)
929{ 899{
930 struct cfent *cfe; 900 struct list_head *node;
931 901
932 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 902 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
933 lockdep_assert_held(&cgroup_mutex); 903 spin_lock(&dentry->d_lock);
934 904 node = dentry->d_subdirs.next;
935 list_for_each_entry(cfe, &cgrp->files, node) { 905 while (node != &dentry->d_subdirs) {
936 struct dentry *d = cfe->dentry; 906 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
937 907
938 if (cft && cfe->type != cft) 908 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
939 continue; 909 list_del_init(node);
940 910 if (d->d_inode) {
941 dget(d); 911 /* This should never be called on a cgroup
942 d_delete(d); 912 * directory with child cgroups */
943 simple_unlink(cgrp->dentry->d_inode, d); 913 BUG_ON(d->d_inode->i_mode & S_IFDIR);
944 list_del_init(&cfe->node); 914 dget_dlock(d);
945 dput(d); 915 spin_unlock(&d->d_lock);
946 916 spin_unlock(&dentry->d_lock);
947 return 0; 917 d_delete(d);
948 } 918 simple_unlink(dentry->d_inode, d);
949 return -ENOENT; 919 dput(d);
950} 920 spin_lock(&dentry->d_lock);
951 921 } else
952/** 922 spin_unlock(&d->d_lock);
953 * cgroup_clear_directory - selective removal of base and subsystem files 923 node = dentry->d_subdirs.next;
954 * @dir: directory containing the files
955 * @base_files: true if the base files should be removed
956 * @subsys_mask: mask of the subsystem ids whose files should be removed
957 */
958static void cgroup_clear_directory(struct dentry *dir, bool base_files,
959 unsigned long subsys_mask)
960{
961 struct cgroup *cgrp = __d_cgrp(dir);
962 struct cgroup_subsys *ss;
963
964 for_each_subsys(cgrp->root, ss) {
965 struct cftype_set *set;
966 if (!test_bit(ss->subsys_id, &subsys_mask))
967 continue;
968 list_for_each_entry(set, &ss->cftsets, node)
969 cgroup_addrm_files(cgrp, NULL, set->cfts, false);
970 }
971 if (base_files) {
972 while (!list_empty(&cgrp->files))
973 cgroup_rm_file(cgrp, NULL);
974 } 924 }
925 spin_unlock(&dentry->d_lock);
975} 926}
976 927
977/* 928/*
@@ -980,9 +931,8 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
980static void cgroup_d_remove_dir(struct dentry *dentry) 931static void cgroup_d_remove_dir(struct dentry *dentry)
981{ 932{
982 struct dentry *parent; 933 struct dentry *parent;
983 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
984 934
985 cgroup_clear_directory(dentry, true, root->subsys_mask); 935 cgroup_clear_directory(dentry);
986 936
987 parent = dentry->d_parent; 937 parent = dentry->d_parent;
988 spin_lock(&parent->d_lock); 938 spin_lock(&parent->d_lock);
@@ -999,22 +949,21 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
999 * returns an error, no reference counts are touched. 949 * returns an error, no reference counts are touched.
1000 */ 950 */
1001static int rebind_subsystems(struct cgroupfs_root *root, 951static int rebind_subsystems(struct cgroupfs_root *root,
1002 unsigned long final_subsys_mask) 952 unsigned long final_bits)
1003{ 953{
1004 unsigned long added_mask, removed_mask; 954 unsigned long added_bits, removed_bits;
1005 struct cgroup *cgrp = &root->top_cgroup; 955 struct cgroup *cgrp = &root->top_cgroup;
1006 int i; 956 int i;
1007 957
1008 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 958 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1009 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1010 959
1011 removed_mask = root->actual_subsys_mask & ~final_subsys_mask; 960 removed_bits = root->actual_subsys_bits & ~final_bits;
1012 added_mask = final_subsys_mask & ~root->actual_subsys_mask; 961 added_bits = final_bits & ~root->actual_subsys_bits;
1013 /* Check that any added subsystems are currently free */ 962 /* Check that any added subsystems are currently free */
1014 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 963 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1015 unsigned long bit = 1UL << i; 964 unsigned long bit = 1UL << i;
1016 struct cgroup_subsys *ss = subsys[i]; 965 struct cgroup_subsys *ss = subsys[i];
1017 if (!(bit & added_mask)) 966 if (!(bit & added_bits))
1018 continue; 967 continue;
1019 /* 968 /*
1020 * Nobody should tell us to do a subsys that doesn't exist: 969 * Nobody should tell us to do a subsys that doesn't exist:
@@ -1039,33 +988,37 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1039 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 988 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1040 struct cgroup_subsys *ss = subsys[i]; 989 struct cgroup_subsys *ss = subsys[i];
1041 unsigned long bit = 1UL << i; 990 unsigned long bit = 1UL << i;
1042 if (bit & added_mask) { 991 if (bit & added_bits) {
1043 /* We're binding this subsystem to this hierarchy */ 992 /* We're binding this subsystem to this hierarchy */
1044 BUG_ON(ss == NULL); 993 BUG_ON(ss == NULL);
1045 BUG_ON(cgrp->subsys[i]); 994 BUG_ON(cgrp->subsys[i]);
1046 BUG_ON(!dummytop->subsys[i]); 995 BUG_ON(!dummytop->subsys[i]);
1047 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 996 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
997 mutex_lock(&ss->hierarchy_mutex);
1048 cgrp->subsys[i] = dummytop->subsys[i]; 998 cgrp->subsys[i] = dummytop->subsys[i];
1049 cgrp->subsys[i]->cgroup = cgrp; 999 cgrp->subsys[i]->cgroup = cgrp;
1050 list_move(&ss->sibling, &root->subsys_list); 1000 list_move(&ss->sibling, &root->subsys_list);
1051 ss->root = root; 1001 ss->root = root;
1052 if (ss->bind) 1002 if (ss->bind)
1053 ss->bind(cgrp); 1003 ss->bind(ss, cgrp);
1004 mutex_unlock(&ss->hierarchy_mutex);
1054 /* refcount was already taken, and we're keeping it */ 1005 /* refcount was already taken, and we're keeping it */
1055 } else if (bit & removed_mask) { 1006 } else if (bit & removed_bits) {
1056 /* We're removing this subsystem */ 1007 /* We're removing this subsystem */
1057 BUG_ON(ss == NULL); 1008 BUG_ON(ss == NULL);
1058 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 1009 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
1059 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1010 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1011 mutex_lock(&ss->hierarchy_mutex);
1060 if (ss->bind) 1012 if (ss->bind)
1061 ss->bind(dummytop); 1013 ss->bind(ss, dummytop);
1062 dummytop->subsys[i]->cgroup = dummytop; 1014 dummytop->subsys[i]->cgroup = dummytop;
1063 cgrp->subsys[i] = NULL; 1015 cgrp->subsys[i] = NULL;
1064 subsys[i]->root = &rootnode; 1016 subsys[i]->root = &rootnode;
1065 list_move(&ss->sibling, &rootnode.subsys_list); 1017 list_move(&ss->sibling, &rootnode.subsys_list);
1018 mutex_unlock(&ss->hierarchy_mutex);
1066 /* subsystem is now free - drop reference on module */ 1019 /* subsystem is now free - drop reference on module */
1067 module_put(ss->module); 1020 module_put(ss->module);
1068 } else if (bit & final_subsys_mask) { 1021 } else if (bit & final_bits) {
1069 /* Subsystem state should already exist */ 1022 /* Subsystem state should already exist */
1070 BUG_ON(ss == NULL); 1023 BUG_ON(ss == NULL);
1071 BUG_ON(!cgrp->subsys[i]); 1024 BUG_ON(!cgrp->subsys[i]);
@@ -1082,39 +1035,37 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1082 BUG_ON(cgrp->subsys[i]); 1035 BUG_ON(cgrp->subsys[i]);
1083 } 1036 }
1084 } 1037 }
1085 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1038 root->subsys_bits = root->actual_subsys_bits = final_bits;
1086 synchronize_rcu(); 1039 synchronize_rcu();
1087 1040
1088 return 0; 1041 return 0;
1089} 1042}
1090 1043
1091static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1044static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1092{ 1045{
1093 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1046 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
1094 struct cgroup_subsys *ss; 1047 struct cgroup_subsys *ss;
1095 1048
1096 mutex_lock(&cgroup_root_mutex); 1049 mutex_lock(&cgroup_mutex);
1097 for_each_subsys(root, ss) 1050 for_each_subsys(root, ss)
1098 seq_printf(seq, ",%s", ss->name); 1051 seq_printf(seq, ",%s", ss->name);
1099 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1052 if (test_bit(ROOT_NOPREFIX, &root->flags))
1100 seq_puts(seq, ",noprefix"); 1053 seq_puts(seq, ",noprefix");
1101 if (test_bit(ROOT_XATTR, &root->flags))
1102 seq_puts(seq, ",xattr");
1103 if (strlen(root->release_agent_path)) 1054 if (strlen(root->release_agent_path))
1104 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1055 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1105 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) 1056 if (clone_children(&root->top_cgroup))
1106 seq_puts(seq, ",clone_children"); 1057 seq_puts(seq, ",clone_children");
1107 if (strlen(root->name)) 1058 if (strlen(root->name))
1108 seq_printf(seq, ",name=%s", root->name); 1059 seq_printf(seq, ",name=%s", root->name);
1109 mutex_unlock(&cgroup_root_mutex); 1060 mutex_unlock(&cgroup_mutex);
1110 return 0; 1061 return 0;
1111} 1062}
1112 1063
1113struct cgroup_sb_opts { 1064struct cgroup_sb_opts {
1114 unsigned long subsys_mask; 1065 unsigned long subsys_bits;
1115 unsigned long flags; 1066 unsigned long flags;
1116 char *release_agent; 1067 char *release_agent;
1117 bool cpuset_clone_children; 1068 bool clone_children;
1118 char *name; 1069 char *name;
1119 /* User explicitly requested empty subsystem */ 1070 /* User explicitly requested empty subsystem */
1120 bool none; 1071 bool none;
@@ -1165,11 +1116,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1165 continue; 1116 continue;
1166 } 1117 }
1167 if (!strcmp(token, "clone_children")) { 1118 if (!strcmp(token, "clone_children")) {
1168 opts->cpuset_clone_children = true; 1119 opts->clone_children = true;
1169 continue;
1170 }
1171 if (!strcmp(token, "xattr")) {
1172 set_bit(ROOT_XATTR, &opts->flags);
1173 continue; 1120 continue;
1174 } 1121 }
1175 if (!strncmp(token, "release_agent=", 14)) { 1122 if (!strncmp(token, "release_agent=", 14)) {
@@ -1220,7 +1167,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1220 /* Mutually exclusive option 'all' + subsystem name */ 1167 /* Mutually exclusive option 'all' + subsystem name */
1221 if (all_ss) 1168 if (all_ss)
1222 return -EINVAL; 1169 return -EINVAL;
1223 set_bit(i, &opts->subsys_mask); 1170 set_bit(i, &opts->subsys_bits);
1224 one_ss = true; 1171 one_ss = true;
1225 1172
1226 break; 1173 break;
@@ -1241,7 +1188,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1241 continue; 1188 continue;
1242 if (ss->disabled) 1189 if (ss->disabled)
1243 continue; 1190 continue;
1244 set_bit(i, &opts->subsys_mask); 1191 set_bit(i, &opts->subsys_bits);
1245 } 1192 }
1246 } 1193 }
1247 1194
@@ -1253,19 +1200,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1253 * the cpuset subsystem. 1200 * the cpuset subsystem.
1254 */ 1201 */
1255 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1202 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
1256 (opts->subsys_mask & mask)) 1203 (opts->subsys_bits & mask))
1257 return -EINVAL; 1204 return -EINVAL;
1258 1205
1259 1206
1260 /* Can't specify "none" and some subsystems */ 1207 /* Can't specify "none" and some subsystems */
1261 if (opts->subsys_mask && opts->none) 1208 if (opts->subsys_bits && opts->none)
1262 return -EINVAL; 1209 return -EINVAL;
1263 1210
1264 /* 1211 /*
1265 * We either have to specify by name or by subsystems. (So all 1212 * We either have to specify by name or by subsystems. (So all
1266 * empty hierarchies must have a name). 1213 * empty hierarchies must have a name).
1267 */ 1214 */
1268 if (!opts->subsys_mask && !opts->name) 1215 if (!opts->subsys_bits && !opts->name)
1269 return -EINVAL; 1216 return -EINVAL;
1270 1217
1271 /* 1218 /*
@@ -1274,10 +1221,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1274 * take duplicate reference counts on a subsystem that's already used, 1221 * take duplicate reference counts on a subsystem that's already used,
1275 * but rebind_subsystems handles this case. 1222 * but rebind_subsystems handles this case.
1276 */ 1223 */
1277 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1224 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1278 unsigned long bit = 1UL << i; 1225 unsigned long bit = 1UL << i;
1279 1226
1280 if (!(bit & opts->subsys_mask)) 1227 if (!(bit & opts->subsys_bits))
1281 continue; 1228 continue;
1282 if (!try_module_get(subsys[i]->module)) { 1229 if (!try_module_get(subsys[i]->module)) {
1283 module_pin_failed = true; 1230 module_pin_failed = true;
@@ -1290,11 +1237,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1290 * raced with a module_delete call, and to the user this is 1237 * raced with a module_delete call, and to the user this is
1291 * essentially a "subsystem doesn't exist" case. 1238 * essentially a "subsystem doesn't exist" case.
1292 */ 1239 */
1293 for (i--; i >= 0; i--) { 1240 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1294 /* drop refcounts only on the ones we took */ 1241 /* drop refcounts only on the ones we took */
1295 unsigned long bit = 1UL << i; 1242 unsigned long bit = 1UL << i;
1296 1243
1297 if (!(bit & opts->subsys_mask)) 1244 if (!(bit & opts->subsys_bits))
1298 continue; 1245 continue;
1299 module_put(subsys[i]->module); 1246 module_put(subsys[i]->module);
1300 } 1247 }
@@ -1304,13 +1251,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1304 return 0; 1251 return 0;
1305} 1252}
1306 1253
1307static void drop_parsed_module_refcounts(unsigned long subsys_mask) 1254static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1308{ 1255{
1309 int i; 1256 int i;
1310 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1257 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1311 unsigned long bit = 1UL << i; 1258 unsigned long bit = 1UL << i;
1312 1259
1313 if (!(bit & subsys_mask)) 1260 if (!(bit & subsys_bits))
1314 continue; 1261 continue;
1315 module_put(subsys[i]->module); 1262 module_put(subsys[i]->module);
1316 } 1263 }
@@ -1322,56 +1269,37 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1322 struct cgroupfs_root *root = sb->s_fs_info; 1269 struct cgroupfs_root *root = sb->s_fs_info;
1323 struct cgroup *cgrp = &root->top_cgroup; 1270 struct cgroup *cgrp = &root->top_cgroup;
1324 struct cgroup_sb_opts opts; 1271 struct cgroup_sb_opts opts;
1325 unsigned long added_mask, removed_mask;
1326 1272
1327 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1273 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1328 mutex_lock(&cgroup_mutex); 1274 mutex_lock(&cgroup_mutex);
1329 mutex_lock(&cgroup_root_mutex);
1330 1275
1331 /* See what subsystems are wanted */ 1276 /* See what subsystems are wanted */
1332 ret = parse_cgroupfs_options(data, &opts); 1277 ret = parse_cgroupfs_options(data, &opts);
1333 if (ret) 1278 if (ret)
1334 goto out_unlock; 1279 goto out_unlock;
1335 1280
1336 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
1337 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1338 task_tgid_nr(current), current->comm);
1339
1340 added_mask = opts.subsys_mask & ~root->subsys_mask;
1341 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1342
1343 /* Don't allow flags or name to change at remount */ 1281 /* Don't allow flags or name to change at remount */
1344 if (opts.flags != root->flags || 1282 if (opts.flags != root->flags ||
1345 (opts.name && strcmp(opts.name, root->name))) { 1283 (opts.name && strcmp(opts.name, root->name))) {
1346 ret = -EINVAL; 1284 ret = -EINVAL;
1347 drop_parsed_module_refcounts(opts.subsys_mask); 1285 drop_parsed_module_refcounts(opts.subsys_bits);
1348 goto out_unlock; 1286 goto out_unlock;
1349 } 1287 }
1350 1288
1351 /* 1289 ret = rebind_subsystems(root, opts.subsys_bits);
1352 * Clear out the files of subsystems that should be removed, do
1353 * this before rebind_subsystems, since rebind_subsystems may
1354 * change this hierarchy's subsys_list.
1355 */
1356 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1357
1358 ret = rebind_subsystems(root, opts.subsys_mask);
1359 if (ret) { 1290 if (ret) {
1360 /* rebind_subsystems failed, re-populate the removed files */ 1291 drop_parsed_module_refcounts(opts.subsys_bits);
1361 cgroup_populate_dir(cgrp, false, removed_mask);
1362 drop_parsed_module_refcounts(opts.subsys_mask);
1363 goto out_unlock; 1292 goto out_unlock;
1364 } 1293 }
1365 1294
1366 /* re-populate subsystem files */ 1295 /* (re)populate subsystem files */
1367 cgroup_populate_dir(cgrp, false, added_mask); 1296 cgroup_populate_dir(cgrp);
1368 1297
1369 if (opts.release_agent) 1298 if (opts.release_agent)
1370 strcpy(root->release_agent_path, opts.release_agent); 1299 strcpy(root->release_agent_path, opts.release_agent);
1371 out_unlock: 1300 out_unlock:
1372 kfree(opts.release_agent); 1301 kfree(opts.release_agent);
1373 kfree(opts.name); 1302 kfree(opts.name);
1374 mutex_unlock(&cgroup_root_mutex);
1375 mutex_unlock(&cgroup_mutex); 1303 mutex_unlock(&cgroup_mutex);
1376 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1304 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1377 return ret; 1305 return ret;
@@ -1388,29 +1316,23 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1388{ 1316{
1389 INIT_LIST_HEAD(&cgrp->sibling); 1317 INIT_LIST_HEAD(&cgrp->sibling);
1390 INIT_LIST_HEAD(&cgrp->children); 1318 INIT_LIST_HEAD(&cgrp->children);
1391 INIT_LIST_HEAD(&cgrp->files);
1392 INIT_LIST_HEAD(&cgrp->css_sets); 1319 INIT_LIST_HEAD(&cgrp->css_sets);
1393 INIT_LIST_HEAD(&cgrp->allcg_node);
1394 INIT_LIST_HEAD(&cgrp->release_list); 1320 INIT_LIST_HEAD(&cgrp->release_list);
1395 INIT_LIST_HEAD(&cgrp->pidlists); 1321 INIT_LIST_HEAD(&cgrp->pidlists);
1396 mutex_init(&cgrp->pidlist_mutex); 1322 mutex_init(&cgrp->pidlist_mutex);
1397 INIT_LIST_HEAD(&cgrp->event_list); 1323 INIT_LIST_HEAD(&cgrp->event_list);
1398 spin_lock_init(&cgrp->event_list_lock); 1324 spin_lock_init(&cgrp->event_list_lock);
1399 simple_xattrs_init(&cgrp->xattrs);
1400} 1325}
1401 1326
1402static void init_cgroup_root(struct cgroupfs_root *root) 1327static void init_cgroup_root(struct cgroupfs_root *root)
1403{ 1328{
1404 struct cgroup *cgrp = &root->top_cgroup; 1329 struct cgroup *cgrp = &root->top_cgroup;
1405
1406 INIT_LIST_HEAD(&root->subsys_list); 1330 INIT_LIST_HEAD(&root->subsys_list);
1407 INIT_LIST_HEAD(&root->root_list); 1331 INIT_LIST_HEAD(&root->root_list);
1408 INIT_LIST_HEAD(&root->allcg_list);
1409 root->number_of_cgroups = 1; 1332 root->number_of_cgroups = 1;
1410 cgrp->root = root; 1333 cgrp->root = root;
1411 cgrp->top_cgroup = cgrp; 1334 cgrp->top_cgroup = cgrp;
1412 init_cgroup_housekeeping(cgrp); 1335 init_cgroup_housekeeping(cgrp);
1413 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1414} 1336}
1415 1337
1416static bool init_root_id(struct cgroupfs_root *root) 1338static bool init_root_id(struct cgroupfs_root *root)
@@ -1451,8 +1373,8 @@ static int cgroup_test_super(struct super_block *sb, void *data)
1451 * If we asked for subsystems (or explicitly for no 1373 * If we asked for subsystems (or explicitly for no
1452 * subsystems) then they must match 1374 * subsystems) then they must match
1453 */ 1375 */
1454 if ((opts->subsys_mask || opts->none) 1376 if ((opts->subsys_bits || opts->none)
1455 && (opts->subsys_mask != root->subsys_mask)) 1377 && (opts->subsys_bits != root->subsys_bits))
1456 return 0; 1378 return 0;
1457 1379
1458 return 1; 1380 return 1;
@@ -1462,7 +1384,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1462{ 1384{
1463 struct cgroupfs_root *root; 1385 struct cgroupfs_root *root;
1464 1386
1465 if (!opts->subsys_mask && !opts->none) 1387 if (!opts->subsys_bits && !opts->none)
1466 return NULL; 1388 return NULL;
1467 1389
1468 root = kzalloc(sizeof(*root), GFP_KERNEL); 1390 root = kzalloc(sizeof(*root), GFP_KERNEL);
@@ -1475,15 +1397,14 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1475 } 1397 }
1476 init_cgroup_root(root); 1398 init_cgroup_root(root);
1477 1399
1478 root->subsys_mask = opts->subsys_mask; 1400 root->subsys_bits = opts->subsys_bits;
1479 root->flags = opts->flags; 1401 root->flags = opts->flags;
1480 ida_init(&root->cgroup_ida);
1481 if (opts->release_agent) 1402 if (opts->release_agent)
1482 strcpy(root->release_agent_path, opts->release_agent); 1403 strcpy(root->release_agent_path, opts->release_agent);
1483 if (opts->name) 1404 if (opts->name)
1484 strcpy(root->name, opts->name); 1405 strcpy(root->name, opts->name);
1485 if (opts->cpuset_clone_children) 1406 if (opts->clone_children)
1486 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); 1407 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1487 return root; 1408 return root;
1488} 1409}
1489 1410
@@ -1496,7 +1417,6 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
1496 spin_lock(&hierarchy_id_lock); 1417 spin_lock(&hierarchy_id_lock);
1497 ida_remove(&hierarchy_ida, root->hierarchy_id); 1418 ida_remove(&hierarchy_ida, root->hierarchy_id);
1498 spin_unlock(&hierarchy_id_lock); 1419 spin_unlock(&hierarchy_id_lock);
1499 ida_destroy(&root->cgroup_ida);
1500 kfree(root); 1420 kfree(root);
1501} 1421}
1502 1422
@@ -1509,7 +1429,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1509 if (!opts->new_root) 1429 if (!opts->new_root)
1510 return -EINVAL; 1430 return -EINVAL;
1511 1431
1512 BUG_ON(!opts->subsys_mask && !opts->none); 1432 BUG_ON(!opts->subsys_bits && !opts->none);
1513 1433
1514 ret = set_anon_super(sb, NULL); 1434 ret = set_anon_super(sb, NULL);
1515 if (ret) 1435 if (ret)
@@ -1535,6 +1455,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
1535 1455
1536 struct inode *inode = 1456 struct inode *inode =
1537 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1457 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1458 struct dentry *dentry;
1538 1459
1539 if (!inode) 1460 if (!inode)
1540 return -ENOMEM; 1461 return -ENOMEM;
@@ -1543,9 +1464,12 @@ static int cgroup_get_rootdir(struct super_block *sb)
1543 inode->i_op = &cgroup_dir_inode_operations; 1464 inode->i_op = &cgroup_dir_inode_operations;
1544 /* directories start off with i_nlink == 2 (for "." entry) */ 1465 /* directories start off with i_nlink == 2 (for "." entry) */
1545 inc_nlink(inode); 1466 inc_nlink(inode);
1546 sb->s_root = d_make_root(inode); 1467 dentry = d_alloc_root(inode);
1547 if (!sb->s_root) 1468 if (!dentry) {
1469 iput(inode);
1548 return -ENOMEM; 1470 return -ENOMEM;
1471 }
1472 sb->s_root = dentry;
1549 /* for everything else we want ->d_op set */ 1473 /* for everything else we want ->d_op set */
1550 sb->s_d_op = &cgroup_dops; 1474 sb->s_d_op = &cgroup_dops;
1551 return 0; 1475 return 0;
@@ -1560,7 +1484,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1560 int ret = 0; 1484 int ret = 0;
1561 struct super_block *sb; 1485 struct super_block *sb;
1562 struct cgroupfs_root *new_root; 1486 struct cgroupfs_root *new_root;
1563 struct inode *inode;
1564 1487
1565 /* First find the desired set of subsystems */ 1488 /* First find the desired set of subsystems */
1566 mutex_lock(&cgroup_mutex); 1489 mutex_lock(&cgroup_mutex);
@@ -1581,7 +1504,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1581 opts.new_root = new_root; 1504 opts.new_root = new_root;
1582 1505
1583 /* Locate an existing or new sb for this hierarchy */ 1506 /* Locate an existing or new sb for this hierarchy */
1584 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1507 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1585 if (IS_ERR(sb)) { 1508 if (IS_ERR(sb)) {
1586 ret = PTR_ERR(sb); 1509 ret = PTR_ERR(sb);
1587 cgroup_drop_root(opts.new_root); 1510 cgroup_drop_root(opts.new_root);
@@ -1594,6 +1517,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1594 /* We used the new root structure, so this is a new hierarchy */ 1517 /* We used the new root structure, so this is a new hierarchy */
1595 struct list_head tmp_cg_links; 1518 struct list_head tmp_cg_links;
1596 struct cgroup *root_cgrp = &root->top_cgroup; 1519 struct cgroup *root_cgrp = &root->top_cgroup;
1520 struct inode *inode;
1597 struct cgroupfs_root *existing_root; 1521 struct cgroupfs_root *existing_root;
1598 const struct cred *cred; 1522 const struct cred *cred;
1599 int i; 1523 int i;
@@ -1607,14 +1531,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1607 1531
1608 mutex_lock(&inode->i_mutex); 1532 mutex_lock(&inode->i_mutex);
1609 mutex_lock(&cgroup_mutex); 1533 mutex_lock(&cgroup_mutex);
1610 mutex_lock(&cgroup_root_mutex);
1611 1534
1612 /* Check for name clashes with existing mounts */ 1535 if (strlen(root->name)) {
1613 ret = -EBUSY; 1536 /* Check for name clashes with existing mounts */
1614 if (strlen(root->name)) 1537 for_each_active_root(existing_root) {
1615 for_each_active_root(existing_root) 1538 if (!strcmp(existing_root->name, root->name)) {
1616 if (!strcmp(existing_root->name, root->name)) 1539 ret = -EBUSY;
1617 goto unlock_drop; 1540 mutex_unlock(&cgroup_mutex);
1541 mutex_unlock(&inode->i_mutex);
1542 goto drop_new_super;
1543 }
1544 }
1545 }
1618 1546
1619 /* 1547 /*
1620 * We're accessing css_set_count without locking 1548 * We're accessing css_set_count without locking
@@ -1624,13 +1552,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1624 * have some link structures left over 1552 * have some link structures left over
1625 */ 1553 */
1626 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1554 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1627 if (ret) 1555 if (ret) {
1628 goto unlock_drop; 1556 mutex_unlock(&cgroup_mutex);
1557 mutex_unlock(&inode->i_mutex);
1558 goto drop_new_super;
1559 }
1629 1560
1630 ret = rebind_subsystems(root, root->subsys_mask); 1561 ret = rebind_subsystems(root, root->subsys_bits);
1631 if (ret == -EBUSY) { 1562 if (ret == -EBUSY) {
1563 mutex_unlock(&cgroup_mutex);
1564 mutex_unlock(&inode->i_mutex);
1632 free_cg_links(&tmp_cg_links); 1565 free_cg_links(&tmp_cg_links);
1633 goto unlock_drop; 1566 goto drop_new_super;
1634 } 1567 }
1635 /* 1568 /*
1636 * There must be no failure case after here, since rebinding 1569 * There must be no failure case after here, since rebinding
@@ -1662,13 +1595,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1662 1595
1663 free_cg_links(&tmp_cg_links); 1596 free_cg_links(&tmp_cg_links);
1664 1597
1598 BUG_ON(!list_empty(&root_cgrp->sibling));
1665 BUG_ON(!list_empty(&root_cgrp->children)); 1599 BUG_ON(!list_empty(&root_cgrp->children));
1666 BUG_ON(root->number_of_cgroups != 1); 1600 BUG_ON(root->number_of_cgroups != 1);
1667 1601
1668 cred = override_creds(&init_cred); 1602 cred = override_creds(&init_cred);
1669 cgroup_populate_dir(root_cgrp, true, root->subsys_mask); 1603 cgroup_populate_dir(root_cgrp);
1670 revert_creds(cred); 1604 revert_creds(cred);
1671 mutex_unlock(&cgroup_root_mutex);
1672 mutex_unlock(&cgroup_mutex); 1605 mutex_unlock(&cgroup_mutex);
1673 mutex_unlock(&inode->i_mutex); 1606 mutex_unlock(&inode->i_mutex);
1674 } else { 1607 } else {
@@ -1678,21 +1611,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1678 */ 1611 */
1679 cgroup_drop_root(opts.new_root); 1612 cgroup_drop_root(opts.new_root);
1680 /* no subsys rebinding, so refcounts don't change */ 1613 /* no subsys rebinding, so refcounts don't change */
1681 drop_parsed_module_refcounts(opts.subsys_mask); 1614 drop_parsed_module_refcounts(opts.subsys_bits);
1682 } 1615 }
1683 1616
1684 kfree(opts.release_agent); 1617 kfree(opts.release_agent);
1685 kfree(opts.name); 1618 kfree(opts.name);
1686 return dget(sb->s_root); 1619 return dget(sb->s_root);
1687 1620
1688 unlock_drop:
1689 mutex_unlock(&cgroup_root_mutex);
1690 mutex_unlock(&cgroup_mutex);
1691 mutex_unlock(&inode->i_mutex);
1692 drop_new_super: 1621 drop_new_super:
1693 deactivate_locked_super(sb); 1622 deactivate_locked_super(sb);
1694 drop_modules: 1623 drop_modules:
1695 drop_parsed_module_refcounts(opts.subsys_mask); 1624 drop_parsed_module_refcounts(opts.subsys_bits);
1696 out_err: 1625 out_err:
1697 kfree(opts.release_agent); 1626 kfree(opts.release_agent);
1698 kfree(opts.name); 1627 kfree(opts.name);
@@ -1710,9 +1639,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1710 1639
1711 BUG_ON(root->number_of_cgroups != 1); 1640 BUG_ON(root->number_of_cgroups != 1);
1712 BUG_ON(!list_empty(&cgrp->children)); 1641 BUG_ON(!list_empty(&cgrp->children));
1642 BUG_ON(!list_empty(&cgrp->sibling));
1713 1643
1714 mutex_lock(&cgroup_mutex); 1644 mutex_lock(&cgroup_mutex);
1715 mutex_lock(&cgroup_root_mutex);
1716 1645
1717 /* Rebind all subsystems back to the default hierarchy */ 1646 /* Rebind all subsystems back to the default hierarchy */
1718 ret = rebind_subsystems(root, 0); 1647 ret = rebind_subsystems(root, 0);
@@ -1738,11 +1667,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
1738 root_count--; 1667 root_count--;
1739 } 1668 }
1740 1669
1741 mutex_unlock(&cgroup_root_mutex);
1742 mutex_unlock(&cgroup_mutex); 1670 mutex_unlock(&cgroup_mutex);
1743 1671
1744 simple_xattrs_free(&cgrp->xattrs);
1745
1746 kill_litter_super(sb); 1672 kill_litter_super(sb);
1747 cgroup_drop_root(root); 1673 cgroup_drop_root(root);
1748} 1674}
@@ -1755,6 +1681,16 @@ static struct file_system_type cgroup_fs_type = {
1755 1681
1756static struct kobject *cgroup_kobj; 1682static struct kobject *cgroup_kobj;
1757 1683
1684static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1685{
1686 return dentry->d_fsdata;
1687}
1688
1689static inline struct cftype *__d_cft(struct dentry *dentry)
1690{
1691 return dentry->d_fsdata;
1692}
1693
1758/** 1694/**
1759 * cgroup_path - generate the path of a cgroup 1695 * cgroup_path - generate the path of a cgroup
1760 * @cgrp: the cgroup in question 1696 * @cgrp: the cgroup in question
@@ -1767,11 +1703,9 @@ static struct kobject *cgroup_kobj;
1767 */ 1703 */
1768int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1704int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1769{ 1705{
1770 struct dentry *dentry = cgrp->dentry;
1771 char *start; 1706 char *start;
1772 1707 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1773 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), 1708 cgroup_lock_is_held());
1774 "cgroup_path() called without proper locking");
1775 1709
1776 if (!dentry || cgrp == dummytop) { 1710 if (!dentry || cgrp == dummytop) {
1777 /* 1711 /*
@@ -1782,9 +1716,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1782 return 0; 1716 return 0;
1783 } 1717 }
1784 1718
1785 start = buf + buflen - 1; 1719 start = buf + buflen;
1786 1720
1787 *start = '\0'; 1721 *--start = '\0';
1788 for (;;) { 1722 for (;;) {
1789 int len = dentry->d_name.len; 1723 int len = dentry->d_name.len;
1790 1724
@@ -1795,7 +1729,8 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1795 if (!cgrp) 1729 if (!cgrp)
1796 break; 1730 break;
1797 1731
1798 dentry = cgrp->dentry; 1732 dentry = rcu_dereference_check(cgrp->dentry,
1733 cgroup_lock_is_held());
1799 if (!cgrp->parent) 1734 if (!cgrp->parent)
1800 continue; 1735 continue;
1801 if (--start < buf) 1736 if (--start < buf)
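cgroup_path() fills its buffer from the tail: place the terminating NUL at the end, then prepend "/name" components while walking toward the root, so no intermediate copies are needed. The same strategy on a plain parent-linked structure, as a runnable sketch; node names are made up, and -1 stands in for the kernel's -ENAMETOOLONG.

#include <stdio.h>
#include <string.h>

struct node {
        const char *name;
        struct node *parent;
};

static int build_path(const struct node *n, char *buf, int buflen)
{
        char *start = buf + buflen - 1;

        if (!n->parent) {                       /* root: just "/" */
                snprintf(buf, buflen, "/");
                return 0;
        }

        *start = '\0';
        for (; n && n->parent; n = n->parent) {
                int len = strlen(n->name);

                start -= len;
                if (start < buf + 1)            /* +1 for the leading '/' */
                        return -1;
                memcpy(start, n->name, len);
                *--start = '/';
        }
        memmove(buf, start, strlen(start) + 1); /* shift result to the front */
        return 0;
}

int main(void)
{
        struct node root = { "", NULL };
        struct node a = { "foo", &root };
        struct node b = { "bar", &a };
        char buf[64];

        if (build_path(&b, buf, sizeof(buf)) == 0)
                printf("%s\n", buf);            /* prints /foo/bar */
        return 0;
}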
@@ -1808,104 +1743,55 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1808EXPORT_SYMBOL_GPL(cgroup_path); 1743EXPORT_SYMBOL_GPL(cgroup_path);
1809 1744
1810/* 1745/*
1811 * Control Group taskset
1812 */
1813struct task_and_cgroup {
1814 struct task_struct *task;
1815 struct cgroup *cgrp;
1816 struct css_set *cg;
1817};
1818
1819struct cgroup_taskset {
1820 struct task_and_cgroup single;
1821 struct flex_array *tc_array;
1822 int tc_array_len;
1823 int idx;
1824 struct cgroup *cur_cgrp;
1825};
1826
1827/**
1828 * cgroup_taskset_first - reset taskset and return the first task
1829 * @tset: taskset of interest
1830 *
1831 * @tset iteration is initialized and the first task is returned.
1832 */
1833struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1834{
1835 if (tset->tc_array) {
1836 tset->idx = 0;
1837 return cgroup_taskset_next(tset);
1838 } else {
1839 tset->cur_cgrp = tset->single.cgrp;
1840 return tset->single.task;
1841 }
1842}
1843EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1844
1845/**
1846 * cgroup_taskset_next - iterate to the next task in taskset
1847 * @tset: taskset of interest
1848 *
1849 * Return the next task in @tset. Iteration must have been initialized
1850 * with cgroup_taskset_first().
1851 */
1852struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1853{
1854 struct task_and_cgroup *tc;
1855
1856 if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1857 return NULL;
1858
1859 tc = flex_array_get(tset->tc_array, tset->idx++);
1860 tset->cur_cgrp = tc->cgrp;
1861 return tc->task;
1862}
1863EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1864
1865/**
1866 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
1867 * @tset: taskset of interest
1868 *
1869 * Return the cgroup for the current (last returned) task of @tset. This
1870 * function must be preceded by either cgroup_taskset_first() or
1871 * cgroup_taskset_next().
1872 */
1873struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
1874{
1875 return tset->cur_cgrp;
1876}
1877EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
1878
1879/**
1880 * cgroup_taskset_size - return the number of tasks in taskset
1881 * @tset: taskset of interest
1882 */
1883int cgroup_taskset_size(struct cgroup_taskset *tset)
1884{
1885 return tset->tc_array ? tset->tc_array_len : 1;
1886}
1887EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1888
1889
1890/*
1891 * cgroup_task_migrate - move a task from one cgroup to another. 1746 * cgroup_task_migrate - move a task from one cgroup to another.
1892 * 1747 *
1893 * Must be called with cgroup_mutex and threadgroup locked. 1748 * 'guarantee' is set if the caller promises that a new css_set for the task
1749 * will already exist. If not set, this function might sleep, and can fail with
1750 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
1894 */ 1751 */
1895static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1752static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1896 struct task_struct *tsk, struct css_set *newcg) 1753 struct task_struct *tsk, bool guarantee)
1897{ 1754{
1898 struct css_set *oldcg; 1755 struct css_set *oldcg;
1756 struct css_set *newcg;
1899 1757
1900 /* 1758 /*
1901 * We are synchronized through threadgroup_lock() against PF_EXITING 1759 * get old css_set. we need to take task_lock and refcount it, because
1902 * setting such that we can't race against cgroup_exit() changing the 1760 * an exiting task can change its css_set to init_css_set and drop its
1903 * css_set to init_css_set and dropping the old one. 1761 * old one without taking cgroup_mutex.
1904 */ 1762 */
1905 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1763 task_lock(tsk);
1906 oldcg = tsk->cgroups; 1764 oldcg = tsk->cgroups;
1765 get_css_set(oldcg);
1766 task_unlock(tsk);
1767
1768 /* locate or allocate a new css_set for this task. */
1769 if (guarantee) {
1770 /* we know the css_set we want already exists. */
1771 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1772 read_lock(&css_set_lock);
1773 newcg = find_existing_css_set(oldcg, cgrp, template);
1774 BUG_ON(!newcg);
1775 get_css_set(newcg);
1776 read_unlock(&css_set_lock);
1777 } else {
1778 might_sleep();
1779 /* find_css_set will give us newcg already referenced. */
1780 newcg = find_css_set(oldcg, cgrp);
1781 if (!newcg) {
1782 put_css_set(oldcg);
1783 return -ENOMEM;
1784 }
1785 }
1786 put_css_set(oldcg);
1907 1787
1788 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1908 task_lock(tsk); 1789 task_lock(tsk);
1790 if (tsk->flags & PF_EXITING) {
1791 task_unlock(tsk);
1792 put_css_set(newcg);
1793 return -ESRCH;
1794 }
1909 rcu_assign_pointer(tsk->cgroups, newcg); 1795 rcu_assign_pointer(tsk->cgroups, newcg);
1910 task_unlock(tsk); 1796 task_unlock(tsk);
1911 1797
@@ -1920,8 +1806,10 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1920 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1806 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1921 * it here; it will be freed under RCU. 1807 * it here; it will be freed under RCU.
1922 */ 1808 */
1923 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1924 put_css_set(oldcg); 1809 put_css_set(oldcg);
1810
1811 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1812 return 0;
1925} 1813}
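
cgroup_task_migrate() in both variants follows the same publish-then-release discipline: obtain the replacement object first (the only step that can fail), switch the task's pointer while holding the task lock, and only afterwards drop the reference to the old object. Below is a deliberately simplified, lock-only analog; struct config, config_switch() and the single global pointer are invented for illustration, and the real code relies on RCU plus refcounted css_sets rather than a reader-side mutex.

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    struct config {
            char name[32];
    };

    static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct config *live_cfg;         /* readers dereference only under cfg_lock */

    int config_switch(const char *name)
    {
            struct config *newc, *oldc;

            newc = calloc(1, sizeof(*newc));        /* the step that can fail ... */
            if (!newc)
                    return -1;                      /* ... analogous to the -ENOMEM path */
            strncpy(newc->name, name, sizeof(newc->name) - 1);

            pthread_mutex_lock(&cfg_lock);
            oldc = live_cfg;
            live_cfg = newc;                        /* commit point: cannot fail */
            pthread_mutex_unlock(&cfg_lock);

            /*
             * Safe only because readers touch live_cfg inside the lock: once the
             * pointer is swapped, no new critical section can see oldc. The kernel
             * instead defers the free with RCU so readers need no lock at all.
             */
            free(oldc);
            return 0;
    }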
1926 1814
1927/** 1815/**
@@ -1929,33 +1817,25 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1929 * @cgrp: the cgroup the task is attaching to 1817 * @cgrp: the cgroup the task is attaching to
1930 * @tsk: the task to be attached 1818 * @tsk: the task to be attached
1931 * 1819 *
1932 * Call with cgroup_mutex and threadgroup locked. May take task_lock of 1820 * Call holding cgroup_mutex. May take task_lock of
1933 * @tsk during call. 1821 * the task 'tsk' during call.
1934 */ 1822 */
1935int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1823int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1936{ 1824{
1937 int retval = 0; 1825 int retval;
1938 struct cgroup_subsys *ss, *failed_ss = NULL; 1826 struct cgroup_subsys *ss, *failed_ss = NULL;
1939 struct cgroup *oldcgrp; 1827 struct cgroup *oldcgrp;
1940 struct cgroupfs_root *root = cgrp->root; 1828 struct cgroupfs_root *root = cgrp->root;
1941 struct cgroup_taskset tset = { }; 1829 struct css_set *cg;
1942 struct css_set *newcg;
1943
1944 /* @tsk either already exited or can't exit until the end */
1945 if (tsk->flags & PF_EXITING)
1946 return -ESRCH;
1947 1830
1948 /* Nothing to do if the task is already in that cgroup */ 1831 /* Nothing to do if the task is already in that cgroup */
1949 oldcgrp = task_cgroup_from_root(tsk, root); 1832 oldcgrp = task_cgroup_from_root(tsk, root);
1950 if (cgrp == oldcgrp) 1833 if (cgrp == oldcgrp)
1951 return 0; 1834 return 0;
1952 1835
1953 tset.single.task = tsk;
1954 tset.single.cgrp = oldcgrp;
1955
1956 for_each_subsys(root, ss) { 1836 for_each_subsys(root, ss) {
1957 if (ss->can_attach) { 1837 if (ss->can_attach) {
1958 retval = ss->can_attach(cgrp, &tset); 1838 retval = ss->can_attach(ss, cgrp, tsk);
1959 if (retval) { 1839 if (retval) {
1960 /* 1840 /*
1961 * Remember on which subsystem the can_attach() 1841 * Remember on which subsystem the can_attach()
@@ -1967,22 +1847,41 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1967 goto out; 1847 goto out;
1968 } 1848 }
1969 } 1849 }
1850 if (ss->can_attach_task) {
1851 retval = ss->can_attach_task(cgrp, tsk);
1852 if (retval) {
1853 failed_ss = ss;
1854 goto out;
1855 }
1856 }
1970 } 1857 }
1971 1858
1972 newcg = find_css_set(tsk->cgroups, cgrp); 1859 task_lock(tsk);
1973 if (!newcg) { 1860 cg = tsk->cgroups;
1974 retval = -ENOMEM; 1861 get_css_set(cg);
1975 goto out; 1862 task_unlock(tsk);
1976 }
1977 1863
1978 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); 1864 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1865 if (retval)
1866 goto out;
1979 1867
1980 for_each_subsys(root, ss) { 1868 for_each_subsys(root, ss) {
1869 if (ss->pre_attach)
1870 ss->pre_attach(cgrp);
1871 if (ss->attach_task)
1872 ss->attach_task(cgrp, tsk);
1981 if (ss->attach) 1873 if (ss->attach)
1982 ss->attach(cgrp, &tset); 1874 ss->attach(ss, cgrp, oldcgrp, tsk);
1983 } 1875 }
1876 set_bit(CGRP_RELEASABLE, &cgrp->flags);
1877 /* put_css_set will not destroy cg until after an RCU grace period */
1878 put_css_set(cg);
1984 1879
1985 synchronize_rcu(); 1880 /*
1881 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1882 * is no longer empty.
1883 */
1884 cgroup_wakeup_rmdir_waiter(cgrp);
1986out: 1885out:
1987 if (retval) { 1886 if (retval) {
1988 for_each_subsys(root, ss) { 1887 for_each_subsys(root, ss) {
@@ -1995,7 +1894,7 @@ out:
1995 */ 1894 */
1996 break; 1895 break;
1997 if (ss->cancel_attach) 1896 if (ss->cancel_attach)
1998 ss->cancel_attach(cgrp, &tset); 1897 ss->cancel_attach(ss, cgrp, tsk);
1999 } 1898 }
2000 } 1899 }
2001 return retval; 1900 return retval;
@@ -2025,36 +1924,111 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2025} 1924}
2026EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1925EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2027 1926
1927/*
1928 * cgroup_attach_proc works in two stages, the first of which prefetches all
1929 * new css_sets needed (to make sure we have enough memory before committing
1930 * to the move) and stores them in a list of entries of the following type.
1931 * TODO: possible optimization: use css_set->rcu_head for chaining instead
1932 */
1933struct cg_list_entry {
1934 struct css_set *cg;
1935 struct list_head links;
1936};
1937
1938static bool css_set_check_fetched(struct cgroup *cgrp,
1939 struct task_struct *tsk, struct css_set *cg,
1940 struct list_head *newcg_list)
1941{
1942 struct css_set *newcg;
1943 struct cg_list_entry *cg_entry;
1944 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1945
1946 read_lock(&css_set_lock);
1947 newcg = find_existing_css_set(cg, cgrp, template);
1948 if (newcg)
1949 get_css_set(newcg);
1950 read_unlock(&css_set_lock);
1951
1952 /* doesn't exist at all? */
1953 if (!newcg)
1954 return false;
1955 /* see if it's already in the list */
1956 list_for_each_entry(cg_entry, newcg_list, links) {
1957 if (cg_entry->cg == newcg) {
1958 put_css_set(newcg);
1959 return true;
1960 }
1961 }
1962
1963 /* not found */
1964 put_css_set(newcg);
1965 return false;
1966}
1967
1968/*
1969 * Find the new css_set and store it in the list in preparation for moving the
1970 * given task to the given cgroup. Returns 0 or -ENOMEM.
1971 */
1972static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1973 struct list_head *newcg_list)
1974{
1975 struct css_set *newcg;
1976 struct cg_list_entry *cg_entry;
1977
1978 /* ensure a new css_set will exist for this thread */
1979 newcg = find_css_set(cg, cgrp);
1980 if (!newcg)
1981 return -ENOMEM;
1982 /* add it to the list */
1983 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
1984 if (!cg_entry) {
1985 put_css_set(newcg);
1986 return -ENOMEM;
1987 }
1988 cg_entry->cg = newcg;
1989 list_add(&cg_entry->links, newcg_list);
1990 return 0;
1991}
1992
2028/** 1993/**
2029 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup 1994 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
2030 * @cgrp: the cgroup to attach to 1995 * @cgrp: the cgroup to attach to
2031 * @leader: the threadgroup leader task_struct of the group to be attached 1996 * @leader: the threadgroup leader task_struct of the group to be attached
2032 * 1997 *
2033 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1998 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
2034 * task_lock of each thread in leader's threadgroup individually in turn. 1999 * take task_lock of each thread in leader's threadgroup individually in turn.
2035 */ 2000 */
2036static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 2001int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2037{ 2002{
2038 int retval, i, group_size; 2003 int retval, i, group_size;
2039 struct cgroup_subsys *ss, *failed_ss = NULL; 2004 struct cgroup_subsys *ss, *failed_ss = NULL;
2005 bool cancel_failed_ss = false;
2040 /* guaranteed to be initialized later, but the compiler needs this */ 2006 /* guaranteed to be initialized later, but the compiler needs this */
2007 struct cgroup *oldcgrp = NULL;
2008 struct css_set *oldcg;
2041 struct cgroupfs_root *root = cgrp->root; 2009 struct cgroupfs_root *root = cgrp->root;
2042 /* threadgroup list cursor and array */ 2010 /* threadgroup list cursor and array */
2043 struct task_struct *tsk; 2011 struct task_struct *tsk;
2044 struct task_and_cgroup *tc;
2045 struct flex_array *group; 2012 struct flex_array *group;
2046 struct cgroup_taskset tset = { }; 2013 /*
2014 * we need to make sure we have css_sets for all the tasks we're
2015 * going to move -before- we actually start moving them, so that in
2016 * case we get an ENOMEM we can bail out before making any changes.
2017 */
2018 struct list_head newcg_list;
2019 struct cg_list_entry *cg_entry, *temp_nobe;
2047 2020
2048 /* 2021 /*
2049 * step 0: in order to do expensive, possibly blocking operations for 2022 * step 0: in order to do expensive, possibly blocking operations for
2050 * every thread, we cannot iterate the thread group list, since it needs 2023 * every thread, we cannot iterate the thread group list, since it needs
2051 * rcu or tasklist locked. instead, build an array of all threads in the 2024 * rcu or tasklist locked. instead, build an array of all threads in the
2052 * group - group_rwsem prevents new threads from appearing, and if 2025 * group - threadgroup_fork_lock prevents new threads from appearing,
2053 * threads exit, this will just be an over-estimate. 2026 * and if threads exit, this will just be an over-estimate.
2054 */ 2027 */
2055 group_size = get_nr_threads(leader); 2028 group_size = get_nr_threads(leader);
2056 /* flex_array supports very large thread-groups better than kmalloc. */ 2029 /* flex_array supports very large thread-groups better than kmalloc. */
2057 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 2030 group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2031 GFP_KERNEL);
2058 if (!group) 2032 if (!group)
2059 return -ENOMEM; 2033 return -ENOMEM;
2060 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2034 /* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2062,124 +2036,189 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2062 if (retval) 2036 if (retval)
2063 goto out_free_group_list; 2037 goto out_free_group_list;
2064 2038
2039 /* prevent changes to the threadgroup list while we take a snapshot. */
2040 rcu_read_lock();
2041 if (!thread_group_leader(leader)) {
2042 /*
2043 * a race with de_thread from another thread's exec() may strip
2044 * us of our leadership, making while_each_thread unsafe to use
2045 * on this task. if this happens, there is no choice but to
2046 * throw this task away and try again (from cgroup_procs_write);
2047 * this is "double-double-toil-and-trouble-check locking".
2048 */
2049 rcu_read_unlock();
2050 retval = -EAGAIN;
2051 goto out_free_group_list;
2052 }
2053 /* take a reference on each task in the group to go in the array. */
2065 tsk = leader; 2054 tsk = leader;
2066 i = 0; 2055 i = 0;
2067 /*
2068 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2069 * already PF_EXITING could be freed from underneath us unless we
2070 * take an rcu_read_lock.
2071 */
2072 rcu_read_lock();
2073 do { 2056 do {
2074 struct task_and_cgroup ent;
2075
2076 /* @tsk either already exited or can't exit until the end */
2077 if (tsk->flags & PF_EXITING)
2078 continue;
2079
2080 /* as per above, nr_threads may decrease, but not increase. */ 2057 /* as per above, nr_threads may decrease, but not increase. */
2081 BUG_ON(i >= group_size); 2058 BUG_ON(i >= group_size);
2082 ent.task = tsk; 2059 get_task_struct(tsk);
2083 ent.cgrp = task_cgroup_from_root(tsk, root);
2084 /* nothing to do if this task is already in the cgroup */
2085 if (ent.cgrp == cgrp)
2086 continue;
2087 /* 2060 /*
2088 * saying GFP_ATOMIC has no effect here because we did prealloc 2061 * saying GFP_ATOMIC has no effect here because we did prealloc
2089 * earlier, but it's good form to communicate our expectations. 2062 * earlier, but it's good form to communicate our expectations.
2090 */ 2063 */
2091 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2064 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2092 BUG_ON(retval != 0); 2065 BUG_ON(retval != 0);
2093 i++; 2066 i++;
2094 } while_each_thread(leader, tsk); 2067 } while_each_thread(leader, tsk);
2095 rcu_read_unlock();
2096 /* remember the number of threads in the array for later. */ 2068 /* remember the number of threads in the array for later. */
2097 group_size = i; 2069 group_size = i;
2098 tset.tc_array = group; 2070 rcu_read_unlock();
2099 tset.tc_array_len = group_size;
2100
2101 /* methods shouldn't be called if no task is actually migrating */
2102 retval = 0;
2103 if (!group_size)
2104 goto out_free_group_list;
2105 2071
2106 /* 2072 /*
2107 * step 1: check that we can legitimately attach to the cgroup. 2073 * step 1: check that we can legitimately attach to the cgroup.
2108 */ 2074 */
2109 for_each_subsys(root, ss) { 2075 for_each_subsys(root, ss) {
2110 if (ss->can_attach) { 2076 if (ss->can_attach) {
2111 retval = ss->can_attach(cgrp, &tset); 2077 retval = ss->can_attach(ss, cgrp, leader);
2112 if (retval) { 2078 if (retval) {
2113 failed_ss = ss; 2079 failed_ss = ss;
2114 goto out_cancel_attach; 2080 goto out_cancel_attach;
2115 } 2081 }
2116 } 2082 }
2083 /* a callback to be run on every thread in the threadgroup. */
2084 if (ss->can_attach_task) {
2085 /* run on each task in the threadgroup. */
2086 for (i = 0; i < group_size; i++) {
2087 tsk = flex_array_get_ptr(group, i);
2088 retval = ss->can_attach_task(cgrp, tsk);
2089 if (retval) {
2090 failed_ss = ss;
2091 cancel_failed_ss = true;
2092 goto out_cancel_attach;
2093 }
2094 }
2095 }
2117 } 2096 }
2118 2097
2119 /* 2098 /*
2120 * step 2: make sure css_sets exist for all threads to be migrated. 2099 * step 2: make sure css_sets exist for all threads to be migrated.
2121 * we use find_css_set, which allocates a new one if necessary. 2100 * we use find_css_set, which allocates a new one if necessary.
2122 */ 2101 */
2102 INIT_LIST_HEAD(&newcg_list);
2123 for (i = 0; i < group_size; i++) { 2103 for (i = 0; i < group_size; i++) {
2124 tc = flex_array_get(group, i); 2104 tsk = flex_array_get_ptr(group, i);
2125 tc->cg = find_css_set(tc->task->cgroups, cgrp); 2105 /* nothing to do if this task is already in the cgroup */
2126 if (!tc->cg) { 2106 oldcgrp = task_cgroup_from_root(tsk, root);
2127 retval = -ENOMEM; 2107 if (cgrp == oldcgrp)
2128 goto out_put_css_set_refs; 2108 continue;
2109 /* get old css_set pointer */
2110 task_lock(tsk);
2111 oldcg = tsk->cgroups;
2112 get_css_set(oldcg);
2113 task_unlock(tsk);
2114 /* see if the new one for us is already in the list? */
2115 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2116 /* was already there, nothing to do. */
2117 put_css_set(oldcg);
2118 } else {
2119 /* we don't already have it. get new one. */
2120 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2121 put_css_set(oldcg);
2122 if (retval)
2123 goto out_list_teardown;
2129 } 2124 }
2130 } 2125 }
2131 2126
2132 /* 2127 /*
2133 * step 3: now that we're guaranteed success wrt the css_sets, 2128 * step 3: now that we're guaranteed success wrt the css_sets, proceed
2134 * proceed to move all tasks to the new cgroup. There are no 2129 * to move all tasks to the new cgroup, calling ss->attach_task for each
2135 * failure cases after here, so this is the commit point. 2130 * one along the way. there are no failure cases after here, so this is
2131 * the commit point.
2136 */ 2132 */
2133 for_each_subsys(root, ss) {
2134 if (ss->pre_attach)
2135 ss->pre_attach(cgrp);
2136 }
2137 for (i = 0; i < group_size; i++) { 2137 for (i = 0; i < group_size; i++) {
2138 tc = flex_array_get(group, i); 2138 tsk = flex_array_get_ptr(group, i);
2139 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); 2139 /* leave current thread as it is if it's already there */
2140 oldcgrp = task_cgroup_from_root(tsk, root);
2141 if (cgrp == oldcgrp)
2142 continue;
2143 /* attach each task to each subsystem */
2144 for_each_subsys(root, ss) {
2145 if (ss->attach_task)
2146 ss->attach_task(cgrp, tsk);
2147 }
2148 /* if the thread is PF_EXITING, it can just get skipped. */
2149 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2150 BUG_ON(retval != 0 && retval != -ESRCH);
2140 } 2151 }
2141 /* nothing is sensitive to fork() after this point. */ 2152 /* nothing is sensitive to fork() after this point. */
2142 2153
2143 /* 2154 /*
2144 * step 4: do subsystem attach callbacks. 2155 * step 4: do expensive, non-thread-specific subsystem callbacks.
2156 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2157 * being moved, this call will need to be reworked to communicate that.
2145 */ 2158 */
2146 for_each_subsys(root, ss) { 2159 for_each_subsys(root, ss) {
2147 if (ss->attach) 2160 if (ss->attach)
2148 ss->attach(cgrp, &tset); 2161 ss->attach(ss, cgrp, oldcgrp, leader);
2149 } 2162 }
2150 2163
2151 /* 2164 /*
2152 * step 5: success! and cleanup 2165 * step 5: success! and cleanup
2153 */ 2166 */
2154 synchronize_rcu(); 2167 synchronize_rcu();
2168 cgroup_wakeup_rmdir_waiter(cgrp);
2155 retval = 0; 2169 retval = 0;
2156out_put_css_set_refs: 2170out_list_teardown:
2157 if (retval) { 2171 /* clean up the list of prefetched css_sets. */
2158 for (i = 0; i < group_size; i++) { 2172 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2159 tc = flex_array_get(group, i); 2173 list_del(&cg_entry->links);
2160 if (!tc->cg) 2174 put_css_set(cg_entry->cg);
2161 break; 2175 kfree(cg_entry);
2162 put_css_set(tc->cg);
2163 }
2164 } 2176 }
2165out_cancel_attach: 2177out_cancel_attach:
2178 /* same deal as in cgroup_attach_task */
2166 if (retval) { 2179 if (retval) {
2167 for_each_subsys(root, ss) { 2180 for_each_subsys(root, ss) {
2168 if (ss == failed_ss) 2181 if (ss == failed_ss) {
2182 if (cancel_failed_ss && ss->cancel_attach)
2183 ss->cancel_attach(ss, cgrp, leader);
2169 break; 2184 break;
2185 }
2170 if (ss->cancel_attach) 2186 if (ss->cancel_attach)
2171 ss->cancel_attach(cgrp, &tset); 2187 ss->cancel_attach(ss, cgrp, leader);
2172 } 2188 }
2173 } 2189 }
2190 /* clean up the array of referenced threads in the group. */
2191 for (i = 0; i < group_size; i++) {
2192 tsk = flex_array_get_ptr(group, i);
2193 put_task_struct(tsk);
2194 }
2174out_free_group_list: 2195out_free_group_list:
2175 flex_array_free(group); 2196 flex_array_free(group);
2176 return retval; 2197 return retval;
2177} 2198}
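
cgroup_attach_proc() above is organized as a two-phase commit: steps 1 and 2 reserve everything that can fail (per-thread checks and prefetched css_sets), and from step 3 onward the loop is a pure commit with no error path. The same shape, stripped of the kernel details and with invented names (apply_all, the malloc'd scratch slots standing in for prefetched css_sets), can be sketched as:

    #include <stdlib.h>

    int apply_all(int *items, int n)
    {
            char **scratch;
            int ret = 0;
            int i;

            if (n <= 0)
                    return 0;

            scratch = calloc((size_t)n, sizeof(*scratch));
            if (!scratch)
                    return -1;

            /* phase 1: reserve everything that can fail; bail out cleanly on error */
            for (i = 0; i < n; i++) {
                    scratch[i] = malloc(64);        /* stand-in for find_css_set() */
                    if (!scratch[i]) {
                            ret = -1;
                            goto out_free;
                    }
            }

            /* phase 2: commit point -- nothing below this line can fail */
            for (i = 0; i < n; i++)
                    items[i] += 1;                  /* stand-in for the per-task migrate */

    out_free:
            for (i = 0; i < n; i++)
                    free(scratch[i]);               /* free(NULL) is a no-op for unused slots */
            free(scratch);
            return ret;
    }

Reserving up front is what lets the commit loop tolerate only the benign -ESRCH case (an exiting task) instead of having to unwind a half-migrated thread group.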
2178 2199
2200static int cgroup_allow_attach(struct cgroup *cgrp, struct task_struct *tsk)
2201{
2202 struct cgroup_subsys *ss;
2203 int ret;
2204
2205 for_each_subsys(cgrp->root, ss) {
2206 if (ss->allow_attach) {
2207 ret = ss->allow_attach(cgrp, tsk);
2208 if (ret)
2209 return ret;
2210 } else {
2211 return -EACCES;
2212 }
2213 }
2214
2215 return 0;
2216}
2217
2179/* 2218/*
2180 * Find the task_struct of the task to attach by vpid and pass it along to the 2219 * Find the task_struct of the task to attach by vpid and pass it along to the
2181 * function to attach either it or all tasks in its threadgroup. Will lock 2220 * function to attach either it or all tasks in its threadgroup. Will take
2182 * cgroup_mutex and threadgroup; may take task_lock of task. 2221 * cgroup_mutex; may take task_lock of task.
2183 */ 2222 */
2184static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2223static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2185{ 2224{
@@ -2190,68 +2229,66 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2190 if (!cgroup_lock_live_group(cgrp)) 2229 if (!cgroup_lock_live_group(cgrp))
2191 return -ENODEV; 2230 return -ENODEV;
2192 2231
2193retry_find_task:
2194 rcu_read_lock();
2195 if (pid) { 2232 if (pid) {
2233 rcu_read_lock();
2196 tsk = find_task_by_vpid(pid); 2234 tsk = find_task_by_vpid(pid);
2197 if (!tsk) { 2235 if (!tsk) {
2198 rcu_read_unlock(); 2236 rcu_read_unlock();
2199 ret = -ESRCH; 2237 cgroup_unlock();
2200 goto out_unlock_cgroup; 2238 return -ESRCH;
2201 } 2239 }
2240 if (threadgroup) {
2241 /*
2242 * RCU protects this access, since tsk was found in the
2243 * tid map. a race with de_thread may cause group_leader
2244 * to stop being the leader, but cgroup_attach_proc will
2245 * detect it later.
2246 */
2247 tsk = tsk->group_leader;
2248 } else if (tsk->flags & PF_EXITING) {
2249 /* optimization for the single-task-only case */
2250 rcu_read_unlock();
2251 cgroup_unlock();
2252 return -ESRCH;
2253 }
2254
2202 /* 2255 /*
2203 * even if we're attaching all tasks in the thread group, we 2256 * even if we're attaching all tasks in the thread group, we
2204 * only need to check permissions on one of them. 2257 * only need to check permissions on one of them.
2205 */ 2258 */
2206 tcred = __task_cred(tsk); 2259 tcred = __task_cred(tsk);
2207 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 2260 if (cred->euid &&
2208 !uid_eq(cred->euid, tcred->uid) && 2261 cred->euid != tcred->uid &&
2209 !uid_eq(cred->euid, tcred->suid)) { 2262 cred->euid != tcred->suid) {
2210 rcu_read_unlock(); 2263 /*
2211 ret = -EACCES; 2264 * if the default permission check fails, give each
2212 goto out_unlock_cgroup; 2265 * cgroup a chance to extend the permission check
2266 */
2267 ret = cgroup_allow_attach(cgrp, tsk);
2268 if (ret) {
2269 rcu_read_unlock();
2270 cgroup_unlock();
2271 return ret;
2272 }
2213 } 2273 }
2214 } else 2274 get_task_struct(tsk);
2215 tsk = current;
2216
2217 if (threadgroup)
2218 tsk = tsk->group_leader;
2219
2220 /*
2221 * Workqueue threads may acquire PF_THREAD_BOUND and become
2222 * trapped in a cpuset, or RT worker may be born in a cgroup
2223 * with no rt_runtime allocated. Just say no.
2224 */
2225 if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
2226 ret = -EINVAL;
2227 rcu_read_unlock(); 2275 rcu_read_unlock();
2228 goto out_unlock_cgroup; 2276 } else {
2277 if (threadgroup)
2278 tsk = current->group_leader;
2279 else
2280 tsk = current;
2281 get_task_struct(tsk);
2229 } 2282 }
2230 2283
2231 get_task_struct(tsk);
2232 rcu_read_unlock();
2233
2234 threadgroup_lock(tsk);
2235 if (threadgroup) { 2284 if (threadgroup) {
2236 if (!thread_group_leader(tsk)) { 2285 threadgroup_fork_write_lock(tsk);
2237 /*
2238 * a race with de_thread from another thread's exec()
2239 * may strip us of our leadership, if this happens,
2240 * there is no choice but to throw this task away and
2241 * try again; this is
2242 * "double-double-toil-and-trouble-check locking".
2243 */
2244 threadgroup_unlock(tsk);
2245 put_task_struct(tsk);
2246 goto retry_find_task;
2247 }
2248 ret = cgroup_attach_proc(cgrp, tsk); 2286 ret = cgroup_attach_proc(cgrp, tsk);
2249 } else 2287 threadgroup_fork_write_unlock(tsk);
2288 } else {
2250 ret = cgroup_attach_task(cgrp, tsk); 2289 ret = cgroup_attach_task(cgrp, tsk);
2251 threadgroup_unlock(tsk); 2290 }
2252
2253 put_task_struct(tsk); 2291 put_task_struct(tsk);
2254out_unlock_cgroup:
2255 cgroup_unlock(); 2292 cgroup_unlock();
2256 return ret; 2293 return ret;
2257} 2294}
@@ -2263,7 +2300,16 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2263 2300
2264static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2301static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2265{ 2302{
2266 return attach_task_by_pid(cgrp, tgid, true); 2303 int ret;
2304 do {
2305 /*
2306 * attach_proc fails with -EAGAIN if threadgroup leadership
2307 * changes in the middle of the operation, in which case we need
2308 * to find the task_struct for the new leader and start over.
2309 */
2310 ret = attach_task_by_pid(cgrp, tgid, true);
2311 } while (ret == -EAGAIN);
2312 return ret;
2267} 2313}
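
The retry loop in cgroup_procs_write() treats -EAGAIN as "the thread-group leader changed under us; look it up again and retry." A generic sketch of that wrapper is below; do_attach() is a stub invented for illustration, and the max_tries bound is an addition not present in the kernel loop.

    #include <errno.h>

    /* Stub for illustration; the real callee would re-find the leader and attach. */
    static int do_attach(unsigned long tgid)
    {
            (void)tgid;
            return 0;
    }

    int attach_with_retry(unsigned long tgid, int max_tries)
    {
            int ret;

            do {
                    ret = do_attach(tgid);  /* may report -EAGAIN on a leadership race */
            } while (ret == -EAGAIN && --max_tries > 0);

            return ret;
    }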
2268 2314
2269/** 2315/**
@@ -2292,9 +2338,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2292 return -EINVAL; 2338 return -EINVAL;
2293 if (!cgroup_lock_live_group(cgrp)) 2339 if (!cgroup_lock_live_group(cgrp))
2294 return -ENODEV; 2340 return -ENODEV;
2295 mutex_lock(&cgroup_root_mutex);
2296 strcpy(cgrp->root->release_agent_path, buffer); 2341 strcpy(cgrp->root->release_agent_path, buffer);
2297 mutex_unlock(&cgroup_root_mutex);
2298 cgroup_unlock(); 2342 cgroup_unlock();
2299 return 0; 2343 return 0;
2300} 2344}
@@ -2540,64 +2584,6 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2540 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2584 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2541} 2585}
2542 2586
2543static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2544{
2545 if (S_ISDIR(dentry->d_inode->i_mode))
2546 return &__d_cgrp(dentry)->xattrs;
2547 else
2548 return &__d_cft(dentry)->xattrs;
2549}
2550
2551static inline int xattr_enabled(struct dentry *dentry)
2552{
2553 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2554 return test_bit(ROOT_XATTR, &root->flags);
2555}
2556
2557static bool is_valid_xattr(const char *name)
2558{
2559 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2560 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2561 return true;
2562 return false;
2563}
2564
2565static int cgroup_setxattr(struct dentry *dentry, const char *name,
2566 const void *val, size_t size, int flags)
2567{
2568 if (!xattr_enabled(dentry))
2569 return -EOPNOTSUPP;
2570 if (!is_valid_xattr(name))
2571 return -EINVAL;
2572 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2573}
2574
2575static int cgroup_removexattr(struct dentry *dentry, const char *name)
2576{
2577 if (!xattr_enabled(dentry))
2578 return -EOPNOTSUPP;
2579 if (!is_valid_xattr(name))
2580 return -EINVAL;
2581 return simple_xattr_remove(__d_xattrs(dentry), name);
2582}
2583
2584static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2585 void *buf, size_t size)
2586{
2587 if (!xattr_enabled(dentry))
2588 return -EOPNOTSUPP;
2589 if (!is_valid_xattr(name))
2590 return -EINVAL;
2591 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2592}
2593
2594static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2595{
2596 if (!xattr_enabled(dentry))
2597 return -EOPNOTSUPP;
2598 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2599}
2600
2601static const struct file_operations cgroup_file_operations = { 2587static const struct file_operations cgroup_file_operations = {
2602 .read = cgroup_file_read, 2588 .read = cgroup_file_read,
2603 .write = cgroup_file_write, 2589 .write = cgroup_file_write,
@@ -2606,25 +2592,14 @@ static const struct file_operations cgroup_file_operations = {
2606 .release = cgroup_file_release, 2592 .release = cgroup_file_release,
2607}; 2593};
2608 2594
2609static const struct inode_operations cgroup_file_inode_operations = {
2610 .setxattr = cgroup_setxattr,
2611 .getxattr = cgroup_getxattr,
2612 .listxattr = cgroup_listxattr,
2613 .removexattr = cgroup_removexattr,
2614};
2615
2616static const struct inode_operations cgroup_dir_inode_operations = { 2595static const struct inode_operations cgroup_dir_inode_operations = {
2617 .lookup = cgroup_lookup, 2596 .lookup = cgroup_lookup,
2618 .mkdir = cgroup_mkdir, 2597 .mkdir = cgroup_mkdir,
2619 .rmdir = cgroup_rmdir, 2598 .rmdir = cgroup_rmdir,
2620 .rename = cgroup_rename, 2599 .rename = cgroup_rename,
2621 .setxattr = cgroup_setxattr,
2622 .getxattr = cgroup_getxattr,
2623 .listxattr = cgroup_listxattr,
2624 .removexattr = cgroup_removexattr,
2625}; 2600};
2626 2601
2627static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2602static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2628{ 2603{
2629 if (dentry->d_name.len > NAME_MAX) 2604 if (dentry->d_name.len > NAME_MAX)
2630 return ERR_PTR(-ENAMETOOLONG); 2605 return ERR_PTR(-ENAMETOOLONG);
@@ -2642,7 +2617,7 @@ static inline struct cftype *__file_cft(struct file *file)
2642 return __d_cft(file->f_dentry); 2617 return __d_cft(file->f_dentry);
2643} 2618}
2644 2619
2645static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2620static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2646 struct super_block *sb) 2621 struct super_block *sb)
2647{ 2622{
2648 struct inode *inode; 2623 struct inode *inode;
@@ -2662,27 +2637,45 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2662 2637
2663 /* start off with i_nlink == 2 (for "." entry) */ 2638 /* start off with i_nlink == 2 (for "." entry) */
2664 inc_nlink(inode); 2639 inc_nlink(inode);
2665 inc_nlink(dentry->d_parent->d_inode);
2666 2640
2667 /* 2641 /* start with the directory inode held, so that we can
2668 * Control reaches here with cgroup_mutex held. 2642 * populate it without racing with another mkdir */
2669 * @inode->i_mutex should nest outside cgroup_mutex but we 2643 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2670 * want to populate it immediately without releasing
2671 * cgroup_mutex. As @inode isn't visible to anyone else
2672 * yet, trylock will always succeed without affecting
2673 * lockdep checks.
2674 */
2675 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2676 } else if (S_ISREG(mode)) { 2644 } else if (S_ISREG(mode)) {
2677 inode->i_size = 0; 2645 inode->i_size = 0;
2678 inode->i_fop = &cgroup_file_operations; 2646 inode->i_fop = &cgroup_file_operations;
2679 inode->i_op = &cgroup_file_inode_operations;
2680 } 2647 }
2681 d_instantiate(dentry, inode); 2648 d_instantiate(dentry, inode);
2682 dget(dentry); /* Extra count - pin the dentry in core */ 2649 dget(dentry); /* Extra count - pin the dentry in core */
2683 return 0; 2650 return 0;
2684} 2651}
2685 2652
2653/*
2654 * cgroup_create_dir - create a directory for an object.
2655 * @cgrp: the cgroup we create the directory for. It must have a valid
2656 * ->parent field. And we are going to fill its ->dentry field.
2657 * @dentry: dentry of the new cgroup
2658 * @mode: mode to set on new directory.
2659 */
2660static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2661 mode_t mode)
2662{
2663 struct dentry *parent;
2664 int error = 0;
2665
2666 parent = cgrp->parent->dentry;
2667 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2668 if (!error) {
2669 dentry->d_fsdata = cgrp;
2670 inc_nlink(parent->d_inode);
2671 rcu_assign_pointer(cgrp->dentry, dentry);
2672 dget(dentry);
2673 }
2674 dput(dentry);
2675
2676 return error;
2677}
2678
2686/** 2679/**
2687 * cgroup_file_mode - deduce file mode of a control file 2680 * cgroup_file_mode - deduce file mode of a control file
2688 * @cft: the control file in question 2681 * @cft: the control file in question
@@ -2692,9 +2685,9 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,

2692 * returns S_IRUGO if it has only a read handler 2685 * returns S_IRUGO if it has only a read handler
2693 * returns S_IWUSR if it has only a write handler 2686 * returns S_IWUSR if it has only a write handler
2694 */ 2687 */
2695static umode_t cgroup_file_mode(const struct cftype *cft) 2688static mode_t cgroup_file_mode(const struct cftype *cft)
2696{ 2689{
2697 umode_t mode = 0; 2690 mode_t mode = 0;
2698 2691
2699 if (cft->mode) 2692 if (cft->mode)
2700 return cft->mode; 2693 return cft->mode;
@@ -2710,193 +2703,50 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2710 return mode; 2703 return mode;
2711} 2704}
2712 2705
2713static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2706int cgroup_add_file(struct cgroup *cgrp,
2714 struct cftype *cft) 2707 struct cgroup_subsys *subsys,
2708 const struct cftype *cft)
2715{ 2709{
2716 struct dentry *dir = cgrp->dentry; 2710 struct dentry *dir = cgrp->dentry;
2717 struct cgroup *parent = __d_cgrp(dir);
2718 struct dentry *dentry; 2711 struct dentry *dentry;
2719 struct cfent *cfe;
2720 int error; 2712 int error;
2721 umode_t mode; 2713 mode_t mode;
2722 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2723
2724 simple_xattrs_init(&cft->xattrs);
2725 2714
2715 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2726 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2716 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2727 strcpy(name, subsys->name); 2717 strcpy(name, subsys->name);
2728 strcat(name, "."); 2718 strcat(name, ".");
2729 } 2719 }
2730 strcat(name, cft->name); 2720 strcat(name, cft->name);
2731
2732 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2721 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2733
2734 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2735 if (!cfe)
2736 return -ENOMEM;
2737
2738 dentry = lookup_one_len(name, dir, strlen(name)); 2722 dentry = lookup_one_len(name, dir, strlen(name));
2739 if (IS_ERR(dentry)) { 2723 if (!IS_ERR(dentry)) {
2724 mode = cgroup_file_mode(cft);
2725 error = cgroup_create_file(dentry, mode | S_IFREG,
2726 cgrp->root->sb);
2727 if (!error)
2728 dentry->d_fsdata = (void *)cft;
2729 dput(dentry);
2730 } else
2740 error = PTR_ERR(dentry); 2731 error = PTR_ERR(dentry);
2741 goto out;
2742 }
2743
2744 mode = cgroup_file_mode(cft);
2745 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2746 if (!error) {
2747 cfe->type = (void *)cft;
2748 cfe->dentry = dentry;
2749 dentry->d_fsdata = cfe;
2750 list_add_tail(&cfe->node, &parent->files);
2751 cfe = NULL;
2752 }
2753 dput(dentry);
2754out:
2755 kfree(cfe);
2756 return error; 2732 return error;
2757} 2733}
2734EXPORT_SYMBOL_GPL(cgroup_add_file);
2758 2735
2759static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2736int cgroup_add_files(struct cgroup *cgrp,
2760 struct cftype cfts[], bool is_add) 2737 struct cgroup_subsys *subsys,
2761{ 2738 const struct cftype cft[],
2762 struct cftype *cft; 2739 int count)
2763 int err, ret = 0;
2764
2765 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2766 /* does cft->flags tell us to skip this file on @cgrp? */
2767 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2768 continue;
2769 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2770 continue;
2771
2772 if (is_add)
2773 err = cgroup_add_file(cgrp, subsys, cft);
2774 else
2775 err = cgroup_rm_file(cgrp, cft);
2776 if (err) {
2777 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2778 is_add ? "add" : "remove", cft->name, err);
2779 ret = err;
2780 }
2781 }
2782 return ret;
2783}
2784
2785static DEFINE_MUTEX(cgroup_cft_mutex);
2786
2787static void cgroup_cfts_prepare(void)
2788 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
2789{ 2740{
2790 /* 2741 int i, err;
2791 * Thanks to the entanglement with vfs inode locking, we can't walk 2742 for (i = 0; i < count; i++) {
2792 * the existing cgroups under cgroup_mutex and create files. 2743 err = cgroup_add_file(cgrp, subsys, &cft[i]);
2793 * Instead, we increment reference on all cgroups and build list of 2744 if (err)
2794 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure 2745 return err;
2795 * exclusive access to the field.
2796 */
2797 mutex_lock(&cgroup_cft_mutex);
2798 mutex_lock(&cgroup_mutex);
2799}
2800
2801static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2802 struct cftype *cfts, bool is_add)
2803 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2804{
2805 LIST_HEAD(pending);
2806 struct cgroup *cgrp, *n;
2807
2808 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2809 if (cfts && ss->root != &rootnode) {
2810 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
2811 dget(cgrp->dentry);
2812 list_add_tail(&cgrp->cft_q_node, &pending);
2813 }
2814 }
2815
2816 mutex_unlock(&cgroup_mutex);
2817
2818 /*
2819 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
2820 * files for all cgroups which were created before.
2821 */
2822 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
2823 struct inode *inode = cgrp->dentry->d_inode;
2824
2825 mutex_lock(&inode->i_mutex);
2826 mutex_lock(&cgroup_mutex);
2827 if (!cgroup_is_removed(cgrp))
2828 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2829 mutex_unlock(&cgroup_mutex);
2830 mutex_unlock(&inode->i_mutex);
2831
2832 list_del_init(&cgrp->cft_q_node);
2833 dput(cgrp->dentry);
2834 } 2746 }
2835
2836 mutex_unlock(&cgroup_cft_mutex);
2837}
2838
2839/**
2840 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2841 * @ss: target cgroup subsystem
2842 * @cfts: zero-length name terminated array of cftypes
2843 *
2844 * Register @cfts to @ss. Files described by @cfts are created for all
2845 * existing cgroups to which @ss is attached and all future cgroups will
2846 * have them too. This function can be called anytime whether @ss is
2847 * attached or not.
2848 *
2849 * Returns 0 on successful registration, -errno on failure. Note that this
2850 * function currently returns 0 as long as @cfts registration is successful
2851 * even if some file creation attempts on existing cgroups fail.
2852 */
2853int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2854{
2855 struct cftype_set *set;
2856
2857 set = kzalloc(sizeof(*set), GFP_KERNEL);
2858 if (!set)
2859 return -ENOMEM;
2860
2861 cgroup_cfts_prepare();
2862 set->cfts = cfts;
2863 list_add_tail(&set->node, &ss->cftsets);
2864 cgroup_cfts_commit(ss, cfts, true);
2865
2866 return 0; 2747 return 0;
2867} 2748}
2868EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2749EXPORT_SYMBOL_GPL(cgroup_add_files);
2869
2870/**
2871 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2872 * @ss: target cgroup subsystem
2873 * @cfts: zero-length name terminated array of cftypes
2874 *
2875 * Unregister @cfts from @ss. Files described by @cfts are removed from
2876 * all existing cgroups to which @ss is attached and all future cgroups
2877 * won't have them either. This function can be called anytime whether @ss
2878 * is attached or not.
2879 *
2880 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2881 * registered with @ss.
2882 */
2883int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2884{
2885 struct cftype_set *set;
2886
2887 cgroup_cfts_prepare();
2888
2889 list_for_each_entry(set, &ss->cftsets, node) {
2890 if (set->cfts == cfts) {
2891 list_del_init(&set->node);
2892 cgroup_cfts_commit(ss, cfts, false);
2893 return 0;
2894 }
2895 }
2896
2897 cgroup_cfts_commit(ss, NULL, false);
2898 return -ENOENT;
2899}
2900 2750
2901/** 2751/**
2902 * cgroup_task_count - count the number of tasks in a cgroup. 2752 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2947,20 +2797,15 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
2947 * using their cgroups capability, we don't maintain the lists running 2797 * using their cgroups capability, we don't maintain the lists running
2948 * through each css_set to its tasks until we see the list actually 2798 * through each css_set to its tasks until we see the list actually
2949 * used - in other words after the first call to cgroup_iter_start(). 2799 * used - in other words after the first call to cgroup_iter_start().
2800 *
2801 * The tasklist_lock is not held here, as do_each_thread() and
2802 * while_each_thread() are protected by RCU.
2950 */ 2803 */
2951static void cgroup_enable_task_cg_lists(void) 2804static void cgroup_enable_task_cg_lists(void)
2952{ 2805{
2953 struct task_struct *p, *g; 2806 struct task_struct *p, *g;
2954 write_lock(&css_set_lock); 2807 write_lock(&css_set_lock);
2955 use_task_css_set_links = 1; 2808 use_task_css_set_links = 1;
2956 /*
2957 * We need tasklist_lock because RCU is not safe against
2958 * while_each_thread(). Besides, a forking task that has passed
2959 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2960 * is not guaranteed to have its child immediately visible in the
2961 * tasklist if we walk through it with RCU.
2962 */
2963 read_lock(&tasklist_lock);
2964 do_each_thread(g, p) { 2809 do_each_thread(g, p) {
2965 task_lock(p); 2810 task_lock(p);
2966 /* 2811 /*
@@ -2972,98 +2817,10 @@ static void cgroup_enable_task_cg_lists(void)
2972 list_add(&p->cg_list, &p->cgroups->tasks); 2817 list_add(&p->cg_list, &p->cgroups->tasks);
2973 task_unlock(p); 2818 task_unlock(p);
2974 } while_each_thread(g, p); 2819 } while_each_thread(g, p);
2975 read_unlock(&tasklist_lock);
2976 write_unlock(&css_set_lock); 2820 write_unlock(&css_set_lock);
2977} 2821}
2978 2822
2979/**
2980 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2981 * @pos: the current position (%NULL to initiate traversal)
2982 * @cgroup: cgroup whose descendants to walk
2983 *
2984 * To be used by cgroup_for_each_descendant_pre(). Find the next
2985 * descendant to visit for pre-order traversal of @cgroup's descendants.
2986 */
2987struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2988 struct cgroup *cgroup)
2989{
2990 struct cgroup *next;
2991
2992 WARN_ON_ONCE(!rcu_read_lock_held());
2993
2994 /* if first iteration, pretend we just visited @cgroup */
2995 if (!pos) {
2996 if (list_empty(&cgroup->children))
2997 return NULL;
2998 pos = cgroup;
2999 }
3000
3001 /* visit the first child if exists */
3002 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
3003 if (next)
3004 return next;
3005
3006 /* no child, visit my or the closest ancestor's next sibling */
3007 do {
3008 next = list_entry_rcu(pos->sibling.next, struct cgroup,
3009 sibling);
3010 if (&next->sibling != &pos->parent->children)
3011 return next;
3012
3013 pos = pos->parent;
3014 } while (pos != cgroup);
3015
3016 return NULL;
3017}
3018EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3019
3020static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3021{
3022 struct cgroup *last;
3023
3024 do {
3025 last = pos;
3026 pos = list_first_or_null_rcu(&pos->children, struct cgroup,
3027 sibling);
3028 } while (pos);
3029
3030 return last;
3031}
3032
3033/**
3034 * cgroup_next_descendant_post - find the next descendant for post-order walk
3035 * @pos: the current position (%NULL to initiate traversal)
3036 * @cgroup: cgroup whose descendants to walk
3037 *
3038 * To be used by cgroup_for_each_descendant_post(). Find the next
3039 * descendant to visit for post-order traversal of @cgroup's descendants.
3040 */
3041struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3042 struct cgroup *cgroup)
3043{
3044 struct cgroup *next;
3045
3046 WARN_ON_ONCE(!rcu_read_lock_held());
3047
3048 /* if first iteration, visit the leftmost descendant */
3049 if (!pos) {
3050 next = cgroup_leftmost_descendant(cgroup);
3051 return next != cgroup ? next : NULL;
3052 }
3053
3054 /* if there's an unvisited sibling, visit its leftmost descendant */
3055 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3056 if (&next->sibling != &pos->parent->children)
3057 return cgroup_leftmost_descendant(next);
3058
3059 /* no sibling left, visit parent */
3060 next = pos->parent;
3061 return next != cgroup ? next : NULL;
3062}
3063EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
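
The removed cgroup_next_descendant_pre()/_post() helpers implement an iterator-style tree walk: each call returns the next node, so callers can hold rcu_read_lock() around a plain loop instead of recursing. A compact user-space sketch of the pre-order variant, over an invented parent/first-child/next-sibling node layout (no RCU, no kernel list primitives):

    #include <stddef.h>

    struct tnode {
            struct tnode *parent;
            struct tnode *first_child;
            struct tnode *next_sibling;
    };

    /*
     * Return the next node of a pre-order walk of root's descendants, or NULL
     * when the walk is done. Pass pos == NULL to start; root itself is not
     * visited, matching the removed kernel helper's contract.
     */
    struct tnode *next_descendant_pre(struct tnode *pos, struct tnode *root)
    {
            if (!pos)                               /* first call: leftmost child */
                    return root->first_child;

            if (pos->first_child)                   /* go deeper first */
                    return pos->first_child;

            while (pos != root) {                   /* else: nearest ancestor's next sibling */
                    if (pos->next_sibling)
                            return pos->next_sibling;
                    pos = pos->parent;
            }
            return NULL;
    }

A caller then loops for (n = next_descendant_pre(NULL, root); n; n = next_descendant_pre(n, root)), which is the shape the cgroup_for_each_descendant_pre() macro referenced in the removed comment wraps around the kernel version.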
3064
3065void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2823void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3066 __acquires(css_set_lock)
3067{ 2824{
3068 /* 2825 /*
3069 * The first time anyone tries to iterate across a cgroup, 2826 * The first time anyone tries to iterate across a cgroup,
@@ -3103,7 +2860,6 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3103} 2860}
3104 2861
3105void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2862void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
3106 __releases(css_set_lock)
3107{ 2863{
3108 read_unlock(&css_set_lock); 2864 read_unlock(&css_set_lock);
3109} 2865}
@@ -3278,38 +3034,6 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3278 * 3034 *
3279 */ 3035 */
3280 3036
3281/* which pidlist file are we talking about? */
3282enum cgroup_filetype {
3283 CGROUP_FILE_PROCS,
3284 CGROUP_FILE_TASKS,
3285};
3286
3287/*
3288 * A pidlist is a list of pids that virtually represents the contents of one
3289 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3290 * a pair (one each for procs, tasks) for each pid namespace that's relevant
3291 * to the cgroup.
3292 */
3293struct cgroup_pidlist {
3294 /*
3295 * used to find which pidlist is wanted. doesn't change as long as
3296 * this particular list stays in the list.
3297 */
3298 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3299 /* array of xids */
3300 pid_t *list;
3301 /* how many elements the above list has */
3302 int length;
3303 /* how many files are using the current array */
3304 int use_count;
3305 /* each of these stored in a list by its cgroup */
3306 struct list_head links;
3307 /* pointer to the cgroup we belong to, for list removal purposes */
3308 struct cgroup *owner;
3309 /* protects the other fields */
3310 struct rw_semaphore mutex;
3311};
3312
3313/* 3037/*
3314 * The following two functions "fix" the issue where there are more pids 3038 * The following two functions "fix" the issue where there are more pids
3315 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 3039 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3408,7 +3132,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3408{ 3132{
3409 struct cgroup_pidlist *l; 3133 struct cgroup_pidlist *l;
3410 /* don't need task_nsproxy() if we're looking at ourself */ 3134 /* don't need task_nsproxy() if we're looking at ourself */
3411 struct pid_namespace *ns = task_active_pid_ns(current); 3135 struct pid_namespace *ns = current->nsproxy->pid_ns;
3412 3136
3413 /* 3137 /*
3414 * We can't drop the pidlist_mutex before taking the l->mutex in case 3138 * We can't drop the pidlist_mutex before taking the l->mutex in case
@@ -3775,7 +3499,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3775 if (flags & POLLHUP) { 3499 if (flags & POLLHUP) {
3776 __remove_wait_queue(event->wqh, &event->wait); 3500 __remove_wait_queue(event->wqh, &event->wait);
3777 spin_lock(&cgrp->event_list_lock); 3501 spin_lock(&cgrp->event_list_lock);
3778 list_del_init(&event->list); 3502 list_del(&event->list);
3779 spin_unlock(&cgrp->event_list_lock); 3503 spin_unlock(&cgrp->event_list_lock);
3780 /* 3504 /*
3781 * We are in atomic context, but cgroup_event_remove() may 3505 * We are in atomic context, but cgroup_event_remove() may
@@ -3912,7 +3636,7 @@ fail:
3912static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3636static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3913 struct cftype *cft) 3637 struct cftype *cft)
3914{ 3638{
3915 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3639 return clone_children(cgrp);
3916} 3640}
3917 3641
3918static int cgroup_clone_children_write(struct cgroup *cgrp, 3642static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3920,9 +3644,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3920 u64 val) 3644 u64 val)
3921{ 3645{
3922 if (val) 3646 if (val)
3923 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3647 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3924 else 3648 else
3925 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3649 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3926 return 0; 3650 return 0;
3927} 3651}
3928 3652
@@ -3961,44 +3685,36 @@ static struct cftype files[] = {
3961 .read_u64 = cgroup_clone_children_read, 3685 .read_u64 = cgroup_clone_children_read,
3962 .write_u64 = cgroup_clone_children_write, 3686 .write_u64 = cgroup_clone_children_write,
3963 }, 3687 },
3964 {
3965 .name = "release_agent",
3966 .flags = CFTYPE_ONLY_ON_ROOT,
3967 .read_seq_string = cgroup_release_agent_show,
3968 .write_string = cgroup_release_agent_write,
3969 .max_write_len = PATH_MAX,
3970 },
3971 { } /* terminate */
3972}; 3688};
3973 3689
3974/** 3690static struct cftype cft_release_agent = {
3975 * cgroup_populate_dir - selectively creation of files in a directory 3691 .name = "release_agent",
3976 * @cgrp: target cgroup 3692 .read_seq_string = cgroup_release_agent_show,
3977 * @base_files: true if the base files should be added 3693 .write_string = cgroup_release_agent_write,
3978 * @subsys_mask: mask of the subsystem ids whose files should be added 3694 .max_write_len = PATH_MAX,
3979 */ 3695};
3980static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 3696
3981 unsigned long subsys_mask) 3697static int cgroup_populate_dir(struct cgroup *cgrp)
3982{ 3698{
3983 int err; 3699 int err;
3984 struct cgroup_subsys *ss; 3700 struct cgroup_subsys *ss;
3985 3701
3986 if (base_files) { 3702 /* First clear out any existing files */
3987 err = cgroup_addrm_files(cgrp, NULL, files, true); 3703 cgroup_clear_directory(cgrp->dentry);
3988 if (err < 0) 3704
3705 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3706 if (err < 0)
3707 return err;
3708
3709 if (cgrp == cgrp->top_cgroup) {
3710 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3989 return err; 3711 return err;
3990 } 3712 }
3991 3713
3992 /* process cftsets of each subsystem */
3993 for_each_subsys(cgrp->root, ss) { 3714 for_each_subsys(cgrp->root, ss) {
3994 struct cftype_set *set; 3715 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
3995 if (!test_bit(ss->subsys_id, &subsys_mask)) 3716 return err;
3996 continue;
3997
3998 list_for_each_entry(set, &ss->cftsets, node)
3999 cgroup_addrm_files(cgrp, ss, set->cfts, true);
4000 } 3717 }
4001
4002 /* This cgroup is ready now */ 3718 /* This cgroup is ready now */
4003 for_each_subsys(cgrp->root, ss) { 3719 for_each_subsys(cgrp->root, ss) {
4004 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3720 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -4014,18 +3730,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4014 return 0; 3730 return 0;
4015} 3731}
4016 3732
4017static void css_dput_fn(struct work_struct *work)
4018{
4019 struct cgroup_subsys_state *css =
4020 container_of(work, struct cgroup_subsys_state, dput_work);
4021 struct dentry *dentry = css->cgroup->dentry;
4022 struct super_block *sb = dentry->d_sb;
4023
4024 atomic_inc(&sb->s_active);
4025 dput(dentry);
4026 deactivate_super(sb);
4027}
4028
4029static void init_cgroup_css(struct cgroup_subsys_state *css, 3733static void init_cgroup_css(struct cgroup_subsys_state *css,
4030 struct cgroup_subsys *ss, 3734 struct cgroup_subsys *ss,
4031 struct cgroup *cgrp) 3735 struct cgroup *cgrp)
@@ -4035,57 +3739,40 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4035 css->flags = 0; 3739 css->flags = 0;
4036 css->id = NULL; 3740 css->id = NULL;
4037 if (cgrp == dummytop) 3741 if (cgrp == dummytop)
4038 css->flags |= CSS_ROOT; 3742 set_bit(CSS_ROOT, &css->flags);
4039 BUG_ON(cgrp->subsys[ss->subsys_id]); 3743 BUG_ON(cgrp->subsys[ss->subsys_id]);
4040 cgrp->subsys[ss->subsys_id] = css; 3744 cgrp->subsys[ss->subsys_id] = css;
4041
4042 /*
4043 * css holds an extra ref to @cgrp->dentry which is put on the last
4044 * css_put(). dput() requires process context, which css_put() may
4045 * be called without. @css->dput_work will be used to invoke
4046 * dput() asynchronously from css_put().
4047 */
4048 INIT_WORK(&css->dput_work, css_dput_fn);
4049} 3745}
4050 3746
4051/* invoke ->post_create() on a new CSS and mark it online if successful */ 3747static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
4052static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4053{ 3748{
4054 int ret = 0; 3749 /* We need to take each hierarchy_mutex in a consistent order */
4055 3750 int i;
4056 lockdep_assert_held(&cgroup_mutex);
4057 3751
4058 if (ss->css_online) 3752 /*
4059 ret = ss->css_online(cgrp); 3753 * No worry about a race with rebind_subsystems that might mess up the
4060 if (!ret) 3754 * locking order, since both parties are under cgroup_mutex.
4061 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 3755 */
4062 return ret; 3756 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3757 struct cgroup_subsys *ss = subsys[i];
3758 if (ss == NULL)
3759 continue;
3760 if (ss->root == root)
3761 mutex_lock(&ss->hierarchy_mutex);
3762 }
4063} 3763}
4064 3764
4065/* if the CSS is online, invoke ->pre_destroy() on it and mark it offline */ 3765static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
4066static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4067 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4068{ 3766{
4069 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3767 int i;
4070
4071 lockdep_assert_held(&cgroup_mutex);
4072
4073 if (!(css->flags & CSS_ONLINE))
4074 return;
4075 3768
4076 /* 3769 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4077 * css_offline() should be called with cgroup_mutex unlocked. See 3770 struct cgroup_subsys *ss = subsys[i];
4078 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for 3771 if (ss == NULL)
4079 * details. This temporary unlocking should go away once 3772 continue;
4080 * cgroup_mutex is unexported from controllers. 3773 if (ss->root == root)
4081 */ 3774 mutex_unlock(&ss->hierarchy_mutex);
4082 if (ss->css_offline) {
4083 mutex_unlock(&cgroup_mutex);
4084 ss->css_offline(cgrp);
4085 mutex_lock(&cgroup_mutex);
4086 } 3775 }
4087
4088 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4089} 3776}
4090 3777
4091/* 3778/*
@@ -4097,7 +3784,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4097 * Must be called with the mutex on the parent inode held 3784 * Must be called with the mutex on the parent inode held
4098 */ 3785 */
4099static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3786static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4100 umode_t mode) 3787 mode_t mode)
4101{ 3788{
4102 struct cgroup *cgrp; 3789 struct cgroup *cgrp;
4103 struct cgroupfs_root *root = parent->root; 3790 struct cgroupfs_root *root = parent->root;
@@ -4105,27 +3792,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4105 struct cgroup_subsys *ss; 3792 struct cgroup_subsys *ss;
4106 struct super_block *sb = root->sb; 3793 struct super_block *sb = root->sb;
4107 3794
4108 /* allocate the cgroup and its ID, 0 is reserved for the root */
4109 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3795 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4110 if (!cgrp) 3796 if (!cgrp)
4111 return -ENOMEM; 3797 return -ENOMEM;
4112 3798
4113 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4114 if (cgrp->id < 0)
4115 goto err_free_cgrp;
4116
4117 /*
4118 * Only live parents can have children. Note that the liveliness
4119 * check isn't strictly necessary because cgroup_mkdir() and
4120 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4121 * anyway so that locking is contained inside cgroup proper and we
4122 * don't get nasty surprises if we ever grow another caller.
4123 */
4124 if (!cgroup_lock_live_group(parent)) {
4125 err = -ENODEV;
4126 goto err_free_id;
4127 }
4128
4129 /* Grab a reference on the superblock so the hierarchy doesn't 3799 /* Grab a reference on the superblock so the hierarchy doesn't
4130 * get deleted on unmount if there are child cgroups. This 3800 * get deleted on unmount if there are child cgroups. This
4131 * can be done outside cgroup_mutex, since the sb can't 3801 * can be done outside cgroup_mutex, since the sb can't
@@ -4133,6 +3803,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4133 * fs */ 3803 * fs */
4134 atomic_inc(&sb->s_active); 3804 atomic_inc(&sb->s_active);
4135 3805
3806 mutex_lock(&cgroup_mutex);
3807
4136 init_cgroup_housekeeping(cgrp); 3808 init_cgroup_housekeeping(cgrp);
4137 3809
4138 cgrp->parent = parent; 3810 cgrp->parent = parent;
@@ -4142,93 +3814,73 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4142 if (notify_on_release(parent)) 3814 if (notify_on_release(parent))
4143 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3815 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4144 3816
4145 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 3817 if (clone_children(parent))
4146 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3818 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
4147 3819
4148 for_each_subsys(root, ss) { 3820 for_each_subsys(root, ss) {
4149 struct cgroup_subsys_state *css; 3821 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
4150 3822
4151 css = ss->css_alloc(cgrp);
4152 if (IS_ERR(css)) { 3823 if (IS_ERR(css)) {
4153 err = PTR_ERR(css); 3824 err = PTR_ERR(css);
4154 goto err_free_all; 3825 goto err_destroy;
4155 } 3826 }
4156 init_cgroup_css(css, ss, cgrp); 3827 init_cgroup_css(css, ss, cgrp);
4157 if (ss->use_id) { 3828 if (ss->use_id) {
4158 err = alloc_css_id(ss, parent, cgrp); 3829 err = alloc_css_id(ss, parent, cgrp);
4159 if (err) 3830 if (err)
4160 goto err_free_all; 3831 goto err_destroy;
4161 } 3832 }
3833 /* At error, ->destroy() callback has to free assigned ID. */
3834 if (clone_children(parent) && ss->post_clone)
3835 ss->post_clone(ss, cgrp);
4162 } 3836 }
4163 3837
4164 /* 3838 cgroup_lock_hierarchy(root);
4165 * Create directory. cgroup_create_file() returns with the new 3839 list_add(&cgrp->sibling, &cgrp->parent->children);
4166 * directory locked on success so that it can be populated without 3840 cgroup_unlock_hierarchy(root);
4167 * dropping cgroup_mutex.
4168 */
4169 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4170 if (err < 0)
4171 goto err_free_all;
4172 lockdep_assert_held(&dentry->d_inode->i_mutex);
4173
4174 /* allocation complete, commit to creation */
4175 dentry->d_fsdata = cgrp;
4176 cgrp->dentry = dentry;
4177 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4178 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4179 root->number_of_cgroups++; 3841 root->number_of_cgroups++;
4180 3842
4181 /* each css holds a ref to the cgroup's dentry */ 3843 err = cgroup_create_dir(cgrp, dentry, mode);
4182 for_each_subsys(root, ss) 3844 if (err < 0)
4183 dget(dentry); 3845 goto err_remove;
4184 3846
4185 /* creation succeeded, notify subsystems */ 3847 set_bit(CGRP_RELEASABLE, &parent->flags);
4186 for_each_subsys(root, ss) {
4187 err = online_css(ss, cgrp);
4188 if (err)
4189 goto err_destroy;
4190 3848
4191 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 3849 /* The cgroup directory was pre-locked for us */
4192 parent->parent) { 3850 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
4193 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4194 current->comm, current->pid, ss->name);
4195 if (!strcmp(ss->name, "memory"))
4196 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4197 ss->warned_broken_hierarchy = true;
4198 }
4199 }
4200 3851
4201 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 3852 err = cgroup_populate_dir(cgrp);
4202 if (err) 3853 /* If err < 0, we have a half-filled directory - oh well ;) */
4203 goto err_destroy;
4204 3854
4205 mutex_unlock(&cgroup_mutex); 3855 mutex_unlock(&cgroup_mutex);
4206 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 3856 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4207 3857
4208 return 0; 3858 return 0;
4209 3859
4210err_free_all: 3860 err_remove:
3861
3862 cgroup_lock_hierarchy(root);
3863 list_del(&cgrp->sibling);
3864 cgroup_unlock_hierarchy(root);
3865 root->number_of_cgroups--;
3866
3867 err_destroy:
3868
4211 for_each_subsys(root, ss) { 3869 for_each_subsys(root, ss) {
4212 if (cgrp->subsys[ss->subsys_id]) 3870 if (cgrp->subsys[ss->subsys_id])
4213 ss->css_free(cgrp); 3871 ss->destroy(ss, cgrp);
4214 } 3872 }
3873
4215 mutex_unlock(&cgroup_mutex); 3874 mutex_unlock(&cgroup_mutex);
3875
4216 /* Release the reference count that we took on the superblock */ 3876 /* Release the reference count that we took on the superblock */
4217 deactivate_super(sb); 3877 deactivate_super(sb);
4218err_free_id:
4219 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4220err_free_cgrp:
4221 kfree(cgrp);
4222 return err;
4223 3878
4224err_destroy: 3879 kfree(cgrp);
4225 cgroup_destroy_locked(cgrp);
4226 mutex_unlock(&cgroup_mutex);
4227 mutex_unlock(&dentry->d_inode->i_mutex);
4228 return err; 3880 return err;
4229} 3881}
4230 3882
4231static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 3883static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4232{ 3884{
4233 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3885 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
4234 3886
@@ -4236,19 +3888,18 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4236 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 3888 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4237} 3889}
4238 3890
4239/*
4240 * Check the reference count on each subsystem. Since we already
4241 * established that there are no tasks in the cgroup, if the css refcount
4242 * is also 1, then there should be no outstanding references, so the
4243 * subsystem is safe to destroy. We scan across all subsystems rather than
4244 * using the per-hierarchy linked list of mounted subsystems since we can
4245 * be called via check_for_release() with no synchronization other than
4246 * RCU, and the subsystem linked list isn't RCU-safe.
4247 */
4248static int cgroup_has_css_refs(struct cgroup *cgrp) 3891static int cgroup_has_css_refs(struct cgroup *cgrp)
4249{ 3892{
3893 /* Check the reference count on each subsystem. Since we
3894 * already established that there are no tasks in the
3895 * cgroup, if the css refcount is also 1, then there should
3896 * be no outstanding references, so the subsystem is safe to
3897 * destroy. We scan across all subsystems rather than using
3898 * the per-hierarchy linked list of mounted subsystems since
3899 * we can be called via check_for_release() with no
3900 * synchronization other than RCU, and the subsystem linked
3901 * list isn't RCU-safe */
4250 int i; 3902 int i;
4251
4252 /* 3903 /*
4253 * We won't need to lock the subsys array, because the subsystems 3904 * We won't need to lock the subsys array, because the subsystems
4254 * we're concerned about aren't going anywhere since our cgroup root 3905 * we're concerned about aren't going anywhere since our cgroup root
@@ -4257,130 +3908,193 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
4257 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3908 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4258 struct cgroup_subsys *ss = subsys[i]; 3909 struct cgroup_subsys *ss = subsys[i];
4259 struct cgroup_subsys_state *css; 3910 struct cgroup_subsys_state *css;
4260
4261 /* Skip subsystems not present or not in this hierarchy */ 3911 /* Skip subsystems not present or not in this hierarchy */
4262 if (ss == NULL || ss->root != cgrp->root) 3912 if (ss == NULL || ss->root != cgrp->root)
4263 continue; 3913 continue;
4264
4265 css = cgrp->subsys[ss->subsys_id]; 3914 css = cgrp->subsys[ss->subsys_id];
4266 /* 3915 /* When called from check_for_release() it's possible
4267 * When called from check_for_release() it's possible
4268 * that by this point the cgroup has been removed 3916 * that by this point the cgroup has been removed
4269 * and the css deleted. But a false-positive doesn't 3917 * and the css deleted. But a false-positive doesn't
4270 * matter, since it can only happen if the cgroup 3918 * matter, since it can only happen if the cgroup
4271 * has been deleted and hence no longer needs the 3919 * has been deleted and hence no longer needs the
4272 * release agent to be called anyway. 3920 * release agent to be called anyway. */
4273 */ 3921 if (css && (atomic_read(&css->refcnt) > 1))
4274 if (css && css_refcnt(css) > 1)
4275 return 1; 3922 return 1;
4276 } 3923 }
4277 return 0; 3924 return 0;
4278} 3925}
4279 3926
4280static int cgroup_destroy_locked(struct cgroup *cgrp) 3927/*
4281 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 3928 * Atomically mark all (or else none) of the cgroup's CSS objects as
3929 * CSS_REMOVED. Return true on success, or false if the cgroup has
3930 * busy subsystems. Call with cgroup_mutex held
3931 */
3932
3933static int cgroup_clear_css_refs(struct cgroup *cgrp)
4282{ 3934{
4283 struct dentry *d = cgrp->dentry;
4284 struct cgroup *parent = cgrp->parent;
4285 DEFINE_WAIT(wait);
4286 struct cgroup_event *event, *tmp;
4287 struct cgroup_subsys *ss; 3935 struct cgroup_subsys *ss;
4288 LIST_HEAD(tmp_list); 3936 unsigned long flags;
3937 bool failed = false;
3938 local_irq_save(flags);
3939 for_each_subsys(cgrp->root, ss) {
3940 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3941 int refcnt;
3942 while (1) {
3943 /* We can only remove a CSS with a refcnt==1 */
3944 refcnt = atomic_read(&css->refcnt);
3945 if (refcnt > 1) {
3946 failed = true;
3947 goto done;
3948 }
3949 BUG_ON(!refcnt);
3950 /*
3951 * Drop the refcnt to 0 while we check other
3952 * subsystems. This will cause any racing
3953 * css_tryget() to spin until we set the
3954 * CSS_REMOVED bits or abort
3955 */
3956 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
3957 break;
3958 cpu_relax();
3959 }
3960 }
3961 done:
3962 for_each_subsys(cgrp->root, ss) {
3963 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3964 if (failed) {
3965 /*
3966 * Restore old refcnt if we previously managed
3967 * to clear it from 1 to 0
3968 */
3969 if (!atomic_read(&css->refcnt))
3970 atomic_set(&css->refcnt, 1);
3971 } else {
3972 /* Commit the fact that the CSS is removed */
3973 set_bit(CSS_REMOVED, &css->flags);
3974 }
3975 }
3976 local_irq_restore(flags);
3977 return !failed;
3978}
4289 3979
4290 lockdep_assert_held(&d->d_inode->i_mutex); 3980/* checks if all of the css_sets attached to a cgroup have a refcount of 0.
4291 lockdep_assert_held(&cgroup_mutex); 3981 * Must be called with css_set_lock held */
3982static int cgroup_css_sets_empty(struct cgroup *cgrp)
3983{
3984 struct cg_cgroup_link *link;
4292 3985
4293 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) 3986 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
4294 return -EBUSY; 3987 struct css_set *cg = link->cg;
3988 if (atomic_read(&cg->refcount) > 0)
3989 return 0;
3990 }
4295 3991
4296 /* 3992 return 1;
4297 * Block new css_tryget() by deactivating refcnt and mark @cgrp 3993}
4298 * removed. This makes future css_tryget() and child creation 3994
4299 * attempts fail thus maintaining the removal conditions verified 3995static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4300 * above. 3996{
4301 */ 3997 struct cgroup *cgrp = dentry->d_fsdata;
4302 for_each_subsys(cgrp->root, ss) { 3998 struct dentry *d;
4303 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3999 struct cgroup *parent;
4000 DEFINE_WAIT(wait);
4001 struct cgroup_event *event, *tmp;
4002 int ret;
4304 4003
4305 WARN_ON(atomic_read(&css->refcnt) < 0); 4004 /* the vfs holds both inode->i_mutex already */
4306 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4005again:
4006 mutex_lock(&cgroup_mutex);
4007 if (!cgroup_css_sets_empty(cgrp)) {
4008 mutex_unlock(&cgroup_mutex);
4009 return -EBUSY;
4307 } 4010 }
4308 set_bit(CGRP_REMOVED, &cgrp->flags); 4011 if (!list_empty(&cgrp->children)) {
4012 mutex_unlock(&cgroup_mutex);
4013 return -EBUSY;
4014 }
4015 mutex_unlock(&cgroup_mutex);
4309 4016
4310 /* tell subsystems to initiate destruction */ 4017 /*
4311 for_each_subsys(cgrp->root, ss) 4018 * In general, subsystem has no css->refcnt after pre_destroy(). But
4312 offline_css(ss, cgrp); 4019 * in racy cases, subsystem may have to get css->refcnt after
4020 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
4021 * makes rmdir return -EBUSY too often. To avoid that, we use a waitqueue
4022 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
4023 * and subsystem's reference count handling. Please see css_get/put
4024 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4025 */
4026 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4313 4027
4314 /* 4028 /*
4315 * Put all the base refs. Each css holds an extra reference to the 4029 * Call pre_destroy handlers of subsys. Notify subsystems
4316 * cgroup's dentry and cgroup removal proceeds regardless of css 4030 * that rmdir() request comes.
4317 * refs. On the last put of each css, whenever that may be, the
4318 * extra dentry ref is put so that dentry destruction happens only
4319 * after all css's are released.
4320 */ 4031 */
4321 for_each_subsys(cgrp->root, ss) 4032 ret = cgroup_call_pre_destroy(cgrp);
4322 css_put(cgrp->subsys[ss->subsys_id]); 4033 if (ret) {
4034 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4035 return ret;
4036 }
4037
4038 mutex_lock(&cgroup_mutex);
4039 parent = cgrp->parent;
4040 if (!cgroup_css_sets_empty(cgrp) || !list_empty(&cgrp->children)) {
4041 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4042 mutex_unlock(&cgroup_mutex);
4043 return -EBUSY;
4044 }
4045 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4046 if (!cgroup_clear_css_refs(cgrp)) {
4047 mutex_unlock(&cgroup_mutex);
4048 /*
4049 * Because someone may call cgroup_wakeup_rmdir_waiter() before
4050 * prepare_to_wait(), we need to check this flag.
4051 */
4052 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4053 schedule();
4054 finish_wait(&cgroup_rmdir_waitq, &wait);
4055 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4056 if (signal_pending(current))
4057 return -EINTR;
4058 goto again;
4059 }
4060 /* No css_tryget() can succeed after this point. */ finish_wait(&cgroup_rmdir_waitq, &wait);
4061 finish_wait(&cgroup_rmdir_waitq, &wait);
4062 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4323 4063
4324 raw_spin_lock(&release_list_lock); 4064 spin_lock(&release_list_lock);
4065 set_bit(CGRP_REMOVED, &cgrp->flags);
4325 if (!list_empty(&cgrp->release_list)) 4066 if (!list_empty(&cgrp->release_list))
4326 list_del_init(&cgrp->release_list); 4067 list_del_init(&cgrp->release_list);
4327 raw_spin_unlock(&release_list_lock); 4068 spin_unlock(&release_list_lock);
4328 4069
4070 cgroup_lock_hierarchy(cgrp->root);
4329 /* delete this cgroup from parent->children */ 4071 /* delete this cgroup from parent->children */
4330 list_del_rcu(&cgrp->sibling); 4072 list_del_init(&cgrp->sibling);
4331 list_del_init(&cgrp->allcg_node); 4073 cgroup_unlock_hierarchy(cgrp->root);
4074
4075 d = dget(cgrp->dentry);
4332 4076
4333 dget(d);
4334 cgroup_d_remove_dir(d); 4077 cgroup_d_remove_dir(d);
4335 dput(d); 4078 dput(d);
4336 4079
4337 set_bit(CGRP_RELEASABLE, &parent->flags);
4338 check_for_release(parent); 4080 check_for_release(parent);
4339 4081
4340 /* 4082 /*
4341 * Unregister events and notify userspace. 4083 * Unregister events and notify userspace.
4343 * Notify userspace about cgroup removal only after rmdir of cgroup 4084 * Notify userspace about cgroup removal only after rmdir of cgroup
4343 * directory to avoid race between userspace and kernelspace. Use 4085 * directory to avoid race between userspace and kernelspace
4344 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
4345 * cgroup_event_wake() is called with the wait queue head locked,
4346 * remove_wait_queue() cannot be called while holding event_list_lock.
4347 */ 4086 */
4348 spin_lock(&cgrp->event_list_lock); 4087 spin_lock(&cgrp->event_list_lock);
4349 list_splice_init(&cgrp->event_list, &tmp_list); 4088 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4350 spin_unlock(&cgrp->event_list_lock); 4089 list_del(&event->list);
4351 list_for_each_entry_safe(event, tmp, &tmp_list, list) {
4352 list_del_init(&event->list);
4353 remove_wait_queue(event->wqh, &event->wait); 4090 remove_wait_queue(event->wqh, &event->wait);
4354 eventfd_signal(event->eventfd, 1); 4091 eventfd_signal(event->eventfd, 1);
4355 schedule_work(&event->remove); 4092 schedule_work(&event->remove);
4356 } 4093 }
4094 spin_unlock(&cgrp->event_list_lock);
4357 4095
4358 return 0;
4359}
4360
4361static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4362{
4363 int ret;
4364
4365 mutex_lock(&cgroup_mutex);
4366 ret = cgroup_destroy_locked(dentry->d_fsdata);
4367 mutex_unlock(&cgroup_mutex); 4096 mutex_unlock(&cgroup_mutex);
4368 4097 return 0;
4369 return ret;
4370}
4371
4372static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4373{
4374 INIT_LIST_HEAD(&ss->cftsets);
4375
4376 /*
4377 * base_cftset is embedded in subsys itself, no need to worry about
4378 * deregistration.
4379 */
4380 if (ss->base_cftypes) {
4381 ss->base_cftset.cfts = ss->base_cftypes;
4382 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4383 }
4384} 4098}
4385 4099
4386static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4100static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
@@ -4389,15 +4103,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4389 4103
4390 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4104 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4391 4105
4392 mutex_lock(&cgroup_mutex);
4393
4394 /* init base cftset */
4395 cgroup_init_cftsets(ss);
4396
4397 /* Create the top cgroup state for this subsystem */ 4106 /* Create the top cgroup state for this subsystem */
4398 list_add(&ss->sibling, &rootnode.subsys_list); 4107 list_add(&ss->sibling, &rootnode.subsys_list);
4399 ss->root = &rootnode; 4108 ss->root = &rootnode;
4400 css = ss->css_alloc(dummytop); 4109 css = ss->create(ss, dummytop);
4401 /* We don't handle early failures gracefully */ 4110 /* We don't handle early failures gracefully */
4402 BUG_ON(IS_ERR(css)); 4111 BUG_ON(IS_ERR(css));
4403 init_cgroup_css(css, ss, dummytop); 4112 init_cgroup_css(css, ss, dummytop);
@@ -4406,7 +4115,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4406 * pointer to this state - since the subsystem is 4115 * pointer to this state - since the subsystem is
4407 * newly registered, all tasks and hence the 4116 * newly registered, all tasks and hence the
4408 * init_css_set is in the subsystem's top cgroup. */ 4117 * init_css_set is in the subsystem's top cgroup. */
4409 init_css_set.subsys[ss->subsys_id] = css; 4118 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
4410 4119
4411 need_forkexit_callback |= ss->fork || ss->exit; 4120 need_forkexit_callback |= ss->fork || ss->exit;
4412 4121
@@ -4415,10 +4124,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4415 * need to invoke fork callbacks here. */ 4124 * need to invoke fork callbacks here. */
4416 BUG_ON(!list_empty(&init_task.tasks)); 4125 BUG_ON(!list_empty(&init_task.tasks));
4417 4126
4127 mutex_init(&ss->hierarchy_mutex);
4128 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4418 ss->active = 1; 4129 ss->active = 1;
4419 BUG_ON(online_css(ss, dummytop));
4420
4421 mutex_unlock(&cgroup_mutex);
4422 4130
4423 /* this function shouldn't be used with modular subsystems, since they 4131 /* this function shouldn't be used with modular subsystems, since they
4424 * need to register a subsys_id, among other things */ 4132 * need to register a subsys_id, among other things */
@@ -4436,12 +4144,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4436 */ 4144 */
4437int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4145int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4438{ 4146{
4147 int i;
4439 struct cgroup_subsys_state *css; 4148 struct cgroup_subsys_state *css;
4440 int i, ret;
4441 4149
4442 /* check name and function validity */ 4150 /* check name and function validity */
4443 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4151 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4444 ss->css_alloc == NULL || ss->css_free == NULL) 4152 ss->create == NULL || ss->destroy == NULL)
4445 return -EINVAL; 4153 return -EINVAL;
4446 4154
4447 /* 4155 /*
@@ -4458,26 +4166,39 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4458 * since cgroup_init_subsys will have already taken care of it. 4166 * since cgroup_init_subsys will have already taken care of it.
4459 */ 4167 */
4460 if (ss->module == NULL) { 4168 if (ss->module == NULL) {
4461 /* a sanity check */ 4169 /* a few sanity checks */
4170 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
4462 BUG_ON(subsys[ss->subsys_id] != ss); 4171 BUG_ON(subsys[ss->subsys_id] != ss);
4463 return 0; 4172 return 0;
4464 } 4173 }
4465 4174
4466 /* init base cftset */ 4175 /*
4467 cgroup_init_cftsets(ss); 4176 * need to register a subsys id before anything else - for example,
4468 4177 * init_cgroup_css needs it.
4178 */
4469 mutex_lock(&cgroup_mutex); 4179 mutex_lock(&cgroup_mutex);
4470 subsys[ss->subsys_id] = ss; 4180 /* find the first empty slot in the array */
4181 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
4182 if (subsys[i] == NULL)
4183 break;
4184 }
4185 if (i == CGROUP_SUBSYS_COUNT) {
4186 /* maximum number of subsystems already registered! */
4187 mutex_unlock(&cgroup_mutex);
4188 return -EBUSY;
4189 }
4190 /* assign ourselves the subsys_id */
4191 ss->subsys_id = i;
4192 subsys[i] = ss;
4471 4193
4472 /* 4194 /*
4473 * no ss->css_alloc seems to need anything important in the ss 4195 * no ss->create seems to need anything important in the ss struct, so
4474 * struct, so this can happen first (i.e. before the rootnode 4196 * this can happen first (i.e. before the rootnode attachment).
4475 * attachment).
4476 */ 4197 */
4477 css = ss->css_alloc(dummytop); 4198 css = ss->create(ss, dummytop);
4478 if (IS_ERR(css)) { 4199 if (IS_ERR(css)) {
4479 /* failure case - need to deassign the subsys[] slot. */ 4200 /* failure case - need to deassign the subsys[] slot. */
4480 subsys[ss->subsys_id] = NULL; 4201 subsys[i] = NULL;
4481 mutex_unlock(&cgroup_mutex); 4202 mutex_unlock(&cgroup_mutex);
4482 return PTR_ERR(css); 4203 return PTR_ERR(css);
4483 } 4204 }
@@ -4489,9 +4210,14 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4489 init_cgroup_css(css, ss, dummytop); 4210 init_cgroup_css(css, ss, dummytop);
4490 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4211 /* init_idr must be after init_cgroup_css because it sets css->id. */
4491 if (ss->use_id) { 4212 if (ss->use_id) {
4492 ret = cgroup_init_idr(ss, css); 4213 int ret = cgroup_init_idr(ss, css);
4493 if (ret) 4214 if (ret) {
4494 goto err_unload; 4215 dummytop->subsys[ss->subsys_id] = NULL;
4216 ss->destroy(ss, dummytop);
4217 subsys[i] = NULL;
4218 mutex_unlock(&cgroup_mutex);
4219 return ret;
4220 }
4495 } 4221 }
4496 4222
4497 /* 4223 /*
@@ -4523,20 +4249,13 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4523 } 4249 }
4524 write_unlock(&css_set_lock); 4250 write_unlock(&css_set_lock);
4525 4251
4252 mutex_init(&ss->hierarchy_mutex);
4253 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4526 ss->active = 1; 4254 ss->active = 1;
4527 ret = online_css(ss, dummytop);
4528 if (ret)
4529 goto err_unload;
4530 4255
4531 /* success! */ 4256 /* success! */
4532 mutex_unlock(&cgroup_mutex); 4257 mutex_unlock(&cgroup_mutex);
4533 return 0; 4258 return 0;
4534
4535err_unload:
4536 mutex_unlock(&cgroup_mutex);
4537 /* @ss can't be mounted here as try_module_get() would fail */
4538 cgroup_unload_subsys(ss);
4539 return ret;
4540} 4259}
4541EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4260EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4542 4261
@@ -4563,16 +4282,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4563 BUG_ON(ss->root != &rootnode); 4282 BUG_ON(ss->root != &rootnode);
4564 4283
4565 mutex_lock(&cgroup_mutex); 4284 mutex_lock(&cgroup_mutex);
4566
4567 offline_css(ss, dummytop);
4568 ss->active = 0;
4569
4570 if (ss->use_id) {
4571 idr_remove_all(&ss->idr);
4572 idr_destroy(&ss->idr);
4573 }
4574
4575 /* deassign the subsys_id */ 4285 /* deassign the subsys_id */
4286 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
4576 subsys[ss->subsys_id] = NULL; 4287 subsys[ss->subsys_id] = NULL;
4577 4288
4578 /* remove subsystem from rootnode's list of subsystems */ 4289 /* remove subsystem from rootnode's list of subsystems */
@@ -4587,6 +4298,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4587 struct css_set *cg = link->cg; 4298 struct css_set *cg = link->cg;
4588 4299
4589 hlist_del(&cg->hlist); 4300 hlist_del(&cg->hlist);
4301 BUG_ON(!cg->subsys[ss->subsys_id]);
4590 cg->subsys[ss->subsys_id] = NULL; 4302 cg->subsys[ss->subsys_id] = NULL;
4591 hhead = css_set_hash(cg->subsys); 4303 hhead = css_set_hash(cg->subsys);
4592 hlist_add_head(&cg->hlist, hhead); 4304 hlist_add_head(&cg->hlist, hhead);
@@ -4594,12 +4306,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4594 write_unlock(&css_set_lock); 4306 write_unlock(&css_set_lock);
4595 4307
4596 /* 4308 /*
4597 * remove subsystem's css from the dummytop and free it - need to 4309 * remove subsystem's css from the dummytop and free it - need to free
4598 * free before marking as null because ss->css_free needs the 4310 * before marking as null because ss->destroy needs the cgrp->subsys
4599 * cgrp->subsys pointer to find their state. note that this also 4311 * pointer to find their state. note that this also takes care of
4600 * takes care of freeing the css_id. 4312 * freeing the css_id.
4601 */ 4313 */
4602 ss->css_free(dummytop); 4314 ss->destroy(ss, dummytop);
4603 dummytop->subsys[ss->subsys_id] = NULL; 4315 dummytop->subsys[ss->subsys_id] = NULL;
4604 4316
4605 mutex_unlock(&cgroup_mutex); 4317 mutex_unlock(&cgroup_mutex);
@@ -4634,17 +4346,14 @@ int __init cgroup_init_early(void)
4634 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 4346 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4635 INIT_HLIST_HEAD(&css_set_table[i]); 4347 INIT_HLIST_HEAD(&css_set_table[i]);
4636 4348
4637 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4349 /* at bootup time, we don't worry about modular subsystems */
4350 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4638 struct cgroup_subsys *ss = subsys[i]; 4351 struct cgroup_subsys *ss = subsys[i];
4639 4352
4640 /* at bootup time, we don't worry about modular subsystems */
4641 if (!ss || ss->module)
4642 continue;
4643
4644 BUG_ON(!ss->name); 4353 BUG_ON(!ss->name);
4645 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4354 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4646 BUG_ON(!ss->css_alloc); 4355 BUG_ON(!ss->create);
4647 BUG_ON(!ss->css_free); 4356 BUG_ON(!ss->destroy);
4648 if (ss->subsys_id != i) { 4357 if (ss->subsys_id != i) {
4649 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4358 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4650 ss->name, ss->subsys_id); 4359 ss->name, ss->subsys_id);
@@ -4673,12 +4382,9 @@ int __init cgroup_init(void)
4673 if (err) 4382 if (err)
4674 return err; 4383 return err;
4675 4384
4676 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4385 /* at bootup time, we don't worry about modular subsystems */
4386 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4677 struct cgroup_subsys *ss = subsys[i]; 4387 struct cgroup_subsys *ss = subsys[i];
4678
4679 /* at bootup time, we don't worry about modular subsystems */
4680 if (!ss || ss->module)
4681 continue;
4682 if (!ss->early_init) 4388 if (!ss->early_init)
4683 cgroup_init_subsys(ss); 4389 cgroup_init_subsys(ss);
4684 if (ss->use_id) 4390 if (ss->use_id)
@@ -4851,30 +4557,41 @@ void cgroup_fork(struct task_struct *child)
4851} 4557}
4852 4558
4853/** 4559/**
4560 * cgroup_fork_callbacks - run fork callbacks
4561 * @child: the new task
4562 *
4563 * Called on a new task very soon before adding it to the
4564 * tasklist. No need to take any locks since no-one can
4565 * be operating on this task.
4566 */
4567void cgroup_fork_callbacks(struct task_struct *child)
4568{
4569 if (need_forkexit_callback) {
4570 int i;
4571 /*
4572 * forkexit callbacks are only supported for builtin
4573 * subsystems, and the builtin section of the subsys array is
4574 * immutable, so we don't need to lock the subsys array here.
4575 */
4576 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4577 struct cgroup_subsys *ss = subsys[i];
4578 if (ss->fork)
4579 ss->fork(ss, child);
4580 }
4581 }
4582}
4583
4584/**
4854 * cgroup_post_fork - called on a new task after adding it to the task list 4585 * cgroup_post_fork - called on a new task after adding it to the task list
4855 * @child: the task in question 4586 * @child: the task in question
4856 * 4587 *
4857 * Adds the task to the list running through its css_set if necessary and 4588 * Adds the task to the list running through its css_set if necessary.
4858 * call the subsystem fork() callbacks. Has to be after the task is 4589 * Has to be after the task is visible on the task list in case we race
4859 * visible on the task list in case we race with the first call to 4590 * with the first call to cgroup_iter_start() - to guarantee that the
4860 * cgroup_iter_start() - to guarantee that the new task ends up on its 4591 * new task ends up on its list.
4861 * list.
4862 */ 4592 */
4863void cgroup_post_fork(struct task_struct *child) 4593void cgroup_post_fork(struct task_struct *child)
4864{ 4594{
4865 int i;
4866
4867 /*
4868 * use_task_css_set_links is set to 1 before we walk the tasklist
4869 * under the tasklist_lock and we read it here after we added the child
4870 * to the tasklist under the tasklist_lock as well. If the child wasn't
4871 * yet in the tasklist when we walked through it from
4872 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
4873 * should be visible now due to the paired locking and barriers implied
4874 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
4875 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
4876 * lock on fork.
4877 */
4878 if (use_task_css_set_links) { 4595 if (use_task_css_set_links) {
4879 write_lock(&css_set_lock); 4596 write_lock(&css_set_lock);
4880 task_lock(child); 4597 task_lock(child);
@@ -4883,30 +4600,7 @@ void cgroup_post_fork(struct task_struct *child)
4883 task_unlock(child); 4600 task_unlock(child);
4884 write_unlock(&css_set_lock); 4601 write_unlock(&css_set_lock);
4885 } 4602 }
4886
4887 /*
4888 * Call ss->fork(). This must happen after @child is linked on
4889 * css_set; otherwise, @child might change state between ->fork()
4890 * and addition to css_set.
4891 */
4892 if (need_forkexit_callback) {
4893 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4894 struct cgroup_subsys *ss = subsys[i];
4895
4896 /*
4897 * fork/exit callbacks are supported only for
4898 * builtin subsystems and we don't need further
4899 * synchronization as they never go away.
4900 */
4901 if (!ss || ss->module)
4902 continue;
4903
4904 if (ss->fork)
4905 ss->fork(child);
4906 }
4907 }
4908} 4603}
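
The use_task_css_set_links comment above (on the removal side of this hunk) argues by plain lock-based publication: a write performed before a lock is released is visible to anyone who later acquires that same lock. The following is a stripped-down userspace illustration of that guarantee using a pthread mutex in place of tasklist_lock; it is a sketch of the general pattern, not the exact fork-path sequence.

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
    static bool published;          /* stands in for use_task_css_set_links */

    /* Writer: set the flag, then drop the lock.  The unlock is a release. */
    static void set_flag(void)
    {
            pthread_mutex_lock(&lk);
            published = true;
            pthread_mutex_unlock(&lk);
    }

    /* Reader: acquiring the same lock afterwards pairs with that release,
     * so the flag written before the unlock is guaranteed to be seen. */
    static bool read_flag(void)
    {
            bool seen;

            pthread_mutex_lock(&lk);
            seen = published;
            pthread_mutex_unlock(&lk);
            return seen;
    }

In cgroup_post_fork() the paired lock is tasklist_lock: the flag is written before tasklist_lock is dropped in cgroup_enable_task_cg_lists() and read only after fork has taken and released the same lock.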
4909
4910/** 4604/**
4911 * cgroup_exit - detach cgroup from exiting task 4605 * cgroup_exit - detach cgroup from exiting task
4912 * @tsk: pointer to task_struct of exiting process 4606 * @tsk: pointer to task_struct of exiting process
@@ -4965,25 +4659,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4965 tsk->cgroups = &init_css_set; 4659 tsk->cgroups = &init_css_set;
4966 4660
4967 if (run_callbacks && need_forkexit_callback) { 4661 if (run_callbacks && need_forkexit_callback) {
4968 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4662 /*
4663 * modular subsystems can't use callbacks, so no need to lock
4664 * the subsys array
4665 */
4666 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4969 struct cgroup_subsys *ss = subsys[i]; 4667 struct cgroup_subsys *ss = subsys[i];
4970
4971 /* modular subsystems can't use callbacks */
4972 if (!ss || ss->module)
4973 continue;
4974
4975 if (ss->exit) { 4668 if (ss->exit) {
4976 struct cgroup *old_cgrp = 4669 struct cgroup *old_cgrp =
4977 rcu_dereference_raw(cg->subsys[i])->cgroup; 4670 rcu_dereference_raw(cg->subsys[i])->cgroup;
4978 struct cgroup *cgrp = task_cgroup(tsk, i); 4671 struct cgroup *cgrp = task_cgroup(tsk, i);
4979 ss->exit(cgrp, old_cgrp, tsk); 4672 ss->exit(ss, cgrp, old_cgrp, tsk);
4980 } 4673 }
4981 } 4674 }
4982 } 4675 }
4983 task_unlock(tsk); 4676 task_unlock(tsk);
4984 4677
4985 if (cg) 4678 if (cg)
4986 put_css_set_taskexit(cg); 4679 put_css_set(cg);
4987} 4680}
4988 4681
4989/** 4682/**
@@ -5024,56 +4717,39 @@ static void check_for_release(struct cgroup *cgrp)
5024 * already queued for a userspace notification, queue 4717 * already queued for a userspace notification, queue
5025 * it now */ 4718 * it now */
5026 int need_schedule_work = 0; 4719 int need_schedule_work = 0;
5027 raw_spin_lock(&release_list_lock); 4720 spin_lock(&release_list_lock);
5028 if (!cgroup_is_removed(cgrp) && 4721 if (!cgroup_is_removed(cgrp) &&
5029 list_empty(&cgrp->release_list)) { 4722 list_empty(&cgrp->release_list)) {
5030 list_add(&cgrp->release_list, &release_list); 4723 list_add(&cgrp->release_list, &release_list);
5031 need_schedule_work = 1; 4724 need_schedule_work = 1;
5032 } 4725 }
5033 raw_spin_unlock(&release_list_lock); 4726 spin_unlock(&release_list_lock);
5034 if (need_schedule_work) 4727 if (need_schedule_work)
5035 schedule_work(&release_agent_work); 4728 schedule_work(&release_agent_work);
5036 } 4729 }
5037} 4730}
5038 4731
5039/* Caller must verify that the css is not for root cgroup */ 4732/* Caller must verify that the css is not for root cgroup */
5040bool __css_tryget(struct cgroup_subsys_state *css) 4733void __css_get(struct cgroup_subsys_state *css, int count)
5041{ 4734{
5042 while (true) { 4735 atomic_add(count, &css->refcnt);
5043 int t, v; 4736 set_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5044
5045 v = css_refcnt(css);
5046 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
5047 if (likely(t == v))
5048 return true;
5049 else if (t < 0)
5050 return false;
5051 cpu_relax();
5052 }
5053} 4737}
5054EXPORT_SYMBOL_GPL(__css_tryget); 4738EXPORT_SYMBOL_GPL(__css_get);
5055 4739
5056/* Caller must verify that the css is not for root cgroup */ 4740/* Caller must verify that the css is not for root cgroup */
5057void __css_put(struct cgroup_subsys_state *css) 4741void __css_put(struct cgroup_subsys_state *css, int count)
5058{ 4742{
5059 struct cgroup *cgrp = css->cgroup; 4743 struct cgroup *cgrp = css->cgroup;
5060 int v; 4744 int val;
5061
5062 rcu_read_lock(); 4745 rcu_read_lock();
5063 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); 4746 val = atomic_sub_return(count, &css->refcnt);
5064 4747 if (val == 1) {
5065 switch (v) { 4748 check_for_release(cgrp);
5066 case 1: 4749 cgroup_wakeup_rmdir_waiter(cgrp);
5067 if (notify_on_release(cgrp)) {
5068 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5069 check_for_release(cgrp);
5070 }
5071 break;
5072 case 0:
5073 schedule_work(&css->dput_work);
5074 break;
5075 } 4750 }
5076 rcu_read_unlock(); 4751 rcu_read_unlock();
4752 WARN_ON_ONCE(val < 1);
5077} 4753}
5078EXPORT_SYMBOL_GPL(__css_put); 4754EXPORT_SYMBOL_GPL(__css_put);
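
The __css_tryget() loop shown above is a standard compare-and-swap retry: read the counter, give up once it has gone negative (the removal path adds CSS_DEACT_BIAS to deactivate it), otherwise try to increment and retry on contention. Below is a minimal userspace analogue of that pattern in C11 atomics; the struct, helper names and bias value are illustrative stand-ins, not the kernel's.

    #include <limits.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    /* Illustrative deactivation bias; the kernel defines its own CSS_DEACT_BIAS. */
    #define DEACT_BIAS (INT_MIN / 2)

    struct ref {
            atomic_int refcnt;      /* starts at 1, like a css base reference */
    };

    /* Try to take a reference; fails once the counter has been deactivated. */
    static bool ref_tryget(struct ref *r)
    {
            int v = atomic_load(&r->refcnt);

            while (v >= 0) {        /* negative means "being destroyed" */
                    /* On CAS failure the current value is reloaded into v. */
                    if (atomic_compare_exchange_weak(&r->refcnt, &v, v + 1))
                            return true;
            }
            return false;
    }

    /* Deactivate: bias the counter negative so every later tryget fails. */
    static void ref_deactivate(struct ref *r)
    {
            atomic_fetch_add(&r->refcnt, DEACT_BIAS);
    }

Deactivation corresponds to the atomic_add(CSS_DEACT_BIAS, &css->refcnt) done in cgroup_destroy_locked() earlier in this diff: once the count is biased negative, every subsequent tryget fails without taking any lock.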
5079 4755
@@ -5104,7 +4780,7 @@ static void cgroup_release_agent(struct work_struct *work)
5104{ 4780{
5105 BUG_ON(work != &release_agent_work); 4781 BUG_ON(work != &release_agent_work);
5106 mutex_lock(&cgroup_mutex); 4782 mutex_lock(&cgroup_mutex);
5107 raw_spin_lock(&release_list_lock); 4783 spin_lock(&release_list_lock);
5108 while (!list_empty(&release_list)) { 4784 while (!list_empty(&release_list)) {
5109 char *argv[3], *envp[3]; 4785 char *argv[3], *envp[3];
5110 int i; 4786 int i;
@@ -5113,7 +4789,7 @@ static void cgroup_release_agent(struct work_struct *work)
5113 struct cgroup, 4789 struct cgroup,
5114 release_list); 4790 release_list);
5115 list_del_init(&cgrp->release_list); 4791 list_del_init(&cgrp->release_list);
5116 raw_spin_unlock(&release_list_lock); 4792 spin_unlock(&release_list_lock);
5117 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4793 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
5118 if (!pathbuf) 4794 if (!pathbuf)
5119 goto continue_free; 4795 goto continue_free;
@@ -5143,9 +4819,9 @@ static void cgroup_release_agent(struct work_struct *work)
5143 continue_free: 4819 continue_free:
5144 kfree(pathbuf); 4820 kfree(pathbuf);
5145 kfree(agentbuf); 4821 kfree(agentbuf);
5146 raw_spin_lock(&release_list_lock); 4822 spin_lock(&release_list_lock);
5147 } 4823 }
5148 raw_spin_unlock(&release_list_lock); 4824 spin_unlock(&release_list_lock);
5149 mutex_unlock(&cgroup_mutex); 4825 mutex_unlock(&cgroup_mutex);
5150} 4826}
5151 4827
@@ -5157,17 +4833,13 @@ static int __init cgroup_disable(char *str)
5157 while ((token = strsep(&str, ",")) != NULL) { 4833 while ((token = strsep(&str, ",")) != NULL) {
5158 if (!*token) 4834 if (!*token)
5159 continue; 4835 continue;
5160 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4836 /*
4837 * cgroup_disable, being at boot time, can't know about module
4838 * subsystems, so we don't worry about them.
4839 */
4840 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
5161 struct cgroup_subsys *ss = subsys[i]; 4841 struct cgroup_subsys *ss = subsys[i];
5162 4842
5163 /*
5164 * cgroup_disable, being at boot time, can't
5165 * know about module subsystems, so we don't
5166 * worry about them.
5167 */
5168 if (!ss || ss->module)
5169 continue;
5170
5171 if (!strcmp(token, ss->name)) { 4843 if (!strcmp(token, ss->name)) {
5172 ss->disabled = 1; 4844 ss->disabled = 1;
5173 printk(KERN_INFO "Disabling %s control group" 4845 printk(KERN_INFO "Disabling %s control group"
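
For reference, the loop above consumes a comma-separated controller list passed on the kernel command line; a typical boot-time use (the controller names here are only examples) looks like:

    cgroup_disable=memory,cpuset

Each matching built-in subsystem simply gets ss->disabled set before any hierarchy is mounted.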
@@ -5196,7 +4868,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5196 * on this or this is under rcu_read_lock(). Once css->id is allocated, 4868 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5197 * it's unchanged until freed. 4869 * it's unchanged until freed.
5198 */ 4870 */
5199 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 4871 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
5200 4872
5201 if (cssid) 4873 if (cssid)
5202 return cssid->id; 4874 return cssid->id;
@@ -5208,7 +4880,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
5208{ 4880{
5209 struct css_id *cssid; 4881 struct css_id *cssid;
5210 4882
5211 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 4883 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
5212 4884
5213 if (cssid) 4885 if (cssid)
5214 return cssid->depth; 4886 return cssid->depth;
@@ -5222,7 +4894,7 @@ EXPORT_SYMBOL_GPL(css_depth);
5222 * @root: the css supporsed to be an ancestor of the child. 4894 * @root: the css supporsed to be an ancestor of the child.
5223 * 4895 *
5224 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because 4896 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
5225 * this function reads css->id, the caller must hold rcu_read_lock(). 4897 * this function reads css->id, this uses rcu_dereference() and rcu_read_lock().
5226 * But, considering usual usage, the csses should be valid objects after test. 4898 * But, considering usual usage, the csses should be valid objects after test.
5227 * Assuming that the caller will do some action to the child if this returns 4899 * Assuming that the caller will do some action to the child if this returns
5228 * true, the caller must take "child"'s reference count. 4900
@@ -5234,18 +4906,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
5234{ 4906{
5235 struct css_id *child_id; 4907 struct css_id *child_id;
5236 struct css_id *root_id; 4908 struct css_id *root_id;
4909 bool ret = true;
5237 4910
4911 rcu_read_lock();
5238 child_id = rcu_dereference(child->id); 4912 child_id = rcu_dereference(child->id);
5239 if (!child_id)
5240 return false;
5241 root_id = rcu_dereference(root->id); 4913 root_id = rcu_dereference(root->id);
5242 if (!root_id) 4914 if (!child_id
5243 return false; 4915 || !root_id
5244 if (child_id->depth < root_id->depth) 4916 || (child_id->depth < root_id->depth)
5245 return false; 4917 || (child_id->stack[root_id->depth] != root_id->id))
5246 if (child_id->stack[root_id->depth] != root_id->id) 4918 ret = false;
5247 return false; 4919 rcu_read_unlock();
5248 return true; 4920 return ret;
5249} 4921}
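
The check above works because every css_id records the ids of all of its ancestors in stack[], indexed by depth, so the ancestry test reduces to two comparisons. A toy stand-in for that invariant follows; the struct is a simplified sketch, not the kernel's struct css_id.

    #include <stdbool.h>

    #define TOY_MAX_DEPTH 8

    struct toy_id {
            int id;
            int depth;                      /* root sits at depth 0 */
            int stack[TOY_MAX_DEPTH + 1];   /* stack[0..depth]: ancestor ids, self last */
    };

    /* root is an ancestor of child iff child is at least as deep and records
     * root's id at root's depth, mirroring the two tests above. */
    static bool toy_is_ancestor(const struct toy_id *child, const struct toy_id *root)
    {
            return child->depth >= root->depth &&
                   child->stack[root->depth] == root->id;
    }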
5250 4922
5251void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4923void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
@@ -5407,8 +5079,6 @@ css_get_next(struct cgroup_subsys *ss, int id,
5407 return NULL; 5079 return NULL;
5408 5080
5409 BUG_ON(!ss->use_id); 5081 BUG_ON(!ss->use_id);
5410 WARN_ON_ONCE(!rcu_read_lock_held());
5411
5412 /* fill start point for scan */ 5082 /* fill start point for scan */
5413 tmpid = id; 5083 tmpid = id;
5414 while (1) { 5084 while (1) {
@@ -5416,7 +5086,10 @@ css_get_next(struct cgroup_subsys *ss, int id,
5416 * scan next entry from bitmap(tree), tmpid is updated after 5086 * scan next entry from bitmap(tree), tmpid is updated after
5417 * idr_get_next(). 5087 * idr_get_next().
5418 */ 5088 */
5089 spin_lock(&ss->id_lock);
5419 tmp = idr_get_next(&ss->idr, &tmpid); 5090 tmp = idr_get_next(&ss->idr, &tmpid);
5091 spin_unlock(&ss->id_lock);
5092
5420 if (!tmp) 5093 if (!tmp)
5421 break; 5094 break;
5422 if (tmp->depth >= depth && tmp->stack[depth] == rootid) { 5095 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
@@ -5456,7 +5129,8 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5456} 5129}
5457 5130
5458#ifdef CONFIG_CGROUP_DEBUG 5131#ifdef CONFIG_CGROUP_DEBUG
5459static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) 5132static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
5133 struct cgroup *cont)
5460{ 5134{
5461 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5135 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5462 5136
@@ -5466,7 +5140,7 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5466 return css; 5140 return css;
5467} 5141}
5468 5142
5469static void debug_css_free(struct cgroup *cont) 5143static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
5470{ 5144{
5471 kfree(cont->subsys[debug_subsys_id]); 5145 kfree(cont->subsys[debug_subsys_id]);
5472} 5146}
@@ -5589,15 +5263,19 @@ static struct cftype debug_files[] = {
5589 .name = "releasable", 5263 .name = "releasable",
5590 .read_u64 = releasable_read, 5264 .read_u64 = releasable_read,
5591 }, 5265 },
5592
5593 { } /* terminate */
5594}; 5266};
5595 5267
5268static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
5269{
5270 return cgroup_add_files(cont, ss, debug_files,
5271 ARRAY_SIZE(debug_files));
5272}
5273
5596struct cgroup_subsys debug_subsys = { 5274struct cgroup_subsys debug_subsys = {
5597 .name = "debug", 5275 .name = "debug",
5598 .css_alloc = debug_css_alloc, 5276 .create = debug_create,
5599 .css_free = debug_css_free, 5277 .destroy = debug_destroy,
5278 .populate = debug_populate,
5600 .subsys_id = debug_subsys_id, 5279 .subsys_id = debug_subsys_id,
5601 .base_cftypes = debug_files,
5602}; 5280};
5603#endif /* CONFIG_CGROUP_DEBUG */ 5281#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 75dda1ea502..a3f638ac3de 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -14,7 +14,7 @@
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
15 */ 15 */
16 16
17#include <linux/export.h> 17#include <linux/module.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/cgroup.h> 19#include <linux/cgroup.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
@@ -22,33 +22,24 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24 24
25/* 25enum freezer_state {
26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is 26 CGROUP_THAWED = 0,
27 * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared 27 CGROUP_FREEZING,
28 * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING 28 CGROUP_FROZEN,
29 * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
30 * its ancestors has FREEZING_SELF set.
31 */
32enum freezer_state_flags {
33 CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
34 CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
35 CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
36 CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
37
38 /* mask for all FREEZING flags */
39 CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
40}; 29};
41 30
42struct freezer { 31struct freezer {
43 struct cgroup_subsys_state css; 32 struct cgroup_subsys_state css;
44 unsigned int state; 33 enum freezer_state state;
45 spinlock_t lock; 34 spinlock_t lock; /* protects _writes_ to state */
46}; 35};
47 36
48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) 37static inline struct freezer *cgroup_freezer(
38 struct cgroup *cgroup)
49{ 39{
50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), 40 return container_of(
51 struct freezer, css); 41 cgroup_subsys_state(cgroup, freezer_subsys_id),
42 struct freezer, css);
52} 43}
53 44
54static inline struct freezer *task_freezer(struct task_struct *task) 45static inline struct freezer *task_freezer(struct task_struct *task)
@@ -57,42 +48,93 @@ static inline struct freezer *task_freezer(struct task_struct *task)
57 struct freezer, css); 48 struct freezer, css);
58} 49}
59 50
60static struct freezer *parent_freezer(struct freezer *freezer) 51static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
61{ 52{
62 struct cgroup *pcg = freezer->css.cgroup->parent; 53 enum freezer_state state = task_freezer(task)->state;
63 54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67} 55}
68 56
69bool cgroup_freezing(struct task_struct *task) 57int cgroup_freezing_or_frozen(struct task_struct *task)
70{ 58{
71 bool ret; 59 int result;
72 60 task_lock(task);
73 rcu_read_lock(); 61 result = __cgroup_freezing_or_frozen(task);
74 ret = task_freezer(task)->state & CGROUP_FREEZING; 62 task_unlock(task);
75 rcu_read_unlock(); 63 return result;
76
77 return ret;
78} 64}
79 65
80/* 66/*
81 * cgroups_write_string() limits the size of freezer state strings to 67 * cgroups_write_string() limits the size of freezer state strings to
82 * CGROUP_LOCAL_BUFFER_SIZE 68 * CGROUP_LOCAL_BUFFER_SIZE
83 */ 69 */
84static const char *freezer_state_strs(unsigned int state) 70static const char *freezer_state_strs[] = {
85{ 71 "THAWED",
86 if (state & CGROUP_FROZEN) 72 "FREEZING",
87 return "FROZEN"; 73 "FROZEN",
88 if (state & CGROUP_FREEZING)
89 return "FREEZING";
90 return "THAWED";
91}; 74};
92 75
76/*
77 * State diagram
78 * Transitions are caused by userspace writes to the freezer.state file.
79 * The values in parenthesis are state labels. The rest are edge labels.
80 *
81 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
 82 *    ^ ^                    |                     |
 83 *    | \_______THAWED_______/                     |
 84 *    \__________________________THAWED____________/
85 */
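
Since every transition in the diagram is driven by a userspace write to freezer.state, a controlling process can walk the whole diagram with two writes. Here is a minimal sketch; the mount point and group name are assumptions, as the real path depends on where the freezer hierarchy is mounted.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static int write_state(const char *path, const char *state)
    {
            int fd = open(path, O_WRONLY);
            ssize_t n;

            if (fd < 0)
                    return -1;
            n = write(fd, state, strlen(state));
            close(fd);
            return n < 0 ? -1 : 0;
    }

    int main(void)
    {
            /* Hypothetical mount point and cgroup name. */
            const char *p = "/sys/fs/cgroup/freezer/demo/freezer.state";

            if (write_state(p, "FROZEN"))   /* THAWED -> FREEZING -> FROZEN */
                    perror("freeze");
            sleep(2);
            if (write_state(p, "THAWED"))   /* back to THAWED from either state */
                    perror("thaw");
            return 0;
    }

Reading freezer.state back returns one of the strings from freezer_state_strs, so userspace can poll for the FREEZING to FROZEN transition.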
86
93struct cgroup_subsys freezer_subsys; 87struct cgroup_subsys freezer_subsys;
94 88
95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) 89/* Locks taken and their ordering
90 * ------------------------------
91 * cgroup_mutex (AKA cgroup_lock)
92 * freezer->lock
93 * css_set_lock
94 * task->alloc_lock (AKA task_lock)
95 * task->sighand->siglock
96 *
97 * cgroup code forces css_set_lock to be taken before task->alloc_lock
98 *
99 * freezer_create(), freezer_destroy():
100 * cgroup_mutex [ by cgroup core ]
101 *
102 * freezer_can_attach():
103 * cgroup_mutex (held by caller of can_attach)
104 *
105 * cgroup_freezing_or_frozen():
106 * task->alloc_lock (to get task's cgroup)
107 *
108 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
109 * freezer->lock
110 * sighand->siglock (if the cgroup is freezing)
111 *
112 * freezer_read():
113 * cgroup_mutex
114 * freezer->lock
115 * write_lock css_set_lock (cgroup iterator start)
116 * task->alloc_lock
117 * read_lock css_set_lock (cgroup iterator start)
118 *
119 * freezer_write() (freeze):
120 * cgroup_mutex
121 * freezer->lock
122 * write_lock css_set_lock (cgroup iterator start)
123 * task->alloc_lock
124 * read_lock css_set_lock (cgroup iterator start)
125 * sighand->siglock (fake signal delivery inside freeze_task())
126 *
127 * freezer_write() (unfreeze):
128 * cgroup_mutex
129 * freezer->lock
130 * write_lock css_set_lock (cgroup iterator start)
131 * task->alloc_lock
132 * read_lock css_set_lock (cgroup iterator start)
133 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
134 * sighand->siglock
135 */
136static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
137 struct cgroup *cgroup)
96{ 138{
97 struct freezer *freezer; 139 struct freezer *freezer;
98 140
@@ -101,388 +143,255 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
101 return ERR_PTR(-ENOMEM); 143 return ERR_PTR(-ENOMEM);
102 144
103 spin_lock_init(&freezer->lock); 145 spin_lock_init(&freezer->lock);
146 freezer->state = CGROUP_THAWED;
104 return &freezer->css; 147 return &freezer->css;
105} 148}
106 149
107/** 150static void freezer_destroy(struct cgroup_subsys *ss,
108 * freezer_css_online - commit creation of a freezer cgroup 151 struct cgroup *cgroup)
109 * @cgroup: cgroup being created
110 *
111 * We're committing to creation of @cgroup. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our
113 * freezer->lock.
114 */
115static int freezer_css_online(struct cgroup *cgroup)
116{ 152{
117 struct freezer *freezer = cgroup_freezer(cgroup); 153 kfree(cgroup_freezer(cgroup));
118 struct freezer *parent = parent_freezer(freezer);
119
120 /*
121 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details.
124 */
125 if (parent)
126 spin_lock_irq(&parent->lock);
127 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
128
129 freezer->state |= CGROUP_FREEZER_ONLINE;
130
131 if (parent && (parent->state & CGROUP_FREEZING)) {
132 freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
133 atomic_inc(&system_freezing_cnt);
134 }
135
136 spin_unlock(&freezer->lock);
137 if (parent)
138 spin_unlock_irq(&parent->lock);
139
140 return 0;
141}
142
143/**
144 * freezer_css_offline - initiate destruction of @cgroup
145 * @cgroup: cgroup being destroyed
146 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count
148 * if it was holding one.
149 */
150static void freezer_css_offline(struct cgroup *cgroup)
151{
152 struct freezer *freezer = cgroup_freezer(cgroup);
153
154 spin_lock_irq(&freezer->lock);
155
156 if (freezer->state & CGROUP_FREEZING)
157 atomic_dec(&system_freezing_cnt);
158
159 freezer->state = 0;
160
161 spin_unlock_irq(&freezer->lock);
162} 154}
163 155
164static void freezer_css_free(struct cgroup *cgroup) 156/* task is frozen or will freeze immediately when next it gets woken */
157static bool is_task_frozen_enough(struct task_struct *task)
165{ 158{
166 kfree(cgroup_freezer(cgroup)); 159 return frozen(task) ||
160 (task_is_stopped_or_traced(task) && freezing(task));
167} 161}
168 162
169/* 163/*
170 * Tasks can be migrated into a different freezer anytime regardless of its 164 * The call to cgroup_lock() in the freezer.state write method prevents
171 * current state. freezer_attach() is responsible for making new tasks 165 * a write to that file racing against an attach, and hence the
172 * conform to the current state. 166 * can_attach() result will remain valid until the attach completes.
173 *
174 * Freezer state changes and task migration are synchronized via
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks.
177 */ 167 */
178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) 168static int freezer_can_attach(struct cgroup_subsys *ss,
169 struct cgroup *new_cgroup,
170 struct task_struct *task)
179{ 171{
180 struct freezer *freezer = cgroup_freezer(new_cgrp); 172 struct freezer *freezer;
181 struct task_struct *task;
182 bool clear_frozen = false;
183
184 spin_lock_irq(&freezer->lock);
185 173
186 /* 174 /*
187 * Make the new tasks conform to the current state of @new_cgrp. 175 * Anything frozen can't move or be moved to/from.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later.
191 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its
193 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
195 */ 176 */
196 cgroup_taskset_for_each(task, new_cgrp, tset) {
197 if (!(freezer->state & CGROUP_FREEZING)) {
198 __thaw_task(task);
199 } else {
200 freeze_task(task);
201 freezer->state &= ~CGROUP_FROZEN;
202 clear_frozen = true;
203 }
204 }
205 177
206 spin_unlock_irq(&freezer->lock); 178 freezer = cgroup_freezer(new_cgroup);
179 if (freezer->state != CGROUP_THAWED)
180 return -EBUSY;
207 181
208 /* 182 return 0;
209 * Propagate FROZEN clearing upwards. We may race with 183}
210 * update_if_frozen(), but as long as both work bottom-up, either 184
211 * update_if_frozen() sees child's FROZEN cleared or we clear the 185static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
212 * parent's FROZEN later. No parent w/ !FROZEN children can be 186{
213 * left FROZEN. 187 rcu_read_lock();
214 */ 188 if (__cgroup_freezing_or_frozen(tsk)) {
215 while (clear_frozen && (freezer = parent_freezer(freezer))) { 189 rcu_read_unlock();
216 spin_lock_irq(&freezer->lock); 190 return -EBUSY;
217 freezer->state &= ~CGROUP_FROZEN;
218 clear_frozen = freezer->state & CGROUP_FREEZING;
219 spin_unlock_irq(&freezer->lock);
220 } 191 }
192 rcu_read_unlock();
193 return 0;
221} 194}
222 195
223static void freezer_fork(struct task_struct *task) 196static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
224{ 197{
225 struct freezer *freezer; 198 struct freezer *freezer;
226 199
200 /*
201 * No lock is needed, since the task isn't on tasklist yet,
202 * so it can't be moved to another cgroup, which means the
203 * freezer won't be removed and will be valid during this
204 * function call. Nevertheless, apply RCU read-side critical
205 * section to suppress RCU lockdep false positives.
206 */
227 rcu_read_lock(); 207 rcu_read_lock();
228 freezer = task_freezer(task); 208 freezer = task_freezer(task);
209 rcu_read_unlock();
229 210
230 /* 211 /*
231 * The root cgroup is non-freezable, so we can skip the 212 * The root cgroup is non-freezable, so we can skip the
232 * following check. 213 * following check.
233 */ 214 */
234 if (!freezer->css.cgroup->parent) 215 if (!freezer->css.cgroup->parent)
235 goto out; 216 return;
236 217
237 spin_lock_irq(&freezer->lock); 218 spin_lock_irq(&freezer->lock);
238 if (freezer->state & CGROUP_FREEZING) 219 BUG_ON(freezer->state == CGROUP_FROZEN);
239 freeze_task(task); 220
221 /* Locking avoids race with FREEZING -> THAWED transitions. */
222 if (freezer->state == CGROUP_FREEZING)
223 freeze_task(task, true);
240 spin_unlock_irq(&freezer->lock); 224 spin_unlock_irq(&freezer->lock);
241out:
242 rcu_read_unlock();
243} 225}
244 226
245/** 227/*
246 * update_if_frozen - update whether a cgroup finished freezing 228 * caller must hold freezer->lock
247 * @cgroup: cgroup of interest
248 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN,
251 * this function checks whether all tasks of this cgroup and the descendant
252 * cgroups finished freezing and, if so, sets FROZEN.
253 *
254 * The caller is responsible for grabbing RCU read lock and calling
255 * update_if_frozen() on all descendants prior to invoking this function.
256 *
257 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details.
260 */ 229 */
261static void update_if_frozen(struct cgroup *cgroup) 230static void update_if_frozen(struct cgroup *cgroup,
231 struct freezer *freezer)
262{ 232{
263 struct freezer *freezer = cgroup_freezer(cgroup);
264 struct cgroup *pos;
265 struct cgroup_iter it; 233 struct cgroup_iter it;
266 struct task_struct *task; 234 struct task_struct *task;
235 unsigned int nfrozen = 0, ntotal = 0;
236 enum freezer_state old_state = freezer->state;
267 237
268 WARN_ON_ONCE(!rcu_read_lock_held());
269
270 spin_lock_irq(&freezer->lock);
271
272 if (!(freezer->state & CGROUP_FREEZING) ||
273 (freezer->state & CGROUP_FROZEN))
274 goto out_unlock;
275
276 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) {
278 struct freezer *child = cgroup_freezer(pos);
279
280 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN))
282 goto out_unlock;
283 }
284
285 /* are all tasks frozen? */
286 cgroup_iter_start(cgroup, &it); 238 cgroup_iter_start(cgroup, &it);
287
288 while ((task = cgroup_iter_next(cgroup, &it))) { 239 while ((task = cgroup_iter_next(cgroup, &it))) {
289 if (freezing(task)) { 240 ntotal++;
290 /* 241 if (is_task_frozen_enough(task))
291 * freezer_should_skip() indicates that the task 242 nfrozen++;
292 * should be skipped when determining freezing 243 }
293 * completion. Consider it frozen in addition to 244
294 * the usual frozen condition. 245 if (old_state == CGROUP_THAWED) {
295 */ 246 BUG_ON(nfrozen > 0);
296 if (!frozen(task) && !freezer_should_skip(task)) 247 } else if (old_state == CGROUP_FREEZING) {
297 goto out_iter_end; 248 if (nfrozen == ntotal)
298 } 249 freezer->state = CGROUP_FROZEN;
250 } else { /* old_state == CGROUP_FROZEN */
251 BUG_ON(nfrozen != ntotal);
299 } 252 }
300 253
301 freezer->state |= CGROUP_FROZEN;
302out_iter_end:
303 cgroup_iter_end(cgroup, &it); 254 cgroup_iter_end(cgroup, &it);
304out_unlock:
305 spin_unlock_irq(&freezer->lock);
306} 255}
307 256
308static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 257static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
309 struct seq_file *m) 258 struct seq_file *m)
310{ 259{
311 struct cgroup *pos; 260 struct freezer *freezer;
312 261 enum freezer_state state;
313 rcu_read_lock();
314 262
315 /* update states bottom-up */ 263 if (!cgroup_lock_live_group(cgroup))
316 cgroup_for_each_descendant_post(pos, cgroup) 264 return -ENODEV;
317 update_if_frozen(pos);
318 update_if_frozen(cgroup);
319 265
320 rcu_read_unlock(); 266 freezer = cgroup_freezer(cgroup);
267 spin_lock_irq(&freezer->lock);
268 state = freezer->state;
269 if (state == CGROUP_FREEZING) {
270 /* We change from FREEZING to FROZEN lazily if the cgroup was
 271 * only partially frozen when we exited the write. */
272 update_if_frozen(cgroup, freezer);
273 state = freezer->state;
274 }
275 spin_unlock_irq(&freezer->lock);
276 cgroup_unlock();
321 277
322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); 278 seq_puts(m, freezer_state_strs[state]);
323 seq_putc(m, '\n'); 279 seq_putc(m, '\n');
324 return 0; 280 return 0;
325} 281}
326 282
327static void freeze_cgroup(struct freezer *freezer) 283static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
328{ 284{
329 struct cgroup *cgroup = freezer->css.cgroup;
330 struct cgroup_iter it; 285 struct cgroup_iter it;
331 struct task_struct *task; 286 struct task_struct *task;
287 unsigned int num_cant_freeze_now = 0;
332 288
289 freezer->state = CGROUP_FREEZING;
333 cgroup_iter_start(cgroup, &it); 290 cgroup_iter_start(cgroup, &it);
334 while ((task = cgroup_iter_next(cgroup, &it))) 291 while ((task = cgroup_iter_next(cgroup, &it))) {
335 freeze_task(task); 292 if (!freeze_task(task, true))
293 continue;
294 if (is_task_frozen_enough(task))
295 continue;
296 if (!freezing(task) && !freezer_should_skip(task))
297 num_cant_freeze_now++;
298 }
336 cgroup_iter_end(cgroup, &it); 299 cgroup_iter_end(cgroup, &it);
300
301 return num_cant_freeze_now ? -EBUSY : 0;
337} 302}
338 303
339static void unfreeze_cgroup(struct freezer *freezer) 304static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
340{ 305{
341 struct cgroup *cgroup = freezer->css.cgroup;
342 struct cgroup_iter it; 306 struct cgroup_iter it;
343 struct task_struct *task; 307 struct task_struct *task;
344 308
345 cgroup_iter_start(cgroup, &it); 309 cgroup_iter_start(cgroup, &it);
346 while ((task = cgroup_iter_next(cgroup, &it))) 310 while ((task = cgroup_iter_next(cgroup, &it))) {
347 __thaw_task(task); 311 thaw_process(task);
312 }
348 cgroup_iter_end(cgroup, &it); 313 cgroup_iter_end(cgroup, &it);
314
315 freezer->state = CGROUP_THAWED;
349} 316}
350 317
351/** 318static int freezer_change_state(struct cgroup *cgroup,
352 * freezer_apply_state - apply state change to a single cgroup_freezer 319 enum freezer_state goal_state)
353 * @freezer: freezer to apply state change to
354 * @freeze: whether to freeze or unfreeze
355 * @state: CGROUP_FREEZING_* flag to set or clear
356 *
357 * Set or clear @state on @cgroup according to @freeze, and perform
358 * freezing or thawing as necessary.
359 */
360static void freezer_apply_state(struct freezer *freezer, bool freeze,
361 unsigned int state)
362{ 320{
363 /* also synchronizes against task migration, see freezer_attach() */ 321 struct freezer *freezer;
364 lockdep_assert_held(&freezer->lock); 322 int retval = 0;
365 323
366 if (!(freezer->state & CGROUP_FREEZER_ONLINE)) 324 freezer = cgroup_freezer(cgroup);
367 return;
368 325
369 if (freeze) { 326 spin_lock_irq(&freezer->lock);
370 if (!(freezer->state & CGROUP_FREEZING))
371 atomic_inc(&system_freezing_cnt);
372 freezer->state |= state;
373 freeze_cgroup(freezer);
374 } else {
375 bool was_freezing = freezer->state & CGROUP_FREEZING;
376
377 freezer->state &= ~state;
378
379 if (!(freezer->state & CGROUP_FREEZING)) {
380 if (was_freezing)
381 atomic_dec(&system_freezing_cnt);
382 freezer->state &= ~CGROUP_FROZEN;
383 unfreeze_cgroup(freezer);
384 }
385 }
386}
387 327
388/** 328 update_if_frozen(cgroup, freezer);
389 * freezer_change_state - change the freezing state of a cgroup_freezer 329 if (goal_state == freezer->state)
390 * @freezer: freezer of interest 330 goto out;
391 * @freeze: whether to freeze or thaw
392 *
393 * Freeze or thaw @freezer according to @freeze. The operations are
394 * recursive - all descendants of @freezer will be affected.
395 */
396static void freezer_change_state(struct freezer *freezer, bool freeze)
397{
398 struct cgroup *pos;
399 331
400 /* update @freezer */ 332 switch (goal_state) {
401 spin_lock_irq(&freezer->lock); 333 case CGROUP_THAWED:
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); 334 unfreeze_cgroup(cgroup, freezer);
335 break;
336 case CGROUP_FROZEN:
337 retval = try_to_freeze_cgroup(cgroup, freezer);
338 break;
339 default:
340 BUG();
341 }
342out:
403 spin_unlock_irq(&freezer->lock); 343 spin_unlock_irq(&freezer->lock);
404 344
405 /* 345 return retval;
406 * Update all its descendants in pre-order traversal. Each
407 * descendant will try to inherit its parent's FREEZING state as
408 * CGROUP_FREEZING_PARENT.
409 */
410 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
412 struct freezer *pos_f = cgroup_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f);
414
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
422 CGROUP_FREEZING_PARENT);
423 spin_unlock_irq(&pos_f->lock);
424 }
425 rcu_read_unlock();
426} 346}
427 347
428static int freezer_write(struct cgroup *cgroup, struct cftype *cft, 348static int freezer_write(struct cgroup *cgroup,
349 struct cftype *cft,
429 const char *buffer) 350 const char *buffer)
430{ 351{
431 bool freeze; 352 int retval;
353 enum freezer_state goal_state;
432 354
433 if (strcmp(buffer, freezer_state_strs(0)) == 0) 355 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
434 freeze = false; 356 goal_state = CGROUP_THAWED;
435 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) 357 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
436 freeze = true; 358 goal_state = CGROUP_FROZEN;
437 else 359 else
438 return -EINVAL; 360 return -EINVAL;
439 361
440 freezer_change_state(cgroup_freezer(cgroup), freeze); 362 if (!cgroup_lock_live_group(cgroup))
441 return 0; 363 return -ENODEV;
442} 364 retval = freezer_change_state(cgroup, goal_state);
443 365 cgroup_unlock();
444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) 366 return retval;
445{
446 struct freezer *freezer = cgroup_freezer(cgroup);
447
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449}
450
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
452{
453 struct freezer *freezer = cgroup_freezer(cgroup);
454
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
456} 367}
457 368
458static struct cftype files[] = { 369static struct cftype files[] = {
459 { 370 {
460 .name = "state", 371 .name = "state",
461 .flags = CFTYPE_NOT_ON_ROOT,
462 .read_seq_string = freezer_read, 372 .read_seq_string = freezer_read,
463 .write_string = freezer_write, 373 .write_string = freezer_write,
464 }, 374 },
465 {
466 .name = "self_freezing",
467 .flags = CFTYPE_NOT_ON_ROOT,
468 .read_u64 = freezer_self_freezing_read,
469 },
470 {
471 .name = "parent_freezing",
472 .flags = CFTYPE_NOT_ON_ROOT,
473 .read_u64 = freezer_parent_freezing_read,
474 },
475 { } /* terminate */
476}; 375};
477 376
377static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
378{
379 if (!cgroup->parent)
380 return 0;
381 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
382}
383
478struct cgroup_subsys freezer_subsys = { 384struct cgroup_subsys freezer_subsys = {
479 .name = "freezer", 385 .name = "freezer",
480 .css_alloc = freezer_css_alloc, 386 .create = freezer_create,
481 .css_online = freezer_css_online, 387 .destroy = freezer_destroy,
482 .css_offline = freezer_css_offline, 388 .populate = freezer_populate,
483 .css_free = freezer_css_free,
484 .subsys_id = freezer_subsys_id, 389 .subsys_id = freezer_subsys_id,
485 .attach = freezer_attach, 390 .can_attach = freezer_can_attach,
391 .can_attach_task = freezer_can_attach_task,
392 .pre_attach = NULL,
393 .attach_task = NULL,
394 .attach = NULL,
486 .fork = freezer_fork, 395 .fork = freezer_fork,
487 .base_cftypes = files, 396 .exit = NULL,
488}; 397};
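
For context on the freezer interface reworked above: both the old and the new code expose a per-cgroup freezer.state file that accepts the strings FROZEN and THAWED. A minimal user-space sketch of driving that file follows; the cgroupfs mount point and the "demo" cgroup name are assumptions for illustration, not part of this patch.

/*
 * Hedged sketch: freeze and thaw a freezer cgroup from user space.
 * The mount point and cgroup name below are assumed for illustration.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static int write_state(const char *path, const char *state)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, state, strlen(state));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* assumed cgroupfs layout */
	const char *path = "/sys/fs/cgroup/freezer/demo/freezer.state";

	if (write_state(path, "FROZEN"))	/* ask the kernel to freeze every task in "demo" */
		perror("freeze");
	sleep(1);
	if (write_state(path, "THAWED"))	/* thaw them again */
		perror("thaw");
	return 0;
}

Note that reading freezer.state back may report FREEZING for a while: in both versions shown above the FREEZING to FROZEN transition happens lazily in update_if_frozen(), once every task has been observed frozen.
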
diff --git a/kernel/compat.c b/kernel/compat.c
index f6150e92dfc..e2435ee9993 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,7 +21,6 @@
21#include <linux/unistd.h> 21#include <linux/unistd.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/export.h>
25#include <linux/migrate.h> 24#include <linux/migrate.h>
26#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
27#include <linux/times.h> 26#include <linux/times.h>
@@ -31,10 +30,11 @@
31#include <asm/uaccess.h> 30#include <asm/uaccess.h>
32 31
33/* 32/*
34 * Get/set struct timeval with struct timespec on the native side 33 * Note that the native side is already converted to a timespec, because
34 * that's what we want anyway.
35 */ 35 */
36static int compat_get_timeval_convert(struct timespec *o, 36static int compat_get_timeval(struct timespec *o,
37 struct compat_timeval __user *i) 37 struct compat_timeval __user *i)
38{ 38{
39 long usec; 39 long usec;
40 40
@@ -45,8 +45,8 @@ static int compat_get_timeval_convert(struct timespec *o,
45 return 0; 45 return 0;
46} 46}
47 47
48static int compat_put_timeval_convert(struct compat_timeval __user *o, 48static int compat_put_timeval(struct compat_timeval __user *o,
49 struct timeval *i) 49 struct timeval *i)
50{ 50{
51 return (put_user(i->tv_sec, &o->tv_sec) || 51 return (put_user(i->tv_sec, &o->tv_sec) ||
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
@@ -116,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
116 if (tv) { 116 if (tv) {
117 struct timeval ktv; 117 struct timeval ktv;
118 do_gettimeofday(&ktv); 118 do_gettimeofday(&ktv);
119 if (compat_put_timeval_convert(tv, &ktv)) 119 if (compat_put_timeval(tv, &ktv))
120 return -EFAULT; 120 return -EFAULT;
121 } 121 }
122 if (tz) { 122 if (tz) {
@@ -134,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
134 struct timezone ktz; 134 struct timezone ktz;
135 135
136 if (tv) { 136 if (tv) {
137 if (compat_get_timeval_convert(&kts, tv)) 137 if (compat_get_timeval(&kts, tv))
138 return -EFAULT; 138 return -EFAULT;
139 } 139 }
140 if (tz) { 140 if (tz) {
@@ -145,29 +145,12 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
145 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); 145 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
146} 146}
147 147
148int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
149{
150 return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
151 __get_user(tv->tv_sec, &ctv->tv_sec) ||
152 __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
153}
154EXPORT_SYMBOL_GPL(get_compat_timeval);
155
156int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
157{
158 return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
159 __put_user(tv->tv_sec, &ctv->tv_sec) ||
160 __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
161}
162EXPORT_SYMBOL_GPL(put_compat_timeval);
163
164int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) 148int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
165{ 149{
166 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || 150 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
167 __get_user(ts->tv_sec, &cts->tv_sec) || 151 __get_user(ts->tv_sec, &cts->tv_sec) ||
168 __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 152 __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
169} 153}
170EXPORT_SYMBOL_GPL(get_compat_timespec);
171 154
172int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) 155int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
173{ 156{
@@ -177,42 +160,6 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
177} 160}
178EXPORT_SYMBOL_GPL(put_compat_timespec); 161EXPORT_SYMBOL_GPL(put_compat_timespec);
179 162
180int compat_get_timeval(struct timeval *tv, const void __user *utv)
181{
182 if (COMPAT_USE_64BIT_TIME)
183 return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0;
184 else
185 return get_compat_timeval(tv, utv);
186}
187EXPORT_SYMBOL_GPL(compat_get_timeval);
188
189int compat_put_timeval(const struct timeval *tv, void __user *utv)
190{
191 if (COMPAT_USE_64BIT_TIME)
192 return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0;
193 else
194 return put_compat_timeval(tv, utv);
195}
196EXPORT_SYMBOL_GPL(compat_put_timeval);
197
198int compat_get_timespec(struct timespec *ts, const void __user *uts)
199{
200 if (COMPAT_USE_64BIT_TIME)
201 return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0;
202 else
203 return get_compat_timespec(ts, uts);
204}
205EXPORT_SYMBOL_GPL(compat_get_timespec);
206
207int compat_put_timespec(const struct timespec *ts, void __user *uts)
208{
209 if (COMPAT_USE_64BIT_TIME)
210 return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0;
211 else
212 return put_compat_timespec(ts, uts);
213}
214EXPORT_SYMBOL_GPL(compat_put_timespec);
215
216static long compat_nanosleep_restart(struct restart_block *restart) 163static long compat_nanosleep_restart(struct restart_block *restart)
217{ 164{
218 struct compat_timespec __user *rmtp; 165 struct compat_timespec __user *rmtp;
@@ -372,54 +319,25 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
372 319
373#ifdef __ARCH_WANT_SYS_SIGPROCMASK 320#ifdef __ARCH_WANT_SYS_SIGPROCMASK
374 321
375/* 322asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
376 * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the 323 compat_old_sigset_t __user *oset)
377 * blocked set of signals to the supplied signal set
378 */
379static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
380{ 324{
381 memcpy(blocked->sig, &set, sizeof(set)); 325 old_sigset_t s;
382} 326 long ret;
383 327 mm_segment_t old_fs;
384asmlinkage long compat_sys_sigprocmask(int how,
385 compat_old_sigset_t __user *nset,
386 compat_old_sigset_t __user *oset)
387{
388 old_sigset_t old_set, new_set;
389 sigset_t new_blocked;
390
391 old_set = current->blocked.sig[0];
392
393 if (nset) {
394 if (get_user(new_set, nset))
395 return -EFAULT;
396 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
397
398 new_blocked = current->blocked;
399
400 switch (how) {
401 case SIG_BLOCK:
402 sigaddsetmask(&new_blocked, new_set);
403 break;
404 case SIG_UNBLOCK:
405 sigdelsetmask(&new_blocked, new_set);
406 break;
407 case SIG_SETMASK:
408 compat_sig_setmask(&new_blocked, new_set);
409 break;
410 default:
411 return -EINVAL;
412 }
413
414 set_current_blocked(&new_blocked);
415 }
416
417 if (oset) {
418 if (put_user(old_set, oset))
419 return -EFAULT;
420 }
421 328
422 return 0; 329 if (set && get_user(s, set))
330 return -EFAULT;
331 old_fs = get_fs();
332 set_fs(KERNEL_DS);
333 ret = sys_sigprocmask(how,
334 set ? (old_sigset_t __user *) &s : NULL,
335 oset ? (old_sigset_t __user *) &s : NULL);
336 set_fs(old_fs);
337 if (ret == 0)
338 if (oset)
339 ret = put_user(s, oset);
340 return ret;
423} 341}
424 342
425#endif 343#endif
@@ -1073,7 +991,15 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
1073 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) 991 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
1074 return -EFAULT; 992 return -EFAULT;
1075 sigset_from_compat(&newset, &newset32); 993 sigset_from_compat(&newset, &newset32);
1076 return sigsuspend(&newset); 994 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
995
996 current->saved_sigmask = current->blocked;
997 set_current_blocked(&newset);
998
999 current->state = TASK_INTERRUPTIBLE;
1000 schedule();
1001 set_restore_sigmask();
1002 return -ERESTARTNOHAND;
1077} 1003}
1078#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ 1004#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
1079 1005
@@ -1215,23 +1141,6 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1215 return 0; 1141 return 0;
1216} 1142}
1217 1143
1218#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL
1219asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
1220 struct compat_timespec __user *interval)
1221{
1222 struct timespec t;
1223 int ret;
1224 mm_segment_t old_fs = get_fs();
1225
1226 set_fs(KERNEL_DS);
1227 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
1228 set_fs(old_fs);
1229 if (put_compat_timespec(&t, interval))
1230 return -EFAULT;
1231 return ret;
1232}
1233#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
1234
1235/* 1144/*
1236 * Allocate user-space memory for the duration of a single system call, 1145 * Allocate user-space memory for the duration of a single system call,
1237 * in order to marshall parameters inside a compat thunk. 1146 * in order to marshall parameters inside a compat thunk.
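
The compat helpers above copy struct compat_timeval field by field rather than as one block, because the 32-bit user-space layout does not match the native one (and, in the *_convert variants, the native side additionally wants a timespec). A simplified sketch of just the width mismatch is below; the field types are illustrative, and the authoritative definitions live in <linux/compat.h>.

/*
 * Simplified sketch of the layout mismatch bridged by the helpers above.
 * Field widths are illustrative only; this shows the widening copy, not
 * the usec -> nsec conversion the kernel's *_convert helpers also do.
 */
#include <stdint.h>

struct compat_timeval_sketch {		/* what a 32-bit task hands in */
	int32_t tv_sec;
	int32_t tv_usec;
};

struct native_timeval_sketch {		/* native 64-bit representation */
	int64_t tv_sec;
	int64_t tv_usec;
};

/* Field-by-field widening copy, in the spirit of compat_get_timeval(). */
static void widen_timeval(struct native_timeval_sketch *dst,
			  const struct compat_timeval_sketch *src)
{
	dst->tv_sec = src->tv_sec;	/* each field is sign-extended 32 -> 64 bits */
	dst->tv_usec = src->tv_usec;
}
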
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
deleted file mode 100644
index e0e07fd5550..00000000000
--- a/kernel/context_tracking.c
+++ /dev/null
@@ -1,83 +0,0 @@
1#include <linux/context_tracking.h>
2#include <linux/rcupdate.h>
3#include <linux/sched.h>
4#include <linux/percpu.h>
5#include <linux/hardirq.h>
6
7struct context_tracking {
8 /*
9 * When active is false, hooks are not set to
10 * minimize overhead: TIF flags are cleared
11 * and calls to user_enter/exit are ignored. This
12 * may be further optimized using static keys.
13 */
14 bool active;
15 enum {
16 IN_KERNEL = 0,
17 IN_USER,
18 } state;
19};
20
21static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
22#ifdef CONFIG_CONTEXT_TRACKING_FORCE
23 .active = true,
24#endif
25};
26
27void user_enter(void)
28{
29 unsigned long flags;
30
31 /*
 32 * Some contexts may involve an exception occurring in an irq,
33 * leading to that nesting:
34 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
35 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
36 * helpers are enough to protect RCU uses inside the exception. So
37 * just return immediately if we detect we are in an IRQ.
38 */
39 if (in_interrupt())
40 return;
41
42 WARN_ON_ONCE(!current->mm);
43
44 local_irq_save(flags);
45 if (__this_cpu_read(context_tracking.active) &&
46 __this_cpu_read(context_tracking.state) != IN_USER) {
47 __this_cpu_write(context_tracking.state, IN_USER);
48 rcu_user_enter();
49 }
50 local_irq_restore(flags);
51}
52
53void user_exit(void)
54{
55 unsigned long flags;
56
57 /*
 58 * Some contexts may involve an exception occurring in an irq,
59 * leading to that nesting:
60 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
61 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
62 * helpers are enough to protect RCU uses inside the exception. So
63 * just return immediately if we detect we are in an IRQ.
64 */
65 if (in_interrupt())
66 return;
67
68 local_irq_save(flags);
69 if (__this_cpu_read(context_tracking.state) == IN_USER) {
70 __this_cpu_write(context_tracking.state, IN_KERNEL);
71 rcu_user_exit();
72 }
73 local_irq_restore(flags);
74}
75
76void context_tracking_task_switch(struct task_struct *prev,
77 struct task_struct *next)
78{
79 if (__this_cpu_read(context_tracking.active)) {
80 clear_tsk_thread_flag(prev, TIF_NOHZ);
81 set_tsk_thread_flag(next, TIF_NOHZ);
82 }
83}
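
The deleted context_tracking.c above is essentially a per-CPU two-state machine toggled on kernel/user boundary crossings. A stripped-down sketch of that state machine, with the per-CPU storage, irq masking and RCU hooks replaced by plain variables for clarity:

/*
 * Minimal sketch of the IN_KERNEL/IN_USER toggle from the removed file.
 * The real code keeps this state per CPU and calls rcu_user_enter()/
 * rcu_user_exit() at the transitions; both are elided here.
 */
enum ct_state { IN_KERNEL = 0, IN_USER };

struct ct_sketch {
	int active;		/* tracking enabled on this CPU? */
	enum ct_state state;
};

static void sketch_user_enter(struct ct_sketch *ct)
{
	if (ct->active && ct->state != IN_USER) {
		ct->state = IN_USER;
		/* real code calls rcu_user_enter() here */
	}
}

static void sketch_user_exit(struct ct_sketch *ct)
{
	if (ct->state == IN_USER) {
		ct->state = IN_KERNEL;
		/* real code calls rcu_user_exit() here */
	}
}
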
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3046a503242..eae3d9b3957 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,18 +10,13 @@
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/unistd.h> 11#include <linux/unistd.h>
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/oom.h> 13#include <linux/module.h>
14#include <linux/rcupdate.h>
15#include <linux/export.h>
16#include <linux/bug.h>
17#include <linux/kthread.h> 14#include <linux/kthread.h>
18#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
19#include <linux/mutex.h> 16#include <linux/mutex.h>
20#include <linux/gfp.h> 17#include <linux/gfp.h>
21#include <linux/suspend.h> 18#include <linux/suspend.h>
22 19
23#include "smpboot.h"
24
25#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
26/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 21/* Serializes the updates to cpu_online_mask, cpu_present_mask */
27static DEFINE_MUTEX(cpu_add_remove_lock); 22static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -80,10 +75,6 @@ void put_online_cpus(void)
80 if (cpu_hotplug.active_writer == current) 75 if (cpu_hotplug.active_writer == current)
81 return; 76 return;
82 mutex_lock(&cpu_hotplug.lock); 77 mutex_lock(&cpu_hotplug.lock);
83
84 if (WARN_ON(!cpu_hotplug.refcount))
85 cpu_hotplug.refcount++; /* try to fix things up */
86
87 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) 78 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
88 wake_up_process(cpu_hotplug.active_writer); 79 wake_up_process(cpu_hotplug.active_writer);
89 mutex_unlock(&cpu_hotplug.lock); 80 mutex_unlock(&cpu_hotplug.lock);
@@ -180,47 +171,6 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
180} 171}
181EXPORT_SYMBOL(unregister_cpu_notifier); 172EXPORT_SYMBOL(unregister_cpu_notifier);
182 173
183/**
184 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
185 * @cpu: a CPU id
186 *
187 * This function walks all processes, finds a valid mm struct for each one and
188 * then clears a corresponding bit in mm's cpumask. While this all sounds
189 * trivial, there are various non-obvious corner cases, which this function
190 * tries to solve in a safe manner.
191 *
192 * Also note that the function uses a somewhat relaxed locking scheme, so it may
193 * be called only for an already offlined CPU.
194 */
195void clear_tasks_mm_cpumask(int cpu)
196{
197 struct task_struct *p;
198
199 /*
200 * This function is called after the cpu is taken down and marked
 201 * offline, so it's not like new tasks will ever get this cpu set in
202 * their mm mask. -- Peter Zijlstra
203 * Thus, we may use rcu_read_lock() here, instead of grabbing
204 * full-fledged tasklist_lock.
205 */
206 WARN_ON(cpu_online(cpu));
207 rcu_read_lock();
208 for_each_process(p) {
209 struct task_struct *t;
210
211 /*
212 * Main thread might exit, but other threads may still have
213 * a valid mm. Find one.
214 */
215 t = find_lock_task_mm(p);
216 if (!t)
217 continue;
218 cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
219 task_unlock(t);
220 }
221 rcu_read_unlock();
222}
223
224static inline void check_for_tasks(int cpu) 174static inline void check_for_tasks(int cpu)
225{ 175{
226 struct task_struct *p; 176 struct task_struct *p;
@@ -228,7 +178,8 @@ static inline void check_for_tasks(int cpu)
228 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
229 for_each_process(p) { 179 for_each_process(p) {
230 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
231 (p->utime || p->stime)) 181 (!cputime_eq(p->utime, cputime_zero) ||
182 !cputime_eq(p->stime, cputime_zero)))
232 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 183 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
233 "(state = %ld, flags = %x)\n", 184 "(state = %ld, flags = %x)\n",
234 p->comm, task_pid_nr(p), cpu, 185 p->comm, task_pid_nr(p), cpu,
@@ -284,13 +235,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
284 __func__, cpu); 235 __func__, cpu);
285 goto out_release; 236 goto out_release;
286 } 237 }
287 smpboot_park_threads(cpu);
288 238
289 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 239 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
290 if (err) { 240 if (err) {
291 /* CPU didn't die: tell everyone. Can't complain. */ 241 /* CPU didn't die: tell everyone. Can't complain. */
292 smpboot_unpark_threads(cpu);
293 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 242 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
243
294 goto out_release; 244 goto out_release;
295 } 245 }
296 BUG_ON(cpu_online(cpu)); 246 BUG_ON(cpu_online(cpu));
@@ -346,25 +296,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
346 int ret, nr_calls = 0; 296 int ret, nr_calls = 0;
347 void *hcpu = (void *)(long)cpu; 297 void *hcpu = (void *)(long)cpu;
348 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 298 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
349 struct task_struct *idle;
350
351 cpu_hotplug_begin();
352
353 if (cpu_online(cpu) || !cpu_present(cpu)) {
354 ret = -EINVAL;
355 goto out;
356 }
357
358 idle = idle_thread_get(cpu);
359 if (IS_ERR(idle)) {
360 ret = PTR_ERR(idle);
361 goto out;
362 }
363 299
364 ret = smpboot_create_threads(cpu); 300 if (cpu_online(cpu) || !cpu_present(cpu))
365 if (ret) 301 return -EINVAL;
366 goto out;
367 302
303 cpu_hotplug_begin();
368 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 304 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
369 if (ret) { 305 if (ret) {
370 nr_calls--; 306 nr_calls--;
@@ -374,21 +310,17 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
374 } 310 }
375 311
376 /* Arch-specific enabling code. */ 312 /* Arch-specific enabling code. */
377 ret = __cpu_up(cpu, idle); 313 ret = __cpu_up(cpu);
378 if (ret != 0) 314 if (ret != 0)
379 goto out_notify; 315 goto out_notify;
380 BUG_ON(!cpu_online(cpu)); 316 BUG_ON(!cpu_online(cpu));
381 317
382 /* Wake the per cpu threads */
383 smpboot_unpark_threads(cpu);
384
385 /* Now call notifier in preparation. */ 318 /* Now call notifier in preparation. */
386 cpu_notify(CPU_ONLINE | mod, hcpu); 319 cpu_notify(CPU_ONLINE | mod, hcpu);
387 320
388out_notify: 321out_notify:
389 if (ret != 0) 322 if (ret != 0)
390 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); 323 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
391out:
392 cpu_hotplug_done(); 324 cpu_hotplug_done();
393 325
394 return ret; 326 return ret;
@@ -430,7 +362,7 @@ int __cpuinit cpu_up(unsigned int cpu)
430 362
431 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 363 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
432 mutex_lock(&zonelists_mutex); 364 mutex_lock(&zonelists_mutex);
433 build_all_zonelists(NULL, NULL); 365 build_all_zonelists(NULL);
434 mutex_unlock(&zonelists_mutex); 366 mutex_unlock(&zonelists_mutex);
435 } 367 }
436#endif 368#endif
@@ -448,11 +380,18 @@ out:
448 cpu_maps_update_done(); 380 cpu_maps_update_done();
449 return err; 381 return err;
450} 382}
451EXPORT_SYMBOL_GPL(cpu_up);
452 383
453#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
454static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
455 386
387void __weak arch_disable_nonboot_cpus_begin(void)
388{
389}
390
391void __weak arch_disable_nonboot_cpus_end(void)
392{
393}
394
456int disable_nonboot_cpus(void) 395int disable_nonboot_cpus(void)
457{ 396{
458 int cpu, first_cpu, error = 0; 397 int cpu, first_cpu, error = 0;
@@ -464,6 +403,7 @@ int disable_nonboot_cpus(void)
464 * with the userspace trying to use the CPU hotplug at the same time 403 * with the userspace trying to use the CPU hotplug at the same time
465 */ 404 */
466 cpumask_clear(frozen_cpus); 405 cpumask_clear(frozen_cpus);
406 arch_disable_nonboot_cpus_begin();
467 407
468 printk("Disabling non-boot CPUs ...\n"); 408 printk("Disabling non-boot CPUs ...\n");
469 for_each_online_cpu(cpu) { 409 for_each_online_cpu(cpu) {
@@ -479,6 +419,8 @@ int disable_nonboot_cpus(void)
479 } 419 }
480 } 420 }
481 421
422 arch_disable_nonboot_cpus_end();
423
482 if (!error) { 424 if (!error) {
483 BUG_ON(num_online_cpus() > 1); 425 BUG_ON(num_online_cpus() > 1);
484 /* Make sure the CPUs won't be enabled by someone else */ 426 /* Make sure the CPUs won't be enabled by someone else */
@@ -528,7 +470,7 @@ out:
528 cpu_maps_update_done(); 470 cpu_maps_update_done();
529} 471}
530 472
531static int __init alloc_frozen_cpus(void) 473static int alloc_frozen_cpus(void)
532{ 474{
533 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) 475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
534 return -ENOMEM; 476 return -ENOMEM;
@@ -601,13 +543,8 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
601} 543}
602 544
603 545
604static int __init cpu_hotplug_pm_sync_init(void) 546int cpu_hotplug_pm_sync_init(void)
605{ 547{
606 /*
607 * cpu_hotplug_pm_callback has higher priority than x86
608 * bsp_pm_callback which depends on cpu_hotplug_pm_callback
609 * to disable cpu hotplug to avoid cpu hotplug race.
610 */
611 pm_notifier(cpu_hotplug_pm_callback, 0); 548 pm_notifier(cpu_hotplug_pm_callback, 0);
612 return 0; 549 return 0;
613} 550}
@@ -731,3 +668,23 @@ void init_cpu_online(const struct cpumask *src)
731{ 668{
732 cpumask_copy(to_cpumask(cpu_online_bits), src); 669 cpumask_copy(to_cpumask(cpu_online_bits), src);
733} 670}
671
672static ATOMIC_NOTIFIER_HEAD(idle_notifier);
673
674void idle_notifier_register(struct notifier_block *n)
675{
676 atomic_notifier_chain_register(&idle_notifier, n);
677}
678EXPORT_SYMBOL_GPL(idle_notifier_register);
679
680void idle_notifier_unregister(struct notifier_block *n)
681{
682 atomic_notifier_chain_unregister(&idle_notifier, n);
683}
684EXPORT_SYMBOL_GPL(idle_notifier_unregister);
685
686void idle_notifier_call_chain(unsigned long val)
687{
688 atomic_notifier_call_chain(&idle_notifier, val, NULL);
689}
690EXPORT_SYMBOL_GPL(idle_notifier_call_chain);
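
The cpu.c hunk above adds an atomic idle notifier chain (idle_notifier_register() and friends) without defining any events. A hedged sketch of a client is below; the DEMO_IDLE_START/DEMO_IDLE_END values are assumptions, since this patch leaves the event encoding to whatever arch code eventually calls idle_notifier_call_chain(), and the prototype location for the registration helpers is likewise assumed.

/*
 * Hedged sketch of a client of the idle notifier chain added above.
 * The event values are assumptions; only the chain itself comes from
 * this patch.
 */
#include <linux/init.h>
#include <linux/notifier.h>

#define DEMO_IDLE_START	1	/* assumed event value */
#define DEMO_IDLE_END	2	/* assumed event value */

static int demo_idle_notify(struct notifier_block *nb, unsigned long event,
			    void *data)
{
	switch (event) {
	case DEMO_IDLE_START:
		/* the CPU is about to go idle: quiesce polling work, etc. */
		break;
	case DEMO_IDLE_END:
		/* the CPU has left idle */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block demo_idle_nb = {
	.notifier_call = demo_idle_notify,
};

static int __init demo_idle_init(void)
{
	idle_notifier_register(&demo_idle_nb);	/* exported by the hunk above */
	return 0;
}
core_initcall(demo_idle_init);
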
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
deleted file mode 100644
index 9656a3c3650..00000000000
--- a/kernel/cpu_pm.c
+++ /dev/null
@@ -1,233 +0,0 @@
1/*
2 * Copyright (C) 2011 Google, Inc.
3 *
4 * Author:
5 * Colin Cross <ccross@android.com>
6 *
7 * This software is licensed under the terms of the GNU General Public
8 * License version 2, as published by the Free Software Foundation, and
9 * may be copied, distributed, and modified under those terms.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/cpu_pm.h>
20#include <linux/module.h>
21#include <linux/notifier.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24
25static DEFINE_RWLOCK(cpu_pm_notifier_lock);
26static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
27
28static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
29{
30 int ret;
31
32 ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
33 nr_to_call, nr_calls);
34
35 return notifier_to_errno(ret);
36}
37
38/**
39 * cpu_pm_register_notifier - register a driver with cpu_pm
40 * @nb: notifier block to register
41 *
42 * Add a driver to a list of drivers that are notified about
43 * CPU and CPU cluster low power entry and exit.
44 *
45 * This function may sleep, and has the same return conditions as
46 * raw_notifier_chain_register.
47 */
48int cpu_pm_register_notifier(struct notifier_block *nb)
49{
50 unsigned long flags;
51 int ret;
52
53 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
54 ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
55 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
60
61/**
62 * cpu_pm_unregister_notifier - unregister a driver with cpu_pm
63 * @nb: notifier block to be unregistered
64 *
65 * Remove a driver from the CPU PM notifier list.
66 *
67 * This function may sleep, and has the same return conditions as
68 * raw_notifier_chain_unregister.
69 */
70int cpu_pm_unregister_notifier(struct notifier_block *nb)
71{
72 unsigned long flags;
73 int ret;
74
75 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
76 ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
77 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
78
79 return ret;
80}
81EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
82
83/**
84 * cpu_pm_enter - CPU low power entry notifier
85 *
86 * Notifies listeners that a single CPU is entering a low power state that may
87 * cause some blocks in the same power domain as the cpu to reset.
88 *
89 * Must be called on the affected CPU with interrupts disabled. Platform is
90 * responsible for ensuring that cpu_pm_enter is not called twice on the same
91 * CPU before cpu_pm_exit is called. Notified drivers can include VFP
92 * co-processor, interrupt controller and its PM extensions, local CPU
93 * timers context save/restore which shouldn't be interrupted. Hence it
94 * must be called with interrupts disabled.
95 *
96 * Return conditions are same as __raw_notifier_call_chain.
97 */
98int cpu_pm_enter(void)
99{
100 int nr_calls;
101 int ret = 0;
102
103 read_lock(&cpu_pm_notifier_lock);
104 ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
105 if (ret)
106 /*
 107 * Inform listeners (nr_calls - 1) about failure of CPU PM
 108 * entry, who were notified earlier to prepare for it.
109 */
110 cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
111 read_unlock(&cpu_pm_notifier_lock);
112
113 return ret;
114}
115EXPORT_SYMBOL_GPL(cpu_pm_enter);
116
117/**
118 * cpu_pm_exit - CPU low power exit notifier
119 *
120 * Notifies listeners that a single CPU is exiting a low power state that may
121 * have caused some blocks in the same power domain as the cpu to reset.
122 *
123 * Notified drivers can include VFP co-processor, interrupt controller
124 * and its PM extensions, local CPU timers context save/restore which
125 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
126 *
127 * Return conditions are same as __raw_notifier_call_chain.
128 */
129int cpu_pm_exit(void)
130{
131 int ret;
132
133 read_lock(&cpu_pm_notifier_lock);
134 ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
135 read_unlock(&cpu_pm_notifier_lock);
136
137 return ret;
138}
139EXPORT_SYMBOL_GPL(cpu_pm_exit);
140
141/**
142 * cpu_cluster_pm_enter - CPU cluster low power entry notifier
143 *
144 * Notifies listeners that all cpus in a power domain are entering a low power
145 * state that may cause some blocks in the same power domain to reset.
146 *
147 * Must be called after cpu_pm_enter has been called on all cpus in the power
148 * domain, and before cpu_pm_exit has been called on any cpu in the power
149 * domain. Notified drivers can include VFP co-processor, interrupt controller
150 * and its PM extensions, local CPU timers context save/restore which
151 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
152 *
153 * Must be called with interrupts disabled.
154 *
155 * Return conditions are same as __raw_notifier_call_chain.
156 */
157int cpu_cluster_pm_enter(void)
158{
159 int nr_calls;
160 int ret = 0;
161
162 read_lock(&cpu_pm_notifier_lock);
163 ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
164 if (ret)
165 /*
166 * Inform listeners (nr_calls - 1) about failure of CPU cluster
167 * PM entry who are notified earlier to prepare for it.
168 */
169 cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
170 read_unlock(&cpu_pm_notifier_lock);
171
172 return ret;
173}
174EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
175
176/**
177 * cpu_cluster_pm_exit - CPU cluster low power exit notifier
178 *
 179 * Notifies listeners that all cpus in a power domain are exiting from a
180 * low power state that may have caused some blocks in the same power domain
181 * to reset.
182 *
 183 * Must be called after cpu_cluster_pm_enter has been called for the power
 184 * domain, and before cpu_pm_exit has been called on any cpu in the power
185 * domain. Notified drivers can include VFP co-processor, interrupt controller
186 * and its PM extensions, local CPU timers context save/restore which
187 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
188 *
189 * Return conditions are same as __raw_notifier_call_chain.
190 */
191int cpu_cluster_pm_exit(void)
192{
193 int ret;
194
195 read_lock(&cpu_pm_notifier_lock);
196 ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
197 read_unlock(&cpu_pm_notifier_lock);
198
199 return ret;
200}
201EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
202
203#ifdef CONFIG_PM
204static int cpu_pm_suspend(void)
205{
206 int ret;
207
208 ret = cpu_pm_enter();
209 if (ret)
210 return ret;
211
212 ret = cpu_cluster_pm_enter();
213 return ret;
214}
215
216static void cpu_pm_resume(void)
217{
218 cpu_cluster_pm_exit();
219 cpu_pm_exit();
220}
221
222static struct syscore_ops cpu_pm_syscore_ops = {
223 .suspend = cpu_pm_suspend,
224 .resume = cpu_pm_resume,
225};
226
227static int cpu_pm_init(void)
228{
229 register_syscore_ops(&cpu_pm_syscore_ops);
230 return 0;
231}
232core_initcall(cpu_pm_init);
233#endif
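
The removed cpu_pm.c documents the notifier API that platform drivers hook to save and restore hardware state across CPU low-power transitions. A sketch of such a client, using the CPU_PM_ENTER/CPU_PM_EXIT/CPU_PM_ENTER_FAILED events named in the comments above; the demo_* names are placeholders, not real kernel functions.

/*
 * Hedged sketch of a cpu_pm client of the kind the removed file describes.
 * The save/restore calls are placeholders; interrupts are disabled while
 * the notifier runs, per the documentation above.
 */
#include <linux/cpu_pm.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int demo_cpu_pm_notify(struct notifier_block *nb, unsigned long action,
			      void *data)
{
	switch (action) {
	case CPU_PM_ENTER:
		/* save context the CPU's power domain may lose */
		/* demo_save_state(); */
		break;
	case CPU_PM_EXIT:
	case CPU_PM_ENTER_FAILED:
		/* restore context after an (attempted) low-power exit */
		/* demo_restore_state(); */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block demo_cpu_pm_nb = {
	.notifier_call = demo_cpu_pm_notify,
};

static int __init demo_cpu_pm_init(void)
{
	return cpu_pm_register_notifier(&demo_cpu_pm_nb);
}
core_initcall(demo_cpu_pm_init);
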
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63eea6eb..10131fdaff7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -37,7 +37,7 @@
37#include <linux/mempolicy.h> 37#include <linux/mempolicy.h>
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/memory.h> 39#include <linux/memory.h>
40#include <linux/export.h> 40#include <linux/module.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/namei.h> 42#include <linux/namei.h>
43#include <linux/pagemap.h> 43#include <linux/pagemap.h>
@@ -123,19 +123,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
123 struct cpuset, css); 123 struct cpuset, css);
124} 124}
125 125
126#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task)
128{
129 return task->mempolicy;
130}
131#else
132static inline bool task_has_mempolicy(struct task_struct *task)
133{
134 return false;
135}
136#endif
137
138
139/* bits in struct cpuset flags field */ 126/* bits in struct cpuset flags field */
140typedef enum { 127typedef enum {
141 CS_CPU_EXCLUSIVE, 128 CS_CPU_EXCLUSIVE,
@@ -147,12 +134,6 @@ typedef enum {
147 CS_SPREAD_SLAB, 134 CS_SPREAD_SLAB,
148} cpuset_flagbits_t; 135} cpuset_flagbits_t;
149 136
150/* the type of hotplug event */
151enum hotplug_event {
152 CPUSET_CPU_OFFLINE,
153 CPUSET_MEM_OFFLINE,
154};
155
156/* convenient tests for these bits */ 137/* convenient tests for these bits */
157static inline int is_cpu_exclusive(const struct cpuset *cs) 138static inline int is_cpu_exclusive(const struct cpuset *cs)
158{ 139{
@@ -276,11 +257,11 @@ static struct file_system_type cpuset_fs_type = {
276 * are online. If none are online, walk up the cpuset hierarchy 257 * are online. If none are online, walk up the cpuset hierarchy
277 * until we find one that does have some online cpus. If we get 258 * until we find one that does have some online cpus. If we get
278 * all the way to the top and still haven't found any online cpus, 259 * all the way to the top and still haven't found any online cpus,
279 * return cpu_online_mask. Or if passed a NULL cs from an exit'ing 260 * return cpu_online_map. Or if passed a NULL cs from an exit'ing
280 * task, return cpu_online_mask. 261 * task, return cpu_online_map.
281 * 262 *
282 * One way or another, we guarantee to return some non-empty subset 263 * One way or another, we guarantee to return some non-empty subset
283 * of cpu_online_mask. 264 * of cpu_online_map.
284 * 265 *
285 * Call with callback_mutex held. 266 * Call with callback_mutex held.
286 */ 267 */
@@ -302,10 +283,10 @@ static void guarantee_online_cpus(const struct cpuset *cs,
302 * are online, with memory. If none are online with memory, walk 283 * are online, with memory. If none are online with memory, walk
303 * up the cpuset hierarchy until we find one that does have some 284 * up the cpuset hierarchy until we find one that does have some
304 * online mems. If we get all the way to the top and still haven't 285 * online mems. If we get all the way to the top and still haven't
305 * found any online mems, return node_states[N_MEMORY]. 286 * found any online mems, return node_states[N_HIGH_MEMORY].
306 * 287 *
307 * One way or another, we guarantee to return some non-empty subset 288 * One way or another, we guarantee to return some non-empty subset
308 * of node_states[N_MEMORY]. 289 * of node_states[N_HIGH_MEMORY].
309 * 290 *
310 * Call with callback_mutex held. 291 * Call with callback_mutex held.
311 */ 292 */
@@ -313,14 +294,14 @@ static void guarantee_online_cpus(const struct cpuset *cs,
313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 294static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 295{
315 while (cs && !nodes_intersects(cs->mems_allowed, 296 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_MEMORY])) 297 node_states[N_HIGH_MEMORY]))
317 cs = cs->parent; 298 cs = cs->parent;
318 if (cs) 299 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 300 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_MEMORY]); 301 node_states[N_HIGH_MEMORY]);
321 else 302 else
322 *pmask = node_states[N_MEMORY]; 303 *pmask = node_states[N_HIGH_MEMORY];
323 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); 304 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
324} 305}
325 306
326/* 307/*
@@ -873,7 +854,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
873 int retval; 854 int retval;
874 int is_load_balanced; 855 int is_load_balanced;
875 856
876 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ 857 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
877 if (cs == &top_cpuset) 858 if (cs == &top_cpuset)
878 return -EACCES; 859 return -EACCES;
879 860
@@ -968,8 +949,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
968static void cpuset_change_task_nodemask(struct task_struct *tsk, 949static void cpuset_change_task_nodemask(struct task_struct *tsk,
969 nodemask_t *newmems) 950 nodemask_t *newmems)
970{ 951{
971 bool need_loop; 952repeat:
972
973 /* 953 /*
974 * Allow tasks that have access to memory reserves because they have 954 * Allow tasks that have access to memory reserves because they have
975 * been OOM killed to get memory anywhere. 955 * been OOM killed to get memory anywhere.
@@ -980,27 +960,46 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
980 return; 960 return;
981 961
982 task_lock(tsk); 962 task_lock(tsk);
963 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
964 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
965
966
983 /* 967 /*
984 * Determine if a loop is necessary if another thread is doing 968 * ensure checking ->mems_allowed_change_disable after setting all new
985 * get_mems_allowed(). If at least one node remains unchanged and 969 * allowed nodes.
986 * tsk does not have a mempolicy, then an empty nodemask will not be 970 *
987 * possible when mems_allowed is larger than a word. 971 * the read-side task can see an nodemask with new allowed nodes and
972 * old allowed nodes. and if it allocates page when cpuset clears newly
973 * disallowed ones continuous, it can see the new allowed bits.
974 *
975 * And if setting all new allowed nodes is after the checking, setting
976 * all new allowed nodes and clearing newly disallowed ones will be done
977 * continuous, and the read-side task may find no node to alloc page.
988 */ 978 */
989 need_loop = task_has_mempolicy(tsk) || 979 smp_mb();
990 !nodes_intersects(*newmems, tsk->mems_allowed);
991 980
992 if (need_loop) 981 /*
993 write_seqcount_begin(&tsk->mems_allowed_seq); 982 * Allocation of memory is very fast, we needn't sleep when waiting
983 * for the read-side.
984 */
985 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
986 task_unlock(tsk);
987 if (!task_curr(tsk))
988 yield();
989 goto repeat;
990 }
994 991
995 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 992 /*
996 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 993 * ensure checking ->mems_allowed_change_disable before clearing all new
994 * disallowed nodes.
995 *
996 * if clearing newly disallowed bits before the checking, the read-side
997 * task may find no node to alloc page.
998 */
999 smp_mb();
997 1000
998 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); 1001 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
999 tsk->mems_allowed = *newmems; 1002 tsk->mems_allowed = *newmems;
1000
1001 if (need_loop)
1002 write_seqcount_end(&tsk->mems_allowed_seq);
1003
1004 task_unlock(tsk); 1003 task_unlock(tsk);
1005} 1004}
1006 1005
@@ -1100,7 +1099,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1100 return -ENOMEM; 1099 return -ENOMEM;
1101 1100
1102 /* 1101 /*
1103 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1102 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1104 * it's read-only 1103 * it's read-only
1105 */ 1104 */
1106 if (cs == &top_cpuset) { 1105 if (cs == &top_cpuset) {
@@ -1122,7 +1121,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1121 goto done;
1123 1122
1124 if (!nodes_subset(trialcs->mems_allowed, 1123 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_MEMORY])) { 1124 node_states[N_HIGH_MEMORY])) {
1126 retval = -EINVAL; 1125 retval = -EINVAL;
1127 goto done; 1126 goto done;
1128 } 1127 }
@@ -1368,71 +1367,79 @@ static int fmeter_getrate(struct fmeter *fmp)
1368 return val; 1367 return val;
1369} 1368}
1370 1369
1370/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1371static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1372 struct task_struct *tsk)
1373{
1374 struct cpuset *cs = cgroup_cs(cont);
1375
1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1377 return -ENOSPC;
1378
1379 /*
1380 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1381 * cannot change their cpu affinity and isolating such threads by their
1382 * set of allowed nodes is unnecessary. Thus, cpusets are not
1383 * applicable for such threads. This prevents checking for success of
1384 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
1385 * be changed.
1386 */
1387 if (tsk->flags & PF_THREAD_BOUND)
1388 return -EINVAL;
1389
1390 return 0;
1391}
1392
1393static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1394{
1395 return security_task_setscheduler(task);
1396}
1397
1371/* 1398/*
1372 * Protected by cgroup_lock. The nodemasks must be stored globally because 1399 * Protected by cgroup_lock. The nodemasks must be stored globally because
1373 * dynamically allocating them is not allowed in can_attach, and they must 1400 * dynamically allocating them is not allowed in pre_attach, and they must
1374 * persist until attach. 1401 * persist among pre_attach, attach_task, and attach.
1375 */ 1402 */
1376static cpumask_var_t cpus_attach; 1403static cpumask_var_t cpus_attach;
1377static nodemask_t cpuset_attach_nodemask_from; 1404static nodemask_t cpuset_attach_nodemask_from;
1378static nodemask_t cpuset_attach_nodemask_to; 1405static nodemask_t cpuset_attach_nodemask_to;
1379 1406
1380/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1407/* Set-up work for before attaching each task. */
1381static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1408static void cpuset_pre_attach(struct cgroup *cont)
1382{ 1409{
1383 struct cpuset *cs = cgroup_cs(cgrp); 1410 struct cpuset *cs = cgroup_cs(cont);
1384 struct task_struct *task;
1385 int ret;
1386
1387 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1388 return -ENOSPC;
1389
1390 cgroup_taskset_for_each(task, cgrp, tset) {
1391 /*
1392 * Kthreads bound to specific cpus cannot be moved to a new
1393 * cpuset; we cannot change their cpu affinity and
1394 * isolating such threads by their set of allowed nodes is
1395 * unnecessary. Thus, cpusets are not applicable for such
1396 * threads. This prevents checking for success of
1397 * set_cpus_allowed_ptr() on all attached tasks before
1398 * cpus_allowed may be changed.
1399 */
1400 if (task->flags & PF_THREAD_BOUND)
1401 return -EINVAL;
1402 if ((ret = security_task_setscheduler(task)))
1403 return ret;
1404 }
1405 1411
1406 /* prepare for attach */
1407 if (cs == &top_cpuset) 1412 if (cs == &top_cpuset)
1408 cpumask_copy(cpus_attach, cpu_possible_mask); 1413 cpumask_copy(cpus_attach, cpu_possible_mask);
1409 else 1414 else
1410 guarantee_online_cpus(cs, cpus_attach); 1415 guarantee_online_cpus(cs, cpus_attach);
1411 1416
1412 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1417 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1413
1414 return 0;
1415} 1418}
1416 1419
1417static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1420/* Per-thread attachment work. */
1421static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1418{ 1422{
1419 struct mm_struct *mm; 1423 int err;
1420 struct task_struct *task; 1424 struct cpuset *cs = cgroup_cs(cont);
1421 struct task_struct *leader = cgroup_taskset_first(tset);
1422 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1423 struct cpuset *cs = cgroup_cs(cgrp);
1424 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1425 1425
1426 cgroup_taskset_for_each(task, cgrp, tset) { 1426 /*
1427 /* 1427 * can_attach beforehand should guarantee that this doesn't fail.
1428 * can_attach beforehand should guarantee that this doesn't 1428 * TODO: have a better way to handle failure here
1429 * fail. TODO: have a better way to handle failure here 1429 */
1430 */ 1430 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1431 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); 1431 WARN_ON_ONCE(err);
1432 1432
1433 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); 1433 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1434 cpuset_update_task_spread_flag(cs, task); 1434 cpuset_update_task_spread_flag(cs, tsk);
1435 } 1435}
1436
1437static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1438 struct cgroup *oldcont, struct task_struct *tsk)
1439{
1440 struct mm_struct *mm;
1441 struct cpuset *cs = cgroup_cs(cont);
1442 struct cpuset *oldcs = cgroup_cs(oldcont);
1436 1443
1437 /* 1444 /*
1438 * Change mm, possibly for multiple threads in a threadgroup. This is 1445 * Change mm, possibly for multiple threads in a threadgroup. This is
@@ -1440,7 +1447,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1440 */ 1447 */
1441 cpuset_attach_nodemask_from = oldcs->mems_allowed; 1448 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1442 cpuset_attach_nodemask_to = cs->mems_allowed; 1449 cpuset_attach_nodemask_to = cs->mems_allowed;
1443 mm = get_task_mm(leader); 1450 mm = get_task_mm(tsk);
1444 if (mm) { 1451 if (mm) {
1445 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1452 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1446 if (is_memory_migrate(cs)) 1453 if (is_memory_migrate(cs))
@@ -1771,33 +1778,84 @@ static struct cftype files[] = {
1771 .write_u64 = cpuset_write_u64, 1778 .write_u64 = cpuset_write_u64,
1772 .private = FILE_SPREAD_SLAB, 1779 .private = FILE_SPREAD_SLAB,
1773 }, 1780 },
1781};
1774 1782
1775 { 1783static struct cftype cft_memory_pressure_enabled = {
1776 .name = "memory_pressure_enabled", 1784 .name = "memory_pressure_enabled",
1777 .flags = CFTYPE_ONLY_ON_ROOT, 1785 .read_u64 = cpuset_read_u64,
1778 .read_u64 = cpuset_read_u64, 1786 .write_u64 = cpuset_write_u64,
1779 .write_u64 = cpuset_write_u64, 1787 .private = FILE_MEMORY_PRESSURE_ENABLED,
1780 .private = FILE_MEMORY_PRESSURE_ENABLED,
1781 },
1782
1783 { } /* terminate */
1784}; 1788};
1785 1789
1790static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1791{
1792 int err;
1793
1794 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
1795 if (err)
1796 return err;
1797 /* memory_pressure_enabled is in root cpuset only */
1798 if (!cont->parent)
1799 err = cgroup_add_file(cont, ss,
1800 &cft_memory_pressure_enabled);
1801 return err;
1802}
1803
1786/* 1804/*
1787 * cpuset_css_alloc - allocate a cpuset css 1805 * post_clone() is called during cgroup_create() when the
1806 * clone_children mount argument was specified. The cgroup
1807 * can not yet have any tasks.
1808 *
1809 * Currently we refuse to set up the cgroup - thereby
1810 * refusing the task to be entered, and as a result refusing
1811 * the sys_unshare() or clone() which initiated it - if any
1812 * sibling cpusets have exclusive cpus or mem.
1813 *
1814 * If this becomes a problem for some users who wish to
1815 * allow that scenario, then cpuset_post_clone() could be
1816 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1817 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1818 * held.
1819 */
1820static void cpuset_post_clone(struct cgroup_subsys *ss,
1821 struct cgroup *cgroup)
1822{
1823 struct cgroup *parent, *child;
1824 struct cpuset *cs, *parent_cs;
1825
1826 parent = cgroup->parent;
1827 list_for_each_entry(child, &parent->children, sibling) {
1828 cs = cgroup_cs(child);
1829 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1830 return;
1831 }
1832 cs = cgroup_cs(cgroup);
1833 parent_cs = cgroup_cs(parent);
1834
1835 mutex_lock(&callback_mutex);
1836 cs->mems_allowed = parent_cs->mems_allowed;
1837 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1838 mutex_unlock(&callback_mutex);
1839 return;
1840}
1841
1842/*
1843 * cpuset_create - create a cpuset
1844 * ss: cpuset cgroup subsystem
1788 * cont: control group that the new cpuset will be part of 1845 * cont: control group that the new cpuset will be part of
1789 */ 1846 */
1790 1847
1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) 1848static struct cgroup_subsys_state *cpuset_create(
1849 struct cgroup_subsys *ss,
1850 struct cgroup *cont)
1792{ 1851{
1793 struct cgroup *parent_cg = cont->parent; 1852 struct cpuset *cs;
1794 struct cgroup *tmp_cg; 1853 struct cpuset *parent;
1795 struct cpuset *parent, *cs;
1796 1854
1797 if (!parent_cg) 1855 if (!cont->parent) {
1798 return &top_cpuset.css; 1856 return &top_cpuset.css;
1799 parent = cgroup_cs(parent_cg); 1857 }
1800 1858 parent = cgroup_cs(cont->parent);
1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1859 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1802 if (!cs) 1860 if (!cs)
1803 return ERR_PTR(-ENOMEM); 1861 return ERR_PTR(-ENOMEM);
@@ -1819,36 +1877,7 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1819 1877
1820 cs->parent = parent; 1878 cs->parent = parent;
1821 number_of_cpusets++; 1879 number_of_cpusets++;
1822 1880 return &cs->css ;
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
1824 goto skip_clone;
1825
1826 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
1828 * set. This flag handling is implemented in cgroup core for
1829	 * historical reasons - the flag may be specified during mount.
1830 *
1831 * Currently, if any sibling cpusets have exclusive cpus or mem, we
1832 * refuse to clone the configuration - thereby refusing the task to
1833 * be entered, and as a result refusing the sys_unshare() or
1834 * clone() which initiated it. If this becomes a problem for some
1835 * users who wish to allow that scenario, then this could be
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup.
1838 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
1841
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
1843 goto skip_clone;
1844 }
1845
1846 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex);
1850skip_clone:
1851 return &cs->css;
1852} 1881}
1853 1882
1854/* 1883/*
@@ -1857,7 +1886,7 @@ skip_clone:
1857 * will call async_rebuild_sched_domains(). 1886 * will call async_rebuild_sched_domains().
1858 */ 1887 */
1859 1888
1860static void cpuset_css_free(struct cgroup *cont) 1889static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1861{ 1890{
1862 struct cpuset *cs = cgroup_cs(cont); 1891 struct cpuset *cs = cgroup_cs(cont);
1863 1892
@@ -1871,12 +1900,16 @@ static void cpuset_css_free(struct cgroup *cont)
1871 1900
1872struct cgroup_subsys cpuset_subsys = { 1901struct cgroup_subsys cpuset_subsys = {
1873 .name = "cpuset", 1902 .name = "cpuset",
1874 .css_alloc = cpuset_css_alloc, 1903 .create = cpuset_create,
1875 .css_free = cpuset_css_free, 1904 .destroy = cpuset_destroy,
1876 .can_attach = cpuset_can_attach, 1905 .can_attach = cpuset_can_attach,
1906 .can_attach_task = cpuset_can_attach_task,
1907 .pre_attach = cpuset_pre_attach,
1908 .attach_task = cpuset_attach_task,
1877 .attach = cpuset_attach, 1909 .attach = cpuset_attach,
1910 .populate = cpuset_populate,
1911 .post_clone = cpuset_post_clone,
1878 .subsys_id = cpuset_subsys_id, 1912 .subsys_id = cpuset_subsys_id,
1879 .base_cftypes = files,
1880 .early_init = 1, 1913 .early_init = 1,
1881}; 1914};
1882 1915
@@ -1988,36 +2021,8 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1988} 2021}
1989 2022
1990/* 2023/*
1991 * Helper function to traverse cpusets. 2024 * Walk the specified cpuset subtree and look for empty cpusets.
1992 * It can be used to walk the cpuset tree from top to bottom, completing 2025 * The tasks of such cpuset must be moved to a parent cpuset.
1993 * one layer before dropping down to the next (thus always processing a
1994 * node before any of its children).
1995 */
1996static struct cpuset *cpuset_next(struct list_head *queue)
1997{
1998 struct cpuset *cp;
1999 struct cpuset *child; /* scans child cpusets of cp */
2000 struct cgroup *cont;
2001
2002 if (list_empty(queue))
2003 return NULL;
2004
2005 cp = list_first_entry(queue, struct cpuset, stack_list);
2006 list_del(queue->next);
2007 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2008 child = cgroup_cs(cont);
2009 list_add_tail(&child->stack_list, queue);
2010 }
2011
2012 return cp;
2013}
2014
2015
2016/*
2017 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
2018 * online/offline) and update the cpusets accordingly.
2019 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
2020 * cpuset must be moved to a parent cpuset.
2021 * 2026 *
2022 * Called with cgroup_mutex held. We take callback_mutex to modify 2027 * Called with cgroup_mutex held. We take callback_mutex to modify
2023 * cpus_allowed and mems_allowed. 2028 * cpus_allowed and mems_allowed.
@@ -2026,61 +2031,50 @@ static struct cpuset *cpuset_next(struct list_head *queue)
2026 * before dropping down to the next. It always processes a node before 2031 * before dropping down to the next. It always processes a node before
2027 * any of its children. 2032 * any of its children.
2028 * 2033 *
2029 * In the case of memory hot-unplug, it will remove nodes from N_MEMORY 2034 * For now, since we lack memory hot unplug, we'll never see a cpuset
2030 * if all present pages from a node are offlined. 2035 * that has tasks along with an empty 'mems'. But if we did see such
2036 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
2031 */ 2037 */
2032static void 2038static void scan_for_empty_cpusets(struct cpuset *root)
2033scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2034{ 2039{
2035 LIST_HEAD(queue); 2040 LIST_HEAD(queue);
2036 struct cpuset *cp; /* scans cpusets being updated */ 2041 struct cpuset *cp; /* scans cpusets being updated */
2042 struct cpuset *child; /* scans child cpusets of cp */
2043 struct cgroup *cont;
2037 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2044 static nodemask_t oldmems; /* protected by cgroup_mutex */
2038 2045
2039 list_add_tail((struct list_head *)&root->stack_list, &queue); 2046 list_add_tail((struct list_head *)&root->stack_list, &queue);
2040 2047
2041 switch (event) { 2048 while (!list_empty(&queue)) {
2042 case CPUSET_CPU_OFFLINE: 2049 cp = list_first_entry(&queue, struct cpuset, stack_list);
2043 while ((cp = cpuset_next(&queue)) != NULL) { 2050 list_del(queue.next);
2044 2051 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2045 /* Continue past cpusets with all cpus online */ 2052 child = cgroup_cs(cont);
2046 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) 2053 list_add_tail(&child->stack_list, &queue);
2047 continue;
2048
2049 /* Remove offline cpus from this cpuset. */
2050 mutex_lock(&callback_mutex);
2051 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2052 cpu_active_mask);
2053 mutex_unlock(&callback_mutex);
2054
2055 /* Move tasks from the empty cpuset to a parent */
2056 if (cpumask_empty(cp->cpus_allowed))
2057 remove_tasks_in_empty_cpuset(cp);
2058 else
2059 update_tasks_cpumask(cp, NULL);
2060 } 2054 }
2061 break;
2062 2055
2063 case CPUSET_MEM_OFFLINE: 2056 /* Continue past cpusets with all cpus, mems online */
2064 while ((cp = cpuset_next(&queue)) != NULL) { 2057 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2065 2058 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2066 /* Continue past cpusets with all mems online */ 2059 continue;
2067 if (nodes_subset(cp->mems_allowed,
2068 node_states[N_MEMORY]))
2069 continue;
2070 2060
2071 oldmems = cp->mems_allowed; 2061 oldmems = cp->mems_allowed;
2072 2062
2073 /* Remove offline mems from this cpuset. */ 2063 /* Remove offline cpus and mems from this cpuset. */
2074 mutex_lock(&callback_mutex); 2064 mutex_lock(&callback_mutex);
2075 nodes_and(cp->mems_allowed, cp->mems_allowed, 2065 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2076 node_states[N_MEMORY]); 2066 cpu_active_mask);
2077 mutex_unlock(&callback_mutex); 2067 nodes_and(cp->mems_allowed, cp->mems_allowed,
2068 node_states[N_HIGH_MEMORY]);
2069 mutex_unlock(&callback_mutex);
2078 2070
2079 /* Move tasks from the empty cpuset to a parent */ 2071 /* Move tasks from the empty cpuset to a parent */
2080 if (nodes_empty(cp->mems_allowed)) 2072 if (cpumask_empty(cp->cpus_allowed) ||
2081 remove_tasks_in_empty_cpuset(cp); 2073 nodes_empty(cp->mems_allowed))
2082 else 2074 remove_tasks_in_empty_cpuset(cp);
2083 update_tasks_nodemask(cp, &oldmems, NULL); 2075 else {
2076 update_tasks_cpumask(cp, NULL);
2077 update_tasks_nodemask(cp, &oldmems, NULL);
2084 } 2078 }
2085 } 2079 }
2086} 2080}
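
Both the removed cpuset_next() helper and the restored scan_for_empty_cpusets() walk the cpuset hierarchy top-down with a simple queue, so a parent is always processed before any of its children. A minimal userspace model of that traversal, with an invented node type and a small fixed-size queue that is only big enough for the example:

#include <stdio.h>

struct node {
        const char *name;
        struct node *child[4];
        int nr_child;
};

static void walk_top_down(struct node *root)
{
        struct node *queue[32];         /* fixed queue, plenty for this example */
        int head = 0, tail = 0;

        queue[tail++] = root;
        while (head < tail) {
                struct node *n = queue[head++];

                printf("visiting %s\n", n->name);       /* parent handled first */
                for (int i = 0; i < n->nr_child; i++)
                        queue[tail++] = n->child[i];    /* children go to the back */
        }
}

int main(void)
{
        struct node b = { "child_b", { 0 }, 0 };
        struct node a = { "child_a", { 0 }, 0 };
        struct node top = { "top_cpuset", { &a, &b }, 2 };

        walk_top_down(&top);
        return 0;
}
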
@@ -2091,19 +2085,13 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2091 * (of no effect) on systems that are actively using CPU hotplug	 2085 * (of no effect) on systems that are actively using CPU hotplug
2092 * but making no active use of cpusets. 2086 * but making no active use of cpusets.
2093 * 2087 *
2094 * The only exception to this is suspend/resume, where we don't
2095 * modify cpusets at all.
2096 *
2097 * This routine ensures that top_cpuset.cpus_allowed tracks 2088 * This routine ensures that top_cpuset.cpus_allowed tracks
2098 * cpu_active_mask on each CPU hotplug (cpuhp) event. 2089 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2099 * 2090 *
2100 * Called within get_online_cpus(). Needs to call cgroup_lock() 2091 * Called within get_online_cpus(). Needs to call cgroup_lock()
2101 * before calling generate_sched_domains(). 2092 * before calling generate_sched_domains().
2102 *
2103 * @cpu_online: Indicates whether this is a CPU online event (true) or
2104 * a CPU offline event (false).
2105 */ 2093 */
2106void cpuset_update_active_cpus(bool cpu_online) 2094void cpuset_update_active_cpus(void)
2107{ 2095{
2108 struct sched_domain_attr *attr; 2096 struct sched_domain_attr *attr;
2109 cpumask_var_t *doms; 2097 cpumask_var_t *doms;
@@ -2113,10 +2101,7 @@ void cpuset_update_active_cpus(bool cpu_online)
2113 mutex_lock(&callback_mutex); 2101 mutex_lock(&callback_mutex);
2114 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2102 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2115 mutex_unlock(&callback_mutex); 2103 mutex_unlock(&callback_mutex);
2116 2104 scan_for_empty_cpusets(&top_cpuset);
2117 if (!cpu_online)
2118 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
2119
2120 ndoms = generate_sched_domains(&doms, &attr); 2105 ndoms = generate_sched_domains(&doms, &attr);
2121 cgroup_unlock(); 2106 cgroup_unlock();
2122 2107
@@ -2126,9 +2111,9 @@ void cpuset_update_active_cpus(bool cpu_online)
2126 2111
2127#ifdef CONFIG_MEMORY_HOTPLUG 2112#ifdef CONFIG_MEMORY_HOTPLUG
2128/* 2113/*
2129 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. 2114 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
2130 * Call this routine anytime after node_states[N_MEMORY] changes. 2115 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
2131 * See cpuset_update_active_cpus() for CPU hotplug handling. 2116 * See also the previous routine cpuset_track_online_cpus().
2132 */ 2117 */
2133static int cpuset_track_online_nodes(struct notifier_block *self, 2118static int cpuset_track_online_nodes(struct notifier_block *self,
2134 unsigned long action, void *arg) 2119 unsigned long action, void *arg)
@@ -2140,16 +2125,16 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2140 case MEM_ONLINE: 2125 case MEM_ONLINE:
2141 oldmems = top_cpuset.mems_allowed; 2126 oldmems = top_cpuset.mems_allowed;
2142 mutex_lock(&callback_mutex); 2127 mutex_lock(&callback_mutex);
2143 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2128 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2144 mutex_unlock(&callback_mutex); 2129 mutex_unlock(&callback_mutex);
2145 update_tasks_nodemask(&top_cpuset, &oldmems, NULL); 2130 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2146 break; 2131 break;
2147 case MEM_OFFLINE: 2132 case MEM_OFFLINE:
2148 /* 2133 /*
2149 * needn't update top_cpuset.mems_allowed explicitly because 2134 * needn't update top_cpuset.mems_allowed explicitly because
2150 * scan_cpusets_upon_hotplug() will update it. 2135 * scan_for_empty_cpusets() will update it.
2151 */ 2136 */
2152 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); 2137 scan_for_empty_cpusets(&top_cpuset);
2153 break; 2138 break;
2154 default: 2139 default:
2155 break; 2140 break;
@@ -2169,7 +2154,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2169void __init cpuset_init_smp(void) 2154void __init cpuset_init_smp(void)
2170{ 2155{
2171 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2156 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2172 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2157 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2173 2158
2174 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2159 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2175 2160
@@ -2184,7 +2169,7 @@ void __init cpuset_init_smp(void)
2184 * 2169 *
2185 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset 2170 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
2186 * attached to the specified @tsk. Guaranteed to return some non-empty 2171 * attached to the specified @tsk. Guaranteed to return some non-empty
2187 * subset of cpu_online_mask, even if this means going outside the 2172 * subset of cpu_online_map, even if this means going outside the
2188 * tasks cpuset. 2173 * tasks cpuset.
2189 **/ 2174 **/
2190 2175
@@ -2197,9 +2182,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2197 mutex_unlock(&callback_mutex); 2182 mutex_unlock(&callback_mutex);
2198} 2183}
2199 2184
2200void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2185int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2201{ 2186{
2202 const struct cpuset *cs; 2187 const struct cpuset *cs;
2188 int cpu;
2203 2189
2204 rcu_read_lock(); 2190 rcu_read_lock();
2205 cs = task_cs(tsk); 2191 cs = task_cs(tsk);
@@ -2220,10 +2206,22 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2220 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary 2206 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
2221 * set any mask even if it is not right from task_cs() pov, 2207 * set any mask even if it is not right from task_cs() pov,
2222 * the pending set_cpus_allowed_ptr() will fix things. 2208 * the pending set_cpus_allowed_ptr() will fix things.
2223 *
2224 * select_fallback_rq() will fix things ups and set cpu_possible_mask
2225 * if required.
2226 */ 2209 */
2210
2211 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2212 if (cpu >= nr_cpu_ids) {
2213 /*
2214 * Either tsk->cpus_allowed is wrong (see above) or it
2215 * is actually empty. The latter case is only possible
2216 * if we are racing with remove_tasks_in_empty_cpuset().
2217 * Like above we can temporary set any mask and rely on
2218 * set_cpus_allowed_ptr() as synchronization point.
2219 */
2220 do_set_cpus_allowed(tsk, cpu_possible_mask);
2221 cpu = cpumask_any(cpu_active_mask);
2222 }
2223
2224 return cpu;
2227} 2225}
2228 2226
2229void cpuset_init_current_mems_allowed(void) 2227void cpuset_init_current_mems_allowed(void)
@@ -2237,7 +2235,7 @@ void cpuset_init_current_mems_allowed(void)
2237 * 2235 *
2238 * Description: Returns the nodemask_t mems_allowed of the cpuset 2236 * Description: Returns the nodemask_t mems_allowed of the cpuset
2239 * attached to the specified @tsk. Guaranteed to return some non-empty 2237 * attached to the specified @tsk. Guaranteed to return some non-empty
2240 * subset of node_states[N_MEMORY], even if this means going outside the 2238 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
2241 * tasks cpuset. 2239 * tasks cpuset.
2242 **/ 2240 **/
2243 2241
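
The last cpuset.c hunk above turns cpuset_cpus_allowed_fallback() into a function that returns a CPU: intersect the task's mask with the active CPUs, and only if that intersection is empty widen the mask and pick any active CPU. A hedged userspace sketch of just that selection logic, with plain bitmasks and ffs() standing in for the cpumask helpers:

#include <stdio.h>
#include <strings.h>

static int pick_fallback_cpu(unsigned int *task_mask,
                             unsigned int active_mask,
                             unsigned int possible_mask)
{
        int cpu = ffs(*task_mask & active_mask);        /* 1-based, 0 if empty */

        if (!cpu) {
                /* Stale or empty mask: widen it, then pick any active CPU. */
                *task_mask = possible_mask;
                cpu = ffs(active_mask);
        }
        return cpu - 1;                                 /* back to 0-based */
}

int main(void)
{
        /* CPUs 2-3 allowed but only CPUs 0-1 online: expect fallback to CPU 0. */
        unsigned int mask = 0xc;

        printf("fallback cpu = %d\n", pick_fallback_cpu(&mask, 0x3, 0xf));
        printf("task mask widened to 0x%x\n", mask);
        return 0;
}
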
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index c766ee54c0b..5f85690285d 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -2,7 +2,7 @@
2#include <linux/crash_dump.h> 2#include <linux/crash_dump.h>
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/errno.h> 4#include <linux/errno.h>
5#include <linux/export.h> 5#include <linux/module.h>
6 6
7/* 7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need 8 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -20,15 +20,8 @@ unsigned long saved_max_pfn;
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21 21
22/* 22/*
23 * stores the size of elf header of crash image
24 */
25unsigned long long elfcorehdr_size;
26
27/*
28 * elfcorehdr= specifies the location of elf core header stored by the crashed 23 * elfcorehdr= specifies the location of elf core header stored by the crashed
29 * kernel. This option will be passed by kexec loader to the capture kernel. 24 * kernel. This option will be passed by kexec loader to the capture kernel.
30 *
31 * Syntax: elfcorehdr=[size[KMG]@]offset[KMG]
32 */ 25 */
33static int __init setup_elfcorehdr(char *arg) 26static int __init setup_elfcorehdr(char *arg)
34{ 27{
@@ -36,10 +29,6 @@ static int __init setup_elfcorehdr(char *arg)
36 if (!arg) 29 if (!arg)
37 return -EINVAL; 30 return -EINVAL;
38 elfcorehdr_addr = memparse(arg, &end); 31 elfcorehdr_addr = memparse(arg, &end);
39 if (*end == '@') {
40 elfcorehdr_size = elfcorehdr_addr;
41 elfcorehdr_addr = memparse(end + 1, &end);
42 }
43 return end > arg ? 0 : -EINVAL; 32 return end > arg ? 0 : -EINVAL;
44} 33}
45early_param("elfcorehdr", setup_elfcorehdr); 34early_param("elfcorehdr", setup_elfcorehdr);
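
The removed crash_dump.c lines implemented the size@offset form of the boot parameter, i.e. elfcorehdr=[size[KMG]@]offset[KMG], on top of memparse(). The sketch below is a standalone approximation of that parsing; parse_size() is an invented stand-in for memparse(), not the kernel function:

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *s, char **end)
{
        unsigned long long v = strtoull(s, end, 0);

        switch (**end) {                /* honour K/M/G suffixes, like memparse() */
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10; (*end)++; break;
        }
        return v;
}

int main(void)
{
        const char *arg = "64K@0x2000000";      /* elfcorehdr=[size[KMG]@]offset[KMG] */
        char *end;
        unsigned long long size = 0, addr;

        addr = parse_size(arg, &end);
        if (*end == '@') {                      /* first number was really the size */
                size = addr;
                addr = parse_size(end + 1, &end);
        }
        printf("elf core header: addr=0x%llx size=%llu\n", addr, size);
        return 0;
}

With the example argument this prints addr=0x2000000 size=65536, which is the split the removed branch performed before handing elfcorehdr_addr and elfcorehdr_size to the capture kernel.
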
diff --git a/kernel/cred.c b/kernel/cred.c
index e0573a43c7d..8ef31f53c44 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -8,7 +8,7 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11#include <linux/export.h> 11#include <linux/module.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
@@ -16,7 +16,6 @@
16#include <linux/keyctl.h> 16#include <linux/keyctl.h>
17#include <linux/init_task.h> 17#include <linux/init_task.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/binfmts.h>
20#include <linux/cn_proc.h> 19#include <linux/cn_proc.h>
21 20
22#if 0 21#if 0
@@ -30,6 +29,17 @@
30static struct kmem_cache *cred_jar; 29static struct kmem_cache *cred_jar;
31 30
32/* 31/*
32 * The common credentials for the initial task's thread group
33 */
34#ifdef CONFIG_KEYS
35static struct thread_group_cred init_tgcred = {
36 .usage = ATOMIC_INIT(2),
37 .tgid = 0,
38 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
39};
40#endif
41
42/*
33 * The initial credentials for the initial task 43 * The initial credentials for the initial task
34 */ 44 */
35struct cred init_cred = { 45struct cred init_cred = {
@@ -38,14 +48,6 @@ struct cred init_cred = {
38 .subscribers = ATOMIC_INIT(2), 48 .subscribers = ATOMIC_INIT(2),
39 .magic = CRED_MAGIC, 49 .magic = CRED_MAGIC,
40#endif 50#endif
41 .uid = GLOBAL_ROOT_UID,
42 .gid = GLOBAL_ROOT_GID,
43 .suid = GLOBAL_ROOT_UID,
44 .sgid = GLOBAL_ROOT_GID,
45 .euid = GLOBAL_ROOT_UID,
46 .egid = GLOBAL_ROOT_GID,
47 .fsuid = GLOBAL_ROOT_UID,
48 .fsgid = GLOBAL_ROOT_GID,
49 .securebits = SECUREBITS_DEFAULT, 51 .securebits = SECUREBITS_DEFAULT,
50 .cap_inheritable = CAP_EMPTY_SET, 52 .cap_inheritable = CAP_EMPTY_SET,
51 .cap_permitted = CAP_FULL_SET, 53 .cap_permitted = CAP_FULL_SET,
@@ -54,6 +56,9 @@ struct cred init_cred = {
54 .user = INIT_USER, 56 .user = INIT_USER,
55 .user_ns = &init_user_ns, 57 .user_ns = &init_user_ns,
56 .group_info = &init_groups, 58 .group_info = &init_groups,
59#ifdef CONFIG_KEYS
60 .tgcred = &init_tgcred,
61#endif
57}; 62};
58 63
59static inline void set_cred_subscribers(struct cred *cred, int n) 64static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -82,6 +87,36 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
82} 87}
83 88
84/* 89/*
90 * Dispose of the shared task group credentials
91 */
92#ifdef CONFIG_KEYS
93static void release_tgcred_rcu(struct rcu_head *rcu)
94{
95 struct thread_group_cred *tgcred =
96 container_of(rcu, struct thread_group_cred, rcu);
97
98 BUG_ON(atomic_read(&tgcred->usage) != 0);
99
100 key_put(tgcred->session_keyring);
101 key_put(tgcred->process_keyring);
102 kfree(tgcred);
103}
104#endif
105
106/*
107 * Release a set of thread group credentials.
108 */
109static void release_tgcred(struct cred *cred)
110{
111#ifdef CONFIG_KEYS
112 struct thread_group_cred *tgcred = cred->tgcred;
113
114 if (atomic_dec_and_test(&tgcred->usage))
115 call_rcu(&tgcred->rcu, release_tgcred_rcu);
116#endif
117}
118
119/*
85 * The RCU callback to actually dispose of a set of credentials 120 * The RCU callback to actually dispose of a set of credentials
86 */ 121 */
87static void put_cred_rcu(struct rcu_head *rcu) 122static void put_cred_rcu(struct rcu_head *rcu)
@@ -106,14 +141,12 @@ static void put_cred_rcu(struct rcu_head *rcu)
106#endif 141#endif
107 142
108 security_cred_free(cred); 143 security_cred_free(cred);
109 key_put(cred->session_keyring);
110 key_put(cred->process_keyring);
111 key_put(cred->thread_keyring); 144 key_put(cred->thread_keyring);
112 key_put(cred->request_key_auth); 145 key_put(cred->request_key_auth);
146 release_tgcred(cred);
113 if (cred->group_info) 147 if (cred->group_info)
114 put_group_info(cred->group_info); 148 put_group_info(cred->group_info);
115 free_uid(cred->user); 149 free_uid(cred->user);
116 put_user_ns(cred->user_ns);
117 kmem_cache_free(cred_jar, cred); 150 kmem_cache_free(cred_jar, cred);
118} 151}
119 152
@@ -164,6 +197,13 @@ void exit_creds(struct task_struct *tsk)
164 validate_creds(cred); 197 validate_creds(cred);
165 alter_cred_subscribers(cred, -1); 198 alter_cred_subscribers(cred, -1);
166 put_cred(cred); 199 put_cred(cred);
200
201 cred = (struct cred *) tsk->replacement_session_keyring;
202 if (cred) {
203 tsk->replacement_session_keyring = NULL;
204 validate_creds(cred);
205 put_cred(cred);
206 }
167} 207}
168 208
169/** 209/**
@@ -203,6 +243,15 @@ struct cred *cred_alloc_blank(void)
203 if (!new) 243 if (!new)
204 return NULL; 244 return NULL;
205 245
246#ifdef CONFIG_KEYS
247 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
248 if (!new->tgcred) {
249 kmem_cache_free(cred_jar, new);
250 return NULL;
251 }
252 atomic_set(&new->tgcred->usage, 1);
253#endif
254
206 atomic_set(&new->usage, 1); 255 atomic_set(&new->usage, 1);
207#ifdef CONFIG_DEBUG_CREDENTIALS 256#ifdef CONFIG_DEBUG_CREDENTIALS
208 new->magic = CRED_MAGIC; 257 new->magic = CRED_MAGIC;
@@ -253,13 +302,11 @@ struct cred *prepare_creds(void)
253 set_cred_subscribers(new, 0); 302 set_cred_subscribers(new, 0);
254 get_group_info(new->group_info); 303 get_group_info(new->group_info);
255 get_uid(new->user); 304 get_uid(new->user);
256 get_user_ns(new->user_ns);
257 305
258#ifdef CONFIG_KEYS 306#ifdef CONFIG_KEYS
259 key_get(new->session_keyring);
260 key_get(new->process_keyring);
261 key_get(new->thread_keyring); 307 key_get(new->thread_keyring);
262 key_get(new->request_key_auth); 308 key_get(new->request_key_auth);
309 atomic_inc(&new->tgcred->usage);
263#endif 310#endif
264 311
265#ifdef CONFIG_SECURITY 312#ifdef CONFIG_SECURITY
@@ -283,20 +330,39 @@ EXPORT_SYMBOL(prepare_creds);
283 */ 330 */
284struct cred *prepare_exec_creds(void) 331struct cred *prepare_exec_creds(void)
285{ 332{
333 struct thread_group_cred *tgcred = NULL;
286 struct cred *new; 334 struct cred *new;
287 335
336#ifdef CONFIG_KEYS
337 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
338 if (!tgcred)
339 return NULL;
340#endif
341
288 new = prepare_creds(); 342 new = prepare_creds();
289 if (!new) 343 if (!new) {
344 kfree(tgcred);
290 return new; 345 return new;
346 }
291 347
292#ifdef CONFIG_KEYS 348#ifdef CONFIG_KEYS
293 /* newly exec'd tasks don't get a thread keyring */ 349 /* newly exec'd tasks don't get a thread keyring */
294 key_put(new->thread_keyring); 350 key_put(new->thread_keyring);
295 new->thread_keyring = NULL; 351 new->thread_keyring = NULL;
296 352
353 /* create a new per-thread-group creds for all this set of threads to
354 * share */
355 memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
356
357 atomic_set(&tgcred->usage, 1);
358 spin_lock_init(&tgcred->lock);
359
297 /* inherit the session keyring; new process keyring */ 360 /* inherit the session keyring; new process keyring */
298 key_put(new->process_keyring); 361 key_get(tgcred->session_keyring);
299 new->process_keyring = NULL; 362 tgcred->process_keyring = NULL;
363
364 release_tgcred(new);
365 new->tgcred = tgcred;
300#endif 366#endif
301 367
302 return new; 368 return new;
@@ -313,6 +379,9 @@ struct cred *prepare_exec_creds(void)
313 */ 379 */
314int copy_creds(struct task_struct *p, unsigned long clone_flags) 380int copy_creds(struct task_struct *p, unsigned long clone_flags)
315{ 381{
382#ifdef CONFIG_KEYS
383 struct thread_group_cred *tgcred;
384#endif
316 struct cred *new; 385 struct cred *new;
317 int ret; 386 int ret;
318 387
@@ -342,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
342 goto error_put; 411 goto error_put;
343 } 412 }
344 413
414 /* cache user_ns in cred. Doesn't need a refcount because it will
415 * stay pinned by cred->user
416 */
417 new->user_ns = new->user->user_ns;
418
345#ifdef CONFIG_KEYS 419#ifdef CONFIG_KEYS
346 /* new threads get their own thread keyrings if their parent already 420 /* new threads get their own thread keyrings if their parent already
347 * had one */ 421 * had one */
@@ -352,12 +426,22 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
352 install_thread_keyring_to_cred(new); 426 install_thread_keyring_to_cred(new);
353 } 427 }
354 428
355 /* The process keyring is only shared between the threads in a process; 429 /* we share the process and session keyrings between all the threads in
356 * anything outside of those threads doesn't inherit. 430 * a process - this is slightly icky as we violate COW credentials a
357 */ 431 * bit */
358 if (!(clone_flags & CLONE_THREAD)) { 432 if (!(clone_flags & CLONE_THREAD)) {
359 key_put(new->process_keyring); 433 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
360 new->process_keyring = NULL; 434 if (!tgcred) {
435 ret = -ENOMEM;
436 goto error_put;
437 }
438 atomic_set(&tgcred->usage, 1);
439 spin_lock_init(&tgcred->lock);
440 tgcred->process_keyring = NULL;
441 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
442
443 release_tgcred(new);
444 new->tgcred = tgcred;
361 } 445 }
362#endif 446#endif
363 447
@@ -372,31 +456,6 @@ error_put:
372 return ret; 456 return ret;
373} 457}
374 458
375static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
376{
377 const struct user_namespace *set_ns = set->user_ns;
378 const struct user_namespace *subset_ns = subset->user_ns;
379
380 /* If the two credentials are in the same user namespace see if
381 * the capabilities of subset are a subset of set.
382 */
383 if (set_ns == subset_ns)
384 return cap_issubset(subset->cap_permitted, set->cap_permitted);
385
386 /* The credentials are in a different user namespaces
387 * therefore one is a subset of the other only if a set is an
388 * ancestor of subset and set->euid is owner of subset or one
389 * of subsets ancestors.
390 */
391 for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
392 if ((set_ns == subset_ns->parent) &&
393 uid_eq(subset_ns->owner, set->euid))
394 return true;
395 }
396
397 return false;
398}
399
400/** 459/**
401 * commit_creds - Install new credentials upon the current task 460 * commit_creds - Install new credentials upon the current task
402 * @new: The credentials to be assigned 461 * @new: The credentials to be assigned
@@ -431,11 +490,11 @@ int commit_creds(struct cred *new)
431 get_cred(new); /* we will require a ref for the subj creds too */ 490 get_cred(new); /* we will require a ref for the subj creds too */
432 491
433 /* dumpability changes */ 492 /* dumpability changes */
434 if (!uid_eq(old->euid, new->euid) || 493 if (old->euid != new->euid ||
435 !gid_eq(old->egid, new->egid) || 494 old->egid != new->egid ||
436 !uid_eq(old->fsuid, new->fsuid) || 495 old->fsuid != new->fsuid ||
437 !gid_eq(old->fsgid, new->fsgid) || 496 old->fsgid != new->fsgid ||
438 !cred_cap_issubset(old, new)) { 497 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
439 if (task->mm) 498 if (task->mm)
440 set_dumpable(task->mm, suid_dumpable); 499 set_dumpable(task->mm, suid_dumpable);
441 task->pdeath_signal = 0; 500 task->pdeath_signal = 0;
@@ -443,9 +502,9 @@ int commit_creds(struct cred *new)
443 } 502 }
444 503
445 /* alter the thread keyring */ 504 /* alter the thread keyring */
446 if (!uid_eq(new->fsuid, old->fsuid)) 505 if (new->fsuid != old->fsuid)
447 key_fsuid_changed(task); 506 key_fsuid_changed(task);
448 if (!gid_eq(new->fsgid, old->fsgid)) 507 if (new->fsgid != old->fsgid)
449 key_fsgid_changed(task); 508 key_fsgid_changed(task);
450 509
451 /* do it 510 /* do it
@@ -462,16 +521,16 @@ int commit_creds(struct cred *new)
462 alter_cred_subscribers(old, -2); 521 alter_cred_subscribers(old, -2);
463 522
464 /* send notifications */ 523 /* send notifications */
465 if (!uid_eq(new->uid, old->uid) || 524 if (new->uid != old->uid ||
466 !uid_eq(new->euid, old->euid) || 525 new->euid != old->euid ||
467 !uid_eq(new->suid, old->suid) || 526 new->suid != old->suid ||
468 !uid_eq(new->fsuid, old->fsuid)) 527 new->fsuid != old->fsuid)
469 proc_id_connector(task, PROC_EVENT_UID); 528 proc_id_connector(task, PROC_EVENT_UID);
470 529
471 if (!gid_eq(new->gid, old->gid) || 530 if (new->gid != old->gid ||
472 !gid_eq(new->egid, old->egid) || 531 new->egid != old->egid ||
473 !gid_eq(new->sgid, old->sgid) || 532 new->sgid != old->sgid ||
474 !gid_eq(new->fsgid, old->fsgid)) 533 new->fsgid != old->fsgid)
475 proc_id_connector(task, PROC_EVENT_GID); 534 proc_id_connector(task, PROC_EVENT_GID);
476 535
477 /* release the old obj and subj refs both */ 536 /* release the old obj and subj refs both */
@@ -605,14 +664,13 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
605 atomic_set(&new->usage, 1); 664 atomic_set(&new->usage, 1);
606 set_cred_subscribers(new, 0); 665 set_cred_subscribers(new, 0);
607 get_uid(new->user); 666 get_uid(new->user);
608 get_user_ns(new->user_ns);
609 get_group_info(new->group_info); 667 get_group_info(new->group_info);
610 668
611#ifdef CONFIG_KEYS 669#ifdef CONFIG_KEYS
612 new->session_keyring = NULL; 670 atomic_inc(&init_tgcred.usage);
613 new->process_keyring = NULL; 671 new->tgcred = &init_tgcred;
614 new->thread_keyring = NULL;
615 new->request_key_auth = NULL; 672 new->request_key_auth = NULL;
673 new->thread_keyring = NULL;
616 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 674 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
617#endif 675#endif
618 676
@@ -727,15 +785,9 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
727 atomic_read(&cred->usage), 785 atomic_read(&cred->usage),
728 read_cred_subscribers(cred)); 786 read_cred_subscribers(cred));
729 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", 787 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
730 from_kuid_munged(&init_user_ns, cred->uid), 788 cred->uid, cred->euid, cred->suid, cred->fsuid);
731 from_kuid_munged(&init_user_ns, cred->euid),
732 from_kuid_munged(&init_user_ns, cred->suid),
733 from_kuid_munged(&init_user_ns, cred->fsuid));
734 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", 789 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
735 from_kgid_munged(&init_user_ns, cred->gid), 790 cred->gid, cred->egid, cred->sgid, cred->fsgid);
736 from_kgid_munged(&init_user_ns, cred->egid),
737 from_kgid_munged(&init_user_ns, cred->sgid),
738 from_kgid_munged(&init_user_ns, cred->fsgid));
739#ifdef CONFIG_SECURITY 791#ifdef CONFIG_SECURITY
740 printk(KERN_ERR "CRED: ->security is %p\n", cred->security); 792 printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
741 if ((unsigned long) cred->security >= PAGE_SIZE && 793 if ((unsigned long) cred->security >= PAGE_SIZE &&
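
The cred.c hunks reintroduce struct thread_group_cred, which is shared by all threads of a process and lives or dies purely by its usage count (the kernel defers the actual free through RCU). A simplified userspace model of that get/put pattern, using C11 atomics and an immediate free in place of call_rcu(); the type and field names below are illustrative only:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct tg_cred {
        atomic_int usage;
        int session_key;        /* placeholder for the shared keyrings */
};

static struct tg_cred *get_tgcred(struct tg_cred *tg)
{
        atomic_fetch_add(&tg->usage, 1);
        return tg;
}

static void put_tgcred(struct tg_cred *tg)
{
        /* Last reference gone: release the shared state. */
        if (atomic_fetch_sub(&tg->usage, 1) == 1) {
                printf("freeing shared creds (session_key=%d)\n", tg->session_key);
                free(tg);
        }
}

int main(void)
{
        struct tg_cred *tg = calloc(1, sizeof(*tg));

        atomic_init(&tg->usage, 1);
        tg->session_key = 42;

        get_tgcred(tg);         /* a second thread in the group shares it */
        put_tgcred(tg);         /* one thread exits */
        put_tgcred(tg);         /* last thread exits -> freed here */
        return 0;
}
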
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9a61738cefc..0d7c08784ef 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -41,7 +41,6 @@
41#include <linux/delay.h> 41#include <linux/delay.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/sysrq.h> 43#include <linux/sysrq.h>
44#include <linux/reboot.h>
45#include <linux/init.h> 44#include <linux/init.h>
46#include <linux/kgdb.h> 45#include <linux/kgdb.h>
47#include <linux/kdb.h> 46#include <linux/kdb.h>
@@ -53,6 +52,7 @@
53#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
54#include <asm/byteorder.h> 53#include <asm/byteorder.h>
55#include <linux/atomic.h> 54#include <linux/atomic.h>
55#include <asm/system.h>
56 56
57#include "debug_core.h" 57#include "debug_core.h"
58 58
@@ -75,8 +75,6 @@ static int exception_level;
75struct kgdb_io *dbg_io_ops; 75struct kgdb_io *dbg_io_ops;
76static DEFINE_SPINLOCK(kgdb_registration_lock); 76static DEFINE_SPINLOCK(kgdb_registration_lock);
77 77
78/* Action for the reboot notifiter, a global allow kdb to change it */
79static int kgdbreboot;
80/* kgdb console driver is loaded */ 78/* kgdb console driver is loaded */
81static int kgdb_con_registered; 79static int kgdb_con_registered;
82/* determine if kgdb console output should be used */ 80/* determine if kgdb console output should be used */
@@ -98,7 +96,6 @@ static int __init opt_kgdb_con(char *str)
98early_param("kgdbcon", opt_kgdb_con); 96early_param("kgdbcon", opt_kgdb_con);
99 97
100module_param(kgdb_use_con, int, 0644); 98module_param(kgdb_use_con, int, 0644);
101module_param(kgdbreboot, int, 0644);
102 99
103/* 100/*
104 * Holds information about breakpoints in a kernel. These breakpoints are 101 * Holds information about breakpoints in a kernel. These breakpoints are
@@ -160,39 +157,37 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
160 * Weak aliases for breakpoint management, 157 * Weak aliases for breakpoint management,
161 * can be overriden by architectures when needed: 158 * can be overriden by architectures when needed:
162 */ 159 */
163int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) 160int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
164{ 161{
165 int err; 162 int err;
166 163
167 err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, 164 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
168 BREAK_INSTR_SIZE);
169 if (err) 165 if (err)
170 return err; 166 return err;
171 err = probe_kernel_write((char *)bpt->bpt_addr, 167
172 arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); 168 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
173 return err; 169 BREAK_INSTR_SIZE);
174} 170}
175 171
176int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) 172int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
177{ 173{
178 return probe_kernel_write((char *)bpt->bpt_addr, 174 return probe_kernel_write((char *)addr,
179 (char *)bpt->saved_instr, BREAK_INSTR_SIZE); 175 (char *)bundle, BREAK_INSTR_SIZE);
180} 176}
181 177
182int __weak kgdb_validate_break_address(unsigned long addr) 178int __weak kgdb_validate_break_address(unsigned long addr)
183{ 179{
184 struct kgdb_bkpt tmp; 180 char tmp_variable[BREAK_INSTR_SIZE];
185 int err; 181 int err;
186 /* Validate setting the breakpoint and then removing it. If the 182 /* Validate setting the breakpoint and then removing it. In the
187 * remove fails, the kernel needs to emit a bad message because we 183 * remove fails, the kernel needs to emit a bad message because we
188 * are deep trouble not being able to put things back the way we 184 * are deep trouble not being able to put things back the way we
189 * found them. 185 * found them.
190 */ 186 */
191 tmp.bpt_addr = addr; 187 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
192 err = kgdb_arch_set_breakpoint(&tmp);
193 if (err) 188 if (err)
194 return err; 189 return err;
195 err = kgdb_arch_remove_breakpoint(&tmp); 190 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
196 if (err) 191 if (err)
197 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " 192 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
198 "memory destroyed at: %lx", addr); 193 "memory destroyed at: %lx", addr);
@@ -236,6 +231,7 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
236 */ 231 */
237int dbg_activate_sw_breakpoints(void) 232int dbg_activate_sw_breakpoints(void)
238{ 233{
234 unsigned long addr;
239 int error; 235 int error;
240 int ret = 0; 236 int ret = 0;
241 int i; 237 int i;
@@ -244,15 +240,16 @@ int dbg_activate_sw_breakpoints(void)
244 if (kgdb_break[i].state != BP_SET) 240 if (kgdb_break[i].state != BP_SET)
245 continue; 241 continue;
246 242
247 error = kgdb_arch_set_breakpoint(&kgdb_break[i]); 243 addr = kgdb_break[i].bpt_addr;
244 error = kgdb_arch_set_breakpoint(addr,
245 kgdb_break[i].saved_instr);
248 if (error) { 246 if (error) {
249 ret = error; 247 ret = error;
250 printk(KERN_INFO "KGDB: BP install failed: %lx", 248 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
251 kgdb_break[i].bpt_addr);
252 continue; 249 continue;
253 } 250 }
254 251
255 kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); 252 kgdb_flush_swbreak_addr(addr);
256 kgdb_break[i].state = BP_ACTIVE; 253 kgdb_break[i].state = BP_ACTIVE;
257 } 254 }
258 return ret; 255 return ret;
@@ -301,6 +298,7 @@ int dbg_set_sw_break(unsigned long addr)
301 298
302int dbg_deactivate_sw_breakpoints(void) 299int dbg_deactivate_sw_breakpoints(void)
303{ 300{
301 unsigned long addr;
304 int error; 302 int error;
305 int ret = 0; 303 int ret = 0;
306 int i; 304 int i;
@@ -308,14 +306,15 @@ int dbg_deactivate_sw_breakpoints(void)
308 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 306 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
309 if (kgdb_break[i].state != BP_ACTIVE) 307 if (kgdb_break[i].state != BP_ACTIVE)
310 continue; 308 continue;
311 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 309 addr = kgdb_break[i].bpt_addr;
310 error = kgdb_arch_remove_breakpoint(addr,
311 kgdb_break[i].saved_instr);
312 if (error) { 312 if (error) {
313 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", 313 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
314 kgdb_break[i].bpt_addr);
315 ret = error; 314 ret = error;
316 } 315 }
317 316
318 kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); 317 kgdb_flush_swbreak_addr(addr);
319 kgdb_break[i].state = BP_SET; 318 kgdb_break[i].state = BP_SET;
320 } 319 }
321 return ret; 320 return ret;
@@ -349,6 +348,7 @@ int kgdb_isremovedbreak(unsigned long addr)
349 348
350int dbg_remove_all_break(void) 349int dbg_remove_all_break(void)
351{ 350{
351 unsigned long addr;
352 int error; 352 int error;
353 int i; 353 int i;
354 354
@@ -356,10 +356,12 @@ int dbg_remove_all_break(void)
356 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 356 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
357 if (kgdb_break[i].state != BP_ACTIVE) 357 if (kgdb_break[i].state != BP_ACTIVE)
358 goto setundefined; 358 goto setundefined;
359 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 359 addr = kgdb_break[i].bpt_addr;
360 error = kgdb_arch_remove_breakpoint(addr,
361 kgdb_break[i].saved_instr);
360 if (error) 362 if (error)
361 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", 363 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
362 kgdb_break[i].bpt_addr); 364 addr);
363setundefined: 365setundefined:
364 kgdb_break[i].state = BP_UNDEFINED; 366 kgdb_break[i].state = BP_UNDEFINED;
365 } 367 }
@@ -672,10 +674,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
672{ 674{
673 struct kgdb_state kgdb_var; 675 struct kgdb_state kgdb_var;
674 struct kgdb_state *ks = &kgdb_var; 676 struct kgdb_state *ks = &kgdb_var;
675 int ret = 0;
676
677 if (arch_kgdb_ops.enable_nmi)
678 arch_kgdb_ops.enable_nmi(0);
679 677
680 ks->cpu = raw_smp_processor_id(); 678 ks->cpu = raw_smp_processor_id();
681 ks->ex_vector = evector; 679 ks->ex_vector = evector;
@@ -685,33 +683,13 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
685 ks->linux_regs = regs; 683 ks->linux_regs = regs;
686 684
687 if (kgdb_reenter_check(ks)) 685 if (kgdb_reenter_check(ks))
688 goto out; /* Ouch, double exception ! */ 686 return 0; /* Ouch, double exception ! */
689 if (kgdb_info[ks->cpu].enter_kgdb != 0) 687 if (kgdb_info[ks->cpu].enter_kgdb != 0)
690 goto out; 688 return 0;
691
692 ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
693out:
694 if (arch_kgdb_ops.enable_nmi)
695 arch_kgdb_ops.enable_nmi(1);
696 return ret;
697}
698
699/*
700 * GDB places a breakpoint at this function to know dynamically
701 * loaded objects. It's not defined static so that only one instance with this
702 * name exists in the kernel.
703 */
704 689
705static int module_event(struct notifier_block *self, unsigned long val, 690 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
706 void *data)
707{
708 return 0;
709} 691}
710 692
711static struct notifier_block dbg_module_load_nb = {
712 .notifier_call = module_event,
713};
714
715int kgdb_nmicallback(int cpu, void *regs) 693int kgdb_nmicallback(int cpu, void *regs)
716{ 694{
717#ifdef CONFIG_SMP 695#ifdef CONFIG_SMP
@@ -806,33 +784,6 @@ void __init dbg_late_init(void)
806 kdb_init(KDB_INIT_FULL); 784 kdb_init(KDB_INIT_FULL);
807} 785}
808 786
809static int
810dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
811{
812 /*
813 * Take the following action on reboot notify depending on value:
814 * 1 == Enter debugger
815	 * 0 == [the default] detach debug client
816 * -1 == Do nothing... and use this until the board resets
817 */
818 switch (kgdbreboot) {
819 case 1:
820 kgdb_breakpoint();
821 case -1:
822 goto done;
823 }
824 if (!dbg_kdb_mode)
825 gdbstub_exit(code);
826done:
827 return NOTIFY_DONE;
828}
829
830static struct notifier_block dbg_reboot_notifier = {
831 .notifier_call = dbg_notify_reboot,
832 .next = NULL,
833 .priority = INT_MAX,
834};
835
836static void kgdb_register_callbacks(void) 787static void kgdb_register_callbacks(void)
837{ 788{
838 if (!kgdb_io_module_registered) { 789 if (!kgdb_io_module_registered) {
@@ -840,8 +791,6 @@ static void kgdb_register_callbacks(void)
840 kgdb_arch_init(); 791 kgdb_arch_init();
841 if (!dbg_is_early) 792 if (!dbg_is_early)
842 kgdb_arch_late(); 793 kgdb_arch_late();
843 register_module_notifier(&dbg_module_load_nb);
844 register_reboot_notifier(&dbg_reboot_notifier);
845 atomic_notifier_chain_register(&panic_notifier_list, 794 atomic_notifier_chain_register(&panic_notifier_list,
846 &kgdb_panic_event_nb); 795 &kgdb_panic_event_nb);
847#ifdef CONFIG_MAGIC_SYSRQ 796#ifdef CONFIG_MAGIC_SYSRQ
@@ -863,8 +812,6 @@ static void kgdb_unregister_callbacks(void)
863 */ 812 */
864 if (kgdb_io_module_registered) { 813 if (kgdb_io_module_registered) {
865 kgdb_io_module_registered = 0; 814 kgdb_io_module_registered = 0;
866 unregister_reboot_notifier(&dbg_reboot_notifier);
867 unregister_module_notifier(&dbg_module_load_nb);
868 atomic_notifier_chain_unregister(&panic_notifier_list, 815 atomic_notifier_chain_unregister(&panic_notifier_list,
869 &kgdb_panic_event_nb); 816 &kgdb_panic_event_nb);
870 kgdb_arch_exit(); 817 kgdb_arch_exit();
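
The debug_core.c changes reshuffle the weak kgdb_arch_set_breakpoint()/kgdb_arch_remove_breakpoint() helpers, but the underlying idea is unchanged: save the original instruction bytes, overwrite them with the architecture's breakpoint instruction, and restore the saved bytes on removal. A userspace toy version of that save/patch/restore cycle, where a byte array stands in for kernel text and memcpy() for probe_kernel_read()/probe_kernel_write():

#include <stdio.h>
#include <string.h>

#define BREAK_INSTR_SIZE 1
static const unsigned char bpt_instr[BREAK_INSTR_SIZE] = { 0xcc }; /* x86 int3 */

static unsigned char text[4] = { 0x55, 0x89, 0xe5, 0xc3 };  /* fake code bytes */

static void set_breakpoint(unsigned long off, unsigned char *saved)
{
        memcpy(saved, &text[off], BREAK_INSTR_SIZE);     /* remember the original */
        memcpy(&text[off], bpt_instr, BREAK_INSTR_SIZE); /* patch in the trap */
}

static void remove_breakpoint(unsigned long off, const unsigned char *saved)
{
        memcpy(&text[off], saved, BREAK_INSTR_SIZE);     /* put it back */
}

int main(void)
{
        unsigned char saved[BREAK_INSTR_SIZE];

        set_breakpoint(0, saved);
        printf("patched byte:  0x%02x\n", text[0]);      /* 0xcc */
        remove_breakpoint(0, saved);
        printf("restored byte: 0x%02x\n", text[0]);      /* 0x55 */
        return 0;
}
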
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ce615e06448..34872482315 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -217,7 +217,7 @@ void gdbstub_msg_write(const char *s, int len)
217 217
218 /* Pack in hex chars */ 218 /* Pack in hex chars */
219 for (i = 0; i < wcount; i++) 219 for (i = 0; i < wcount; i++)
220 bufptr = hex_byte_pack(bufptr, s[i]); 220 bufptr = pack_hex_byte(bufptr, s[i]);
221 *bufptr = '\0'; 221 *bufptr = '\0';
222 222
223 /* Move up */ 223 /* Move up */
@@ -249,7 +249,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count)
249 if (err) 249 if (err)
250 return NULL; 250 return NULL;
251 while (count > 0) { 251 while (count > 0) {
252 buf = hex_byte_pack(buf, *tmp); 252 buf = pack_hex_byte(buf, *tmp);
253 tmp++; 253 tmp++;
254 count--; 254 count--;
255 } 255 }
@@ -411,14 +411,14 @@ static char *pack_threadid(char *pkt, unsigned char *id)
411 limit = id + (BUF_THREAD_ID_SIZE / 2); 411 limit = id + (BUF_THREAD_ID_SIZE / 2);
412 while (id < limit) { 412 while (id < limit) {
413 if (!lzero || *id != 0) { 413 if (!lzero || *id != 0) {
414 pkt = hex_byte_pack(pkt, *id); 414 pkt = pack_hex_byte(pkt, *id);
415 lzero = 0; 415 lzero = 0;
416 } 416 }
417 id++; 417 id++;
418 } 418 }
419 419
420 if (lzero) 420 if (lzero)
421 pkt = hex_byte_pack(pkt, 0); 421 pkt = pack_hex_byte(pkt, 0);
422 422
423 return pkt; 423 return pkt;
424} 424}
@@ -486,7 +486,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
486 dbg_remove_all_break(); 486 dbg_remove_all_break();
487 487
488 remcom_out_buffer[0] = 'S'; 488 remcom_out_buffer[0] = 'S';
489 hex_byte_pack(&remcom_out_buffer[1], ks->signo); 489 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
490} 490}
491 491
492static void gdb_get_regs_helper(struct kgdb_state *ks) 492static void gdb_get_regs_helper(struct kgdb_state *ks)
@@ -954,7 +954,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
954 /* Reply to host that an exception has occurred */ 954 /* Reply to host that an exception has occurred */
955 ptr = remcom_out_buffer; 955 ptr = remcom_out_buffer;
956 *ptr++ = 'T'; 956 *ptr++ = 'T';
957 ptr = hex_byte_pack(ptr, ks->signo); 957 ptr = pack_hex_byte(ptr, ks->signo);
958 ptr += strlen(strcpy(ptr, "thread:")); 958 ptr += strlen(strcpy(ptr, "thread:"));
959 int_to_threadref(thref, shadow_pid(current->pid)); 959 int_to_threadref(thref, shadow_pid(current->pid));
960 ptr = pack_threadid(ptr, thref); 960 ptr = pack_threadid(ptr, thref);
@@ -1111,13 +1111,6 @@ void gdbstub_exit(int status)
1111 unsigned char checksum, ch, buffer[3]; 1111 unsigned char checksum, ch, buffer[3];
1112 int loop; 1112 int loop;
1113 1113
1114 if (!kgdb_connected)
1115 return;
1116 kgdb_connected = 0;
1117
1118 if (!dbg_io_ops || dbg_kdb_mode)
1119 return;
1120
1121 buffer[0] = 'W'; 1114 buffer[0] = 'W';
1122 buffer[1] = hex_asc_hi(status); 1115 buffer[1] = hex_asc_hi(status);
1123 buffer[2] = hex_asc_lo(status); 1116 buffer[2] = hex_asc_lo(status);
@@ -1136,6 +1129,5 @@ void gdbstub_exit(int status)
1136 dbg_io_ops->write_char(hex_asc_lo(checksum)); 1129 dbg_io_ops->write_char(hex_asc_lo(checksum));
1137 1130
1138 /* make sure the output is flushed, lest the bootloader clobber it */ 1131 /* make sure the output is flushed, lest the bootloader clobber it */
1139 if (dbg_io_ops->flush) 1132 dbg_io_ops->flush();
1140 dbg_io_ops->flush();
1141} 1133}
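
gdbstub_exit() above emits a GDB remote-serial-protocol exit packet: the payload is plain ASCII with binary bytes encoded as two hex digits, and the frame is $payload#checksum, where the checksum is simply the payload bytes summed modulo 256. A small self-contained example of that framing:

#include <stdio.h>
#include <string.h>

static const char hex_asc[] = "0123456789abcdef";

static void send_packet(const char *payload)
{
        unsigned char checksum = 0;

        for (size_t i = 0; i < strlen(payload); i++)
                checksum += (unsigned char)payload[i];

        printf("$%s#%c%c\n", payload,
               hex_asc[checksum >> 4], hex_asc[checksum & 0x0f]);
}

int main(void)
{
        int status = 0;
        char payload[4];

        /* 'W' plus the exit status as two hex digits, like gdbstub_exit(). */
        payload[0] = 'W';
        payload[1] = hex_asc[(status >> 4) & 0x0f];
        payload[2] = hex_asc[status & 0x0f];
        payload[3] = '\0';

        send_packet(payload);   /* prints $W00#b7 */
        return 0;
}

This is also why the hex_byte_pack()/pack_hex_byte() rename touches so many call sites: every register dump, thread id and status byte goes over the wire as that two-digit hex encoding.
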
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 8418c2f8ec5..20059ef4459 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,13 +153,6 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
153 } else { 153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n", 154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr); 155 __func__, bp->bp_addr);
156#ifdef CONFIG_DEBUG_RODATA
157 if (!bp->bp_type) {
158 kdb_printf("Software breakpoints are unavailable.\n"
159 " Change the kernel CONFIG_DEBUG_RODATA=n\n"
160 " OR use hw breaks: help bph\n");
161 }
162#endif
163 return 1; 156 return 1;
164 } 157 }
165 return 0; 158 return 0;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index b03e0e814e4..7179eac7b41 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -15,6 +15,7 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kdb.h> 16#include <linux/kdb.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <asm/system.h>
18#include "kdb_private.h" 19#include "kdb_private.h"
19 20
20 21
@@ -129,8 +130,6 @@ kdb_bt(int argc, const char **argv)
129 } 130 }
130 /* Now the inactive tasks */ 131 /* Now the inactive tasks */
131 kdb_do_each_thread(g, p) { 132 kdb_do_each_thread(g, p) {
132 if (KDB_FLAG(CMD_INTERRUPT))
133 return 0;
134 if (task_curr(p)) 133 if (task_curr(p))
135 continue; 134 continue;
136 if (kdb_bt1(p, mask, argcount, btaprompt)) 135 if (kdb_bt1(p, mask, argcount, btaprompt))
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index be7b33b73d3..d9ca9aa481e 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -11,8 +11,6 @@
11#include <linux/kgdb.h> 11#include <linux/kgdb.h>
12#include <linux/kdb.h> 12#include <linux/kdb.h>
13#include <linux/kdebug.h> 13#include <linux/kdebug.h>
14#include <linux/export.h>
15#include <linux/hardirq.h>
16#include "kdb_private.h" 14#include "kdb_private.h"
17#include "../debug_core.h" 15#include "../debug_core.h"
18 16
@@ -53,9 +51,6 @@ int kdb_stub(struct kgdb_state *ks)
53 if (atomic_read(&kgdb_setting_breakpoint)) 51 if (atomic_read(&kgdb_setting_breakpoint))
54 reason = KDB_REASON_KEYBOARD; 52 reason = KDB_REASON_KEYBOARD;
55 53
56 if (in_nmi())
57 reason = KDB_REASON_NMI;
58
59 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { 54 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
60 if ((bp->bp_enabled) && (bp->bp_addr == addr)) { 55 if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
61 reason = KDB_REASON_BREAK; 56 reason = KDB_REASON_BREAK;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 14ff4849262..4802eb5840e 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -552,7 +552,6 @@ int vkdb_printf(const char *fmt, va_list ap)
552{ 552{
553 int diag; 553 int diag;
554 int linecount; 554 int linecount;
555 int colcount;
556 int logging, saved_loglevel = 0; 555 int logging, saved_loglevel = 0;
557 int saved_trap_printk; 556 int saved_trap_printk;
558 int got_printf_lock = 0; 557 int got_printf_lock = 0;
@@ -585,10 +584,6 @@ int vkdb_printf(const char *fmt, va_list ap)
585 if (diag || linecount <= 1) 584 if (diag || linecount <= 1)
586 linecount = 24; 585 linecount = 24;
587 586
588 diag = kdbgetintenv("COLUMNS", &colcount);
589 if (diag || colcount <= 1)
590 colcount = 80;
591
592 diag = kdbgetintenv("LOGGING", &logging); 587 diag = kdbgetintenv("LOGGING", &logging);
593 if (diag) 588 if (diag)
594 logging = 0; 589 logging = 0;
@@ -694,8 +689,8 @@ kdb_printit:
694 if (!dbg_kdb_mode && kgdb_connected) { 689 if (!dbg_kdb_mode && kgdb_connected) {
695 gdbstub_msg_write(kdb_buffer, retlen); 690 gdbstub_msg_write(kdb_buffer, retlen);
696 } else { 691 } else {
697 if (dbg_io_ops && !dbg_io_ops->is_console) { 692 if (!dbg_io_ops->is_console) {
698 len = retlen; 693 len = strlen(kdb_buffer);
699 cp = kdb_buffer; 694 cp = kdb_buffer;
700 while (len--) { 695 while (len--) {
701 dbg_io_ops->write_char(*cp); 696 dbg_io_ops->write_char(*cp);
@@ -714,30 +709,15 @@ kdb_printit:
714 printk(KERN_INFO "%s", kdb_buffer); 709 printk(KERN_INFO "%s", kdb_buffer);
715 } 710 }
716 711
717 if (KDB_STATE(PAGER)) { 712 if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
718 /* 713 kdb_nextline++;
719 * Check printed string to decide how to bump the
720 * kdb_nextline to control when the more prompt should
721 * show up.
722 */
723 int got = 0;
724 len = retlen;
725 while (len--) {
726 if (kdb_buffer[len] == '\n') {
727 kdb_nextline++;
728 got = 0;
729 } else if (kdb_buffer[len] == '\r') {
730 got = 0;
731 } else {
732 got++;
733 }
734 }
735 kdb_nextline += got / (colcount + 1);
736 }
737 714
738 /* check for having reached the LINES number of printed lines */ 715 /* check for having reached the LINES number of printed lines */
739 if (kdb_nextline >= linecount) { 716 if (kdb_nextline == linecount) {
740 char buf1[16] = ""; 717 char buf1[16] = "";
718#if defined(CONFIG_SMP)
719 char buf2[32];
720#endif
741 721
742 /* Watch out for recursion here. Any routine that calls 722 /* Watch out for recursion here. Any routine that calls
743 * kdb_printf will come back through here. And kdb_read 723 * kdb_printf will come back through here. And kdb_read
@@ -752,10 +732,18 @@ kdb_printit:
752 if (moreprompt == NULL) 732 if (moreprompt == NULL)
753 moreprompt = "more> "; 733 moreprompt = "more> ";
754 734
735#if defined(CONFIG_SMP)
736 if (strchr(moreprompt, '%')) {
737 sprintf(buf2, moreprompt, get_cpu());
738 put_cpu();
739 moreprompt = buf2;
740 }
741#endif
742
755 kdb_input_flush(); 743 kdb_input_flush();
756 c = console_drivers; 744 c = console_drivers;
757 745
758 if (dbg_io_ops && !dbg_io_ops->is_console) { 746 if (!dbg_io_ops->is_console) {
759 len = strlen(moreprompt); 747 len = strlen(moreprompt);
760 cp = moreprompt; 748 cp = moreprompt;
761 while (len--) { 749 while (len--) {
@@ -788,7 +776,7 @@ kdb_printit:
788 kdb_grepping_flag = 0; 776 kdb_grepping_flag = 0;
789 kdb_printf("\n"); 777 kdb_printf("\n");
790 } else if (buf1[0] == ' ') { 778 } else if (buf1[0] == ' ') {
791 kdb_printf("\r"); 779 kdb_printf("\n");
792 suspend_grep = 1; /* for this recursion */ 780 suspend_grep = 1; /* for this recursion */
793 } else if (buf1[0] == '\n') { 781 } else if (buf1[0] == '\n') {
794 kdb_nextline = linecount - 1; 782 kdb_nextline = linecount - 1;
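
Both versions of the vkdb_printf() pager above do the same bookkeeping: count how many screen lines the output has consumed (newlines, plus wraps of over-long lines against COLUMNS in the newer variant) and pause at a more> prompt once LINES is reached. The sketch below is only a loose userspace approximation of that accounting, not the kdb implementation:

#include <stdio.h>
#include <string.h>

static int next_line = 1;

static void pager_account(const char *buf, int linecount, int colcount)
{
        int col = 0;

        for (size_t i = 0; i < strlen(buf); i++) {
                if (buf[i] == '\n') {
                        next_line++;
                        col = 0;
                } else if (++col > colcount) {  /* long line wrapped on screen */
                        next_line++;
                        col = 0;
                }
        }
        if (next_line >= linecount) {
                printf("more> ");               /* real kdb waits for a key here */
                next_line = 1;
        }
}

int main(void)
{
        pager_account("line one\nline two\n", 24, 80);
        printf("next_line is now %d\n", next_line);
        return 0;
}
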
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 118527aa60e..4bca634975c 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,7 +25,6 @@
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ 25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26 26
27static int kbd_exists; 27static int kbd_exists;
28static int kbd_last_ret;
29 28
30/* 29/*
31 * Check if the keyboard controller has a keypress for us. 30 * Check if the keyboard controller has a keypress for us.
@@ -91,11 +90,8 @@ int kdb_get_kbd_char(void)
91 return -1; 90 return -1;
92 } 91 }
93 92
94 if ((scancode & 0x80) != 0) { 93 if ((scancode & 0x80) != 0)
95 if (scancode == 0x9c)
96 kbd_last_ret = 0;
97 return -1; 94 return -1;
98 }
99 95
100 scancode &= 0x7f; 96 scancode &= 0x7f;
101 97
@@ -182,82 +178,35 @@ int kdb_get_kbd_char(void)
182 return -1; /* ignore unprintables */ 178 return -1; /* ignore unprintables */
183 } 179 }
184 180
185 if (scancode == 0x1c) { 181 if ((scancode & 0x7f) == 0x1c) {
186 kbd_last_ret = 1; 182 /*
187 return 13; 183 * enter key. All done. Absorb the release scancode.
188 } 184 */
189
190 return keychar & 0xff;
191}
192EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
193
194/*
195 * Best effort cleanup of ENTER break codes on leaving KDB. Called on
196 * exiting KDB, when we know we processed an ENTER or KP ENTER scan
197 * code.
198 */
199void kdb_kbd_cleanup_state(void)
200{
201 int scancode, scanstatus;
202
203 /*
204 * Nothing to clean up, since either
205 * ENTER was never pressed, or has already
206 * gotten cleaned up.
207 */
208 if (!kbd_last_ret)
209 return;
210
211 kbd_last_ret = 0;
212 /*
213 * Enter key. Need to absorb the break code here, lest it gets
214 * leaked out if we exit KDB as the result of processing 'g'.
215 *
216 * This has several interesting implications:
217 * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
218 * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
219 * only get a break code at the end of the repeated
220 * sequence. This means we can't propagate the repeated key
221 * press, and must swallow it away.
222 * + Need to handle possible PS/2 mouse input.
223 * + Need to handle mashed keys.
224 */
225
226 while (1) {
227 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) 185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
228 cpu_relax(); 186 ;
229 187
230 /* 188 /*
231 * Fetch the scancode. 189 * Fetch the scancode
232 */ 190 */
233 scancode = inb(KBD_DATA_REG); 191 scancode = inb(KBD_DATA_REG);
234 scanstatus = inb(KBD_STATUS_REG); 192 scanstatus = inb(KBD_STATUS_REG);
235 193
236 /* 194 while (scanstatus & KBD_STAT_MOUSE_OBF) {
237 * Skip mouse input. 195 scancode = inb(KBD_DATA_REG);
238 */ 196 scanstatus = inb(KBD_STATUS_REG);
239 if (scanstatus & KBD_STAT_MOUSE_OBF) 197 }
240 continue;
241 198
242 /* 199 if (scancode != 0x9c) {
243 * If we see 0xe0, this is either a break code for KP 200 /*
244 * ENTER, or a repeat make for KP ENTER. Either way, 201 * Wasn't an enter-release, why not?
245 * since the second byte is equivalent to an ENTER, 202 */
246 * skip the 0xe0 and try again. 203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
247 * 204 scancode, scanstatus);
248 * If we see 0x1c, this must be a repeat ENTER or KP 205 }
249 * ENTER (and we swallowed 0xe0 before). Try again.
250 *
251 * We can also see make and break codes for other keys
252 * mashed before or after pressing ENTER. Thus, if we
253 * see anything other than 0x9c, we have to try again.
254 *
255 * Note, if you held some key as ENTER was depressed,
256 * that break code would get leaked out.
257 */
258 if (scancode != 0x9c)
259 continue;
260 206
261 return; 207 return 13;
262 } 208 }
209
210 return keychar & 0xff;
263} 211}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
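
The removed kdb_kbd_cleanup_state() above polls the i8042 status port until a byte is available, skips PS/2 mouse traffic, and keeps reading until it sees the 0x9c ENTER break code. A hedged sketch of that drain loop, with the inb(KBD_STATUS_REG)/inb(KBD_DATA_REG) pair replaced by a fixed simulated byte stream so it compiles and runs in userspace:

#include <stdint.h>
#include <stdio.h>

#define KBD_STAT_OBF       0x01   /* output buffer full */
#define KBD_STAT_MOUSE_OBF 0x20   /* byte came from the PS/2 mouse */

/* Simulated i8042 output: each entry stands for one inb(KBD_DATA_REG)
 * paired with the inb(KBD_STATUS_REG) that described it. */
struct kbd_byte { uint8_t status, data; };

static const struct kbd_byte stream[] = {
        { KBD_STAT_OBF | KBD_STAT_MOUSE_OBF, 0x12 },  /* mouse traffic   */
        { KBD_STAT_OBF,                      0xe0 },  /* KP ENTER prefix */
        { KBD_STAT_OBF,                      0x9c },  /* ENTER break code */
};
static unsigned int pos;

/* Drain bytes until the ENTER break code (0x9c) has been consumed, as the
 * removed kdb_kbd_cleanup_state() does before leaving kdb. */
static void absorb_enter_release(void)
{
        while (pos < sizeof(stream) / sizeof(stream[0])) {
                struct kbd_byte b = stream[pos++];

                if (b.status & KBD_STAT_MOUSE_OBF)
                        continue;       /* skip PS/2 mouse bytes */
                if (b.data != 0x9c)
                        continue;       /* 0xe0 prefix, repeats, mashed keys */

                printf("ENTER release absorbed after %u bytes\n", pos);
                return;
        }
}

int main(void)
{
        absorb_enter_release();
        return 0;
}
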
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4d5f8d5612f..63786e71a3c 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,14 +14,12 @@
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/kmsg_dump.h>
18#include <linux/reboot.h> 17#include <linux/reboot.h>
19#include <linux/sched.h> 18#include <linux/sched.h>
20#include <linux/sysrq.h> 19#include <linux/sysrq.h>
21#include <linux/smp.h> 20#include <linux/smp.h>
22#include <linux/utsname.h> 21#include <linux/utsname.h>
23#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
24#include <linux/atomic.h>
25#include <linux/module.h> 23#include <linux/module.h>
26#include <linux/mm.h> 24#include <linux/mm.h>
27#include <linux/init.h> 25#include <linux/init.h>
@@ -140,10 +138,11 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
140static char *__env[] = { 138static char *__env[] = {
141#if defined(CONFIG_SMP) 139#if defined(CONFIG_SMP)
142 "PROMPT=[%d]kdb> ", 140 "PROMPT=[%d]kdb> ",
141 "MOREPROMPT=[%d]more> ",
143#else 142#else
144 "PROMPT=kdb> ", 143 "PROMPT=kdb> ",
145#endif
146 "MOREPROMPT=more> ", 144 "MOREPROMPT=more> ",
145#endif
147 "RADIX=16", 146 "RADIX=16",
148 "MDCOUNT=8", /* lines of md output */ 147 "MDCOUNT=8", /* lines of md output */
149 KDB_PLATFORM_ENV, 148 KDB_PLATFORM_ENV,
@@ -1236,6 +1235,18 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1236 *cmdbuf = '\0'; 1235 *cmdbuf = '\0';
1237 *(cmd_hist[cmd_head]) = '\0'; 1236 *(cmd_hist[cmd_head]) = '\0';
1238 1237
1238 if (KDB_FLAG(ONLY_DO_DUMP)) {
1239 /* kdb is off but a catastrophic error requires a dump.
1240 * Take the dump and reboot.
1241 * Turn on logging so the kdb output appears in the log
1242 * buffer in the dump.
1243 */
1244 const char *setargs[] = { "set", "LOGGING", "1" };
1245 kdb_set(2, setargs);
1246 kdb_reboot(0, NULL);
1247 /*NOTREACHED*/
1248 }
1249
1239do_full_getstr: 1250do_full_getstr:
1240#if defined(CONFIG_SMP) 1251#if defined(CONFIG_SMP)
1241 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), 1252 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
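
On SMP the PROMPT and MOREPROMPT environment strings carry a %d (see the __env hunk earlier) that gets expanded with the current CPU, as in the snprintf on kdbgetenv("PROMPT") above and the sprintf on moreprompt in kdb_io.c. A minimal sketch of that expansion with the CPU id hard-coded in place of get_cpu()/raw_smp_processor_id():

#include <stdio.h>

int main(void)
{
        const char *prompt = "[%d]kdb> ";      /* PROMPT value on SMP */
        char buf[32];
        int cpu = 3;                            /* get_cpu() in kdb */

        snprintf(buf, sizeof(buf), prompt, cpu);
        printf("%s\n", buf);                    /* prints "[3]kdb> " */
        return 0;
}
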
@@ -1389,9 +1400,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1389 if (KDB_STATE(DOING_SS)) 1400 if (KDB_STATE(DOING_SS))
1390 KDB_STATE_CLEAR(SSBPT); 1401 KDB_STATE_CLEAR(SSBPT);
1391 1402
1392 /* Clean up any keyboard devices before leaving */
1393 kdb_kbd_cleanup_state();
1394
1395 return result; 1403 return result;
1396} 1404}
1397 1405
@@ -1974,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv)
1974 kdb_printf("%-20s%8u 0x%p ", mod->name, 1982 kdb_printf("%-20s%8u 0x%p ", mod->name,
1975 mod->core_size, (void *)mod); 1983 mod->core_size, (void *)mod);
1976#ifdef CONFIG_MODULE_UNLOAD 1984#ifdef CONFIG_MODULE_UNLOAD
1977 kdb_printf("%4ld ", module_refcount(mod)); 1985 kdb_printf("%4d ", module_refcount(mod));
1978#endif 1986#endif
1979 if (mod->state == MODULE_STATE_GOING) 1987 if (mod->state == MODULE_STATE_GOING)
1980 kdb_printf(" (Unloading)"); 1988 kdb_printf(" (Unloading)");
@@ -2029,15 +2037,8 @@ static int kdb_env(int argc, const char **argv)
2029 */ 2037 */
2030static int kdb_dmesg(int argc, const char **argv) 2038static int kdb_dmesg(int argc, const char **argv)
2031{ 2039{
2032 int diag; 2040 char *syslog_data[4], *start, *end, c = '\0', *p;
2033 int logging; 2041 int diag, logging, logsize, lines = 0, adjust = 0, n;
2034 int lines = 0;
2035 int adjust = 0;
2036 int n = 0;
2037 int skip = 0;
2038 struct kmsg_dumper dumper = { .active = 1 };
2039 size_t len;
2040 char buf[201];
2041 2042
2042 if (argc > 2) 2043 if (argc > 2)
2043 return KDB_ARGCOUNT; 2044 return KDB_ARGCOUNT;
@@ -2060,10 +2061,22 @@ static int kdb_dmesg(int argc, const char **argv)
2060 kdb_set(2, setargs); 2061 kdb_set(2, setargs);
2061 } 2062 }
2062 2063
2063 kmsg_dump_rewind_nolock(&dumper); 2064 /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
2064 while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) 2065 * logical start, end+1. */
2065 n++; 2066 kdb_syslog_data(syslog_data);
2066 2067 if (syslog_data[2] == syslog_data[3])
2068 return 0;
2069 logsize = syslog_data[1] - syslog_data[0];
2070 start = syslog_data[2];
2071 end = syslog_data[3];
2072#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
2073 for (n = 0, p = start; p < end; ++p) {
2074 c = *KDB_WRAP(p);
2075 if (c == '\n')
2076 ++n;
2077 }
2078 if (c != '\n')
2079 ++n;
2067 if (lines < 0) { 2080 if (lines < 0) {
2068 if (adjust >= n) 2081 if (adjust >= n)
2069 kdb_printf("buffer only contains %d lines, nothing " 2082 kdb_printf("buffer only contains %d lines, nothing "
@@ -2071,11 +2084,21 @@ static int kdb_dmesg(int argc, const char **argv)
2071 else if (adjust - lines >= n) 2084 else if (adjust - lines >= n)
2072 kdb_printf("buffer only contains %d lines, last %d " 2085 kdb_printf("buffer only contains %d lines, last %d "
2073 "lines printed\n", n, n - adjust); 2086 "lines printed\n", n, n - adjust);
2074 skip = adjust; 2087 if (adjust) {
2075 lines = abs(lines); 2088 for (; start < end && adjust; ++start) {
2089 if (*KDB_WRAP(start) == '\n')
2090 --adjust;
2091 }
2092 if (start < end)
2093 ++start;
2094 }
2095 for (p = start; p < end && lines; ++p) {
2096 if (*KDB_WRAP(p) == '\n')
2097 ++lines;
2098 }
2099 end = p;
2076 } else if (lines > 0) { 2100 } else if (lines > 0) {
2077 skip = n - lines - adjust; 2101 int skip = n - (adjust + lines);
2078 lines = abs(lines);
2079 if (adjust >= n) { 2102 if (adjust >= n) {
2080 kdb_printf("buffer only contains %d lines, " 2103 kdb_printf("buffer only contains %d lines, "
2081 "nothing printed\n", n); 2104 "nothing printed\n", n);
@@ -2086,56 +2109,39 @@ static int kdb_dmesg(int argc, const char **argv)
2086 kdb_printf("buffer only contains %d lines, first " 2109 kdb_printf("buffer only contains %d lines, first "
2087 "%d lines printed\n", n, lines); 2110 "%d lines printed\n", n, lines);
2088 } 2111 }
2089 } else { 2112 for (; start < end && skip; ++start) {
2090 lines = n; 2113 if (*KDB_WRAP(start) == '\n')
2091 } 2114 --skip;
2092
2093 if (skip >= n || skip < 0)
2094 return 0;
2095
2096 kmsg_dump_rewind_nolock(&dumper);
2097 while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
2098 if (skip) {
2099 skip--;
2100 continue;
2101 } 2115 }
2102 if (!lines--) 2116 for (p = start; p < end && lines; ++p) {
2103 break; 2117 if (*KDB_WRAP(p) == '\n')
2118 --lines;
2119 }
2120 end = p;
2121 }
2122 /* Do a line at a time (max 200 chars) to reduce protocol overhead */
2123 c = '\n';
2124 while (start != end) {
2125 char buf[201];
2126 p = buf;
2104 if (KDB_FLAG(CMD_INTERRUPT)) 2127 if (KDB_FLAG(CMD_INTERRUPT))
2105 return 0; 2128 return 0;
2106 2129 while (start < end && (c = *KDB_WRAP(start)) &&
2107 kdb_printf("%.*s\n", (int)len - 1, buf); 2130 (p - buf) < sizeof(buf)-1) {
2131 ++start;
2132 *p++ = c;
2133 if (c == '\n')
2134 break;
2135 }
2136 *p = '\0';
2137 kdb_printf("%s", buf);
2108 } 2138 }
2139 if (c != '\n')
2140 kdb_printf("\n");
2109 2141
2110 return 0; 2142 return 0;
2111} 2143}
2112#endif /* CONFIG_PRINTK */ 2144#endif /* CONFIG_PRINTK */
2113
2114/* Make sure we balance enable/disable calls, must disable first. */
2115static atomic_t kdb_nmi_disabled;
2116
2117static int kdb_disable_nmi(int argc, const char *argv[])
2118{
2119 if (atomic_read(&kdb_nmi_disabled))
2120 return 0;
2121 atomic_set(&kdb_nmi_disabled, 1);
2122 arch_kgdb_ops.enable_nmi(0);
2123 return 0;
2124}
2125
2126static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
2127{
2128 if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
2129 return -EINVAL;
2130 arch_kgdb_ops.enable_nmi(1);
2131 return 0;
2132}
2133
2134static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
2135 .set = kdb_param_enable_nmi,
2136};
2137module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
2138
2139/* 2145/*
2140 * kdb_cpu - This function implements the 'cpu' command. 2146 * kdb_cpu - This function implements the 'cpu' command.
2141 * cpu [<cpunum>] 2147 * cpu [<cpunum>]
@@ -2880,10 +2886,6 @@ static void __init kdb_inittab(void)
2880 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", 2886 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
2881 "Display syslog buffer", 0, KDB_REPEAT_NONE); 2887 "Display syslog buffer", 0, KDB_REPEAT_NONE);
2882#endif 2888#endif
2883 if (arch_kgdb_ops.enable_nmi) {
2884 kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
2885 "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
2886 }
2887 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", 2889 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2888 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); 2890 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
2889 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", 2891 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 392ec6a2584..e381d105b40 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -205,6 +205,7 @@ extern char kdb_grep_string[];
205extern int kdb_grep_leading; 205extern int kdb_grep_leading;
206extern int kdb_grep_trailing; 206extern int kdb_grep_trailing;
207extern char *kdb_cmds[]; 207extern char *kdb_cmds[];
208extern void kdb_syslog_data(char *syslog_data[]);
208extern unsigned long kdb_task_state_string(const char *); 209extern unsigned long kdb_task_state_string(const char *);
209extern char kdb_task_state_char (const struct task_struct *); 210extern char kdb_task_state_char (const struct task_struct *);
210extern unsigned long kdb_task_state(const struct task_struct *p, 211extern unsigned long kdb_task_state(const struct task_struct *p,
@@ -245,13 +246,6 @@ extern void debug_kusage(void);
245 246
246extern void kdb_set_current_task(struct task_struct *); 247extern void kdb_set_current_task(struct task_struct *);
247extern struct task_struct *kdb_current_task; 248extern struct task_struct *kdb_current_task;
248
249#ifdef CONFIG_KDB_KEYBOARD
250extern void kdb_kbd_cleanup_state(void);
251#else /* ! CONFIG_KDB_KEYBOARD */
252#define kdb_kbd_cleanup_state()
253#endif /* ! CONFIG_KDB_KEYBOARD */
254
255#ifdef CONFIG_MODULES 249#ifdef CONFIG_MODULES
256extern struct list_head *kdb_modules; 250extern struct list_head *kdb_modules;
257#endif /* CONFIG_MODULES */ 251#endif /* CONFIG_MODULES */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index d35cc2d3a4c..5532dd37aa8 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
384 if (!pfn_valid(pfn)) 384 if (!pfn_valid(pfn))
385 return 1; 385 return 1;
386 page = pfn_to_page(pfn); 386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page); 387 vaddr = kmap_atomic(page, KM_KDB);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); 388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr); 389 kunmap_atomic(vaddr, KM_KDB);
390 390
391 return 0; 391 return 0;
392} 392}
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' : 636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' : 637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; 638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (is_idle_task(p)) { 639 if (p->pid == 0) {
640 /* Idle task. Is it really idle, apart from the kdb 640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */ 641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { 642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
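
The kdb_getphys() hunk above validates the pfn, temporarily maps the page and memcpy's the requested bytes out of it; the change is only the kmap_atomic() calling convention (the older kernel on the right still passes a KM_KDB slot). As a rough, hedged userspace analogue of "read a few bytes at a physical address within one page", the sketch below maps /dev/mem instead; it needs root, a kernel that permits /dev/mem reads, and the address used is only an example.

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* Rough analogue of kdb_getphys(): map the page containing `addr` and copy
 * `size` bytes out of it. Like the kernel helper, the read must not cross
 * a page boundary. */
static int read_phys(void *res, unsigned long addr, size_t size)
{
        long pagesz = sysconf(_SC_PAGESIZE);
        unsigned long page = addr & ~(unsigned long)(pagesz - 1);
        int fd = open("/dev/mem", O_RDONLY);
        if (fd < 0)
                return 1;

        void *vaddr = mmap(NULL, pagesz, PROT_READ, MAP_SHARED, fd, page);
        close(fd);
        if (vaddr == MAP_FAILED)
                return 1;

        memcpy(res, (char *)vaddr + (addr & (pagesz - 1)), size);
        munmap(vaddr, pagesz);
        return 0;
}

int main(void)
{
        uint32_t word;

        /* 0xF0000 is just an example (legacy BIOS area on x86). */
        if (read_phys(&word, 0x000f0000UL, sizeof(word)) == 0)
                printf("0x%08x\n", (unsigned)word);
        return 0;
}
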
diff --git a/kernel/dma.c b/kernel/dma.c
index 6c6262f86c1..f903189c530 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -9,7 +9,7 @@
9 * [It also happened to remove the sizeof(char *) == sizeof(int) 9 * [It also happened to remove the sizeof(char *) == sizeof(int)
10 * assumption introduced because of those /proc/dma patches. -- Hennus] 10 * assumption introduced because of those /proc/dma patches. -- Hennus]
11 */ 11 */
12#include <linux/export.h> 12#include <linux/module.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
@@ -18,6 +18,7 @@
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <asm/dma.h> 20#include <asm/dma.h>
21#include <asm/system.h>
21 22
22 23
23 24
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 103f5d147b2..89e5e8aa4c3 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,8 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o callchain.o 5obj-y := core.o ring_buffer.o
6
7obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
8obj-$(CONFIG_UPROBES) += uprobes.o
9
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
deleted file mode 100644
index c77206184b8..00000000000
--- a/kernel/events/callchain.c
+++ /dev/null
@@ -1,206 +0,0 @@
1/*
2 * Performance events callchain code, extracted from core.c:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/slab.h>
14#include "internal.h"
15
16struct callchain_cpus_entries {
17 struct rcu_head rcu_head;
18 struct perf_callchain_entry *cpu_entries[0];
19};
20
21static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
22static atomic_t nr_callchain_events;
23static DEFINE_MUTEX(callchain_mutex);
24static struct callchain_cpus_entries *callchain_cpus_entries;
25
26
27__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
28 struct pt_regs *regs)
29{
30}
31
32__weak void perf_callchain_user(struct perf_callchain_entry *entry,
33 struct pt_regs *regs)
34{
35}
36
37static void release_callchain_buffers_rcu(struct rcu_head *head)
38{
39 struct callchain_cpus_entries *entries;
40 int cpu;
41
42 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
43
44 for_each_possible_cpu(cpu)
45 kfree(entries->cpu_entries[cpu]);
46
47 kfree(entries);
48}
49
50static void release_callchain_buffers(void)
51{
52 struct callchain_cpus_entries *entries;
53
54 entries = callchain_cpus_entries;
55 rcu_assign_pointer(callchain_cpus_entries, NULL);
56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
57}
58
59static int alloc_callchain_buffers(void)
60{
61 int cpu;
62 int size;
63 struct callchain_cpus_entries *entries;
64
65 /*
66 * We can't use the percpu allocation API for data that can be
67 * accessed from NMI. Use a temporary manual per cpu allocation
68 * until that gets sorted out.
69 */
70 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
71
72 entries = kzalloc(size, GFP_KERNEL);
73 if (!entries)
74 return -ENOMEM;
75
76 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
77
78 for_each_possible_cpu(cpu) {
79 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
80 cpu_to_node(cpu));
81 if (!entries->cpu_entries[cpu])
82 goto fail;
83 }
84
85 rcu_assign_pointer(callchain_cpus_entries, entries);
86
87 return 0;
88
89fail:
90 for_each_possible_cpu(cpu)
91 kfree(entries->cpu_entries[cpu]);
92 kfree(entries);
93
94 return -ENOMEM;
95}
96
97int get_callchain_buffers(void)
98{
99 int err = 0;
100 int count;
101
102 mutex_lock(&callchain_mutex);
103
104 count = atomic_inc_return(&nr_callchain_events);
105 if (WARN_ON_ONCE(count < 1)) {
106 err = -EINVAL;
107 goto exit;
108 }
109
110 if (count > 1) {
111 /* If the allocation failed, give up */
112 if (!callchain_cpus_entries)
113 err = -ENOMEM;
114 goto exit;
115 }
116
117 err = alloc_callchain_buffers();
118exit:
119 mutex_unlock(&callchain_mutex);
120
121 return err;
122}
123
124void put_callchain_buffers(void)
125{
126 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
127 release_callchain_buffers();
128 mutex_unlock(&callchain_mutex);
129 }
130}
131
132static struct perf_callchain_entry *get_callchain_entry(int *rctx)
133{
134 int cpu;
135 struct callchain_cpus_entries *entries;
136
137 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
138 if (*rctx == -1)
139 return NULL;
140
141 entries = rcu_dereference(callchain_cpus_entries);
142 if (!entries)
143 return NULL;
144
145 cpu = smp_processor_id();
146
147 return &entries->cpu_entries[cpu][*rctx];
148}
149
150static void
151put_callchain_entry(int rctx)
152{
153 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
154}
155
156struct perf_callchain_entry *
157perf_callchain(struct perf_event *event, struct pt_regs *regs)
158{
159 int rctx;
160 struct perf_callchain_entry *entry;
161
162 int kernel = !event->attr.exclude_callchain_kernel;
163 int user = !event->attr.exclude_callchain_user;
164
165 if (!kernel && !user)
166 return NULL;
167
168 entry = get_callchain_entry(&rctx);
169 if (rctx == -1)
170 return NULL;
171
172 if (!entry)
173 goto exit_put;
174
175 entry->nr = 0;
176
177 if (kernel && !user_mode(regs)) {
178 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
179 perf_callchain_kernel(entry, regs);
180 }
181
182 if (user) {
183 if (!user_mode(regs)) {
184 if (current->mm)
185 regs = task_pt_regs(current);
186 else
187 regs = NULL;
188 }
189
190 if (regs) {
191 /*
192 * Disallow cross-task user callchains.
193 */
194 if (event->ctx->task && event->ctx->task != current)
195 goto exit_put;
196
197 perf_callchain_store(entry, PERF_CONTEXT_USER);
198 perf_callchain_user(entry, regs);
199 }
200 }
201
202exit_put:
203 put_callchain_entry(rctx);
204
205 return entry;
206}
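
alloc_callchain_buffers() in the deleted file above sizes a single header allocation with offsetof() over a per-CPU pointer array, then hangs one callchain buffer off each slot. A hedged sketch of that layout that compiles in userspace; the struct and sizes are illustrative, and calloc()/malloc() stand in for kzalloc()/kmalloc_node().

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Mirrors struct callchain_cpus_entries: one header allocation whose tail
 * is an array of per-CPU buffer pointers, sized with offsetof() over the
 * trailing array exactly as alloc_callchain_buffers() does above. */
struct cpus_entries {
        long refcount;                  /* the kernel keeps an rcu_head here */
        void *cpu_entries[];            /* one buffer pointer per possible CPU */
};

int main(void)
{
        int nr_cpu_ids = 8;             /* nr_cpu_ids in the kernel */

        /* A variable index inside offsetof() is a GCC/Clang extension the
         * kernel relies on; it is cpu_entries' offset + n * sizeof(void *). */
        size_t size = offsetof(struct cpus_entries, cpu_entries[nr_cpu_ids]);

        struct cpus_entries *entries = calloc(1, size);   /* kzalloc() */
        if (!entries)
                return 1;

        for (int cpu = 0; cpu < nr_cpu_ids; cpu++) {
                entries->cpu_entries[cpu] = malloc(64);   /* per-CPU buffer */
                if (!entries->cpu_entries[cpu])
                        goto fail;
        }

        printf("header %zu bytes, %d per-cpu buffers\n", size, nr_cpu_ids);
        /* The kernel tears this down through call_rcu() so that readers which
         * already dereferenced the pointer can finish safely. */
        return 0;

fail:
        for (int cpu = 0; cpu < nr_cpu_ids; cpu++)
                free(entries->cpu_entries[cpu]);
        free(entries);
        return 1;
}
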
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 301079d06f2..0f857782d06 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -25,7 +25,6 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h> 27#include <linux/device.h>
28#include <linux/export.h>
29#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
30#include <linux/hardirq.h> 29#include <linux/hardirq.h>
31#include <linux/rculist.h> 30#include <linux/rculist.h>
@@ -36,7 +35,6 @@
36#include <linux/perf_event.h> 35#include <linux/perf_event.h>
37#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
38#include <linux/hw_breakpoint.h> 37#include <linux/hw_breakpoint.h>
39#include <linux/mm_types.h>
40 38
41#include "internal.h" 39#include "internal.h"
42 40
@@ -119,13 +117,6 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
119 PERF_FLAG_FD_OUTPUT |\ 117 PERF_FLAG_FD_OUTPUT |\
120 PERF_FLAG_PID_CGROUP) 118 PERF_FLAG_PID_CGROUP)
121 119
122/*
123 * branch priv levels that need permission checks
124 */
125#define PERF_SAMPLE_BRANCH_PERM_PLM \
126 (PERF_SAMPLE_BRANCH_KERNEL |\
127 PERF_SAMPLE_BRANCH_HV)
128
129enum event_type_t { 120enum event_type_t {
130 EVENT_FLEXIBLE = 0x1, 121 EVENT_FLEXIBLE = 0x1,
131 EVENT_PINNED = 0x2, 122 EVENT_PINNED = 0x2,
@@ -136,9 +127,8 @@ enum event_type_t {
136 * perf_sched_events : >0 events exist 127 * perf_sched_events : >0 events exist
137 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 128 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
138 */ 129 */
139struct static_key_deferred perf_sched_events __read_mostly; 130struct jump_label_key perf_sched_events __read_mostly;
140static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 131static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
141static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
142 132
143static atomic_t nr_mmap_events __read_mostly; 133static atomic_t nr_mmap_events __read_mostly;
144static atomic_t nr_comm_events __read_mostly; 134static atomic_t nr_comm_events __read_mostly;
@@ -194,9 +184,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
194static void update_context_time(struct perf_event_context *ctx); 184static void update_context_time(struct perf_event_context *ctx);
195static u64 perf_event_time(struct perf_event *event); 185static u64 perf_event_time(struct perf_event *event);
196 186
197static void ring_buffer_attach(struct perf_event *event,
198 struct ring_buffer *rb);
199
200void __weak perf_event_print_debug(void) { } 187void __weak perf_event_print_debug(void) { }
201 188
202extern __weak const char *perf_pmu_name(void) 189extern __weak const char *perf_pmu_name(void)
@@ -254,9 +241,9 @@ perf_cgroup_match(struct perf_event *event)
254 return !event->cgrp || event->cgrp == cpuctx->cgrp; 241 return !event->cgrp || event->cgrp == cpuctx->cgrp;
255} 242}
256 243
257static inline bool perf_tryget_cgroup(struct perf_event *event) 244static inline void perf_get_cgroup(struct perf_event *event)
258{ 245{
259 return css_tryget(&event->cgrp->css); 246 css_get(&event->cgrp->css);
260} 247}
261 248
262static inline void perf_put_cgroup(struct perf_event *event) 249static inline void perf_put_cgroup(struct perf_event *event)
@@ -372,8 +359,6 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
372 359
373 list_for_each_entry_rcu(pmu, &pmus, entry) { 360 list_for_each_entry_rcu(pmu, &pmus, entry) {
374 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 361 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
375 if (cpuctx->unique_pmu != pmu)
376 continue; /* ensure we process each cpuctx once */
377 362
378 /* 363 /*
379 * perf_cgroup_events says at least one 364 * perf_cgroup_events says at least one
@@ -397,10 +382,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
397 382
398 if (mode & PERF_CGROUP_SWIN) { 383 if (mode & PERF_CGROUP_SWIN) {
399 WARN_ON_ONCE(cpuctx->cgrp); 384 WARN_ON_ONCE(cpuctx->cgrp);
400 /* 385 /* set cgrp before ctxsw in to
401 * set cgrp before ctxsw in to allow 386 * allow event_filter_match() to not
402 * event_filter_match() to not have to pass 387 * have to pass task around
403 * task around
404 */ 388 */
405 cpuctx->cgrp = perf_cgroup_from_task(task); 389 cpuctx->cgrp = perf_cgroup_from_task(task);
406 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 390 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
@@ -471,13 +455,14 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
471{ 455{
472 struct perf_cgroup *cgrp; 456 struct perf_cgroup *cgrp;
473 struct cgroup_subsys_state *css; 457 struct cgroup_subsys_state *css;
474 struct fd f = fdget(fd); 458 struct file *file;
475 int ret = 0; 459 int ret = 0, fput_needed;
476 460
477 if (!f.file) 461 file = fget_light(fd, &fput_needed);
462 if (!file)
478 return -EBADF; 463 return -EBADF;
479 464
480 css = cgroup_css_from_dir(f.file, perf_subsys_id); 465 css = cgroup_css_from_dir(file, perf_subsys_id);
481 if (IS_ERR(css)) { 466 if (IS_ERR(css)) {
482 ret = PTR_ERR(css); 467 ret = PTR_ERR(css);
483 goto out; 468 goto out;
@@ -487,11 +472,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
487 event->cgrp = cgrp; 472 event->cgrp = cgrp;
488 473
489 /* must be done before we fput() the file */ 474 /* must be done before we fput() the file */
490 if (!perf_tryget_cgroup(event)) { 475 perf_get_cgroup(event);
491 event->cgrp = NULL;
492 ret = -ENOENT;
493 goto out;
494 }
495 476
496 /* 477 /*
497 * all events in a group must monitor 478 * all events in a group must monitor
@@ -503,7 +484,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
503 ret = -EINVAL; 484 ret = -EINVAL;
504 } 485 }
505out: 486out:
506 fdput(f); 487 fput_light(file, fput_needed);
507 return ret; 488 return ret;
508} 489}
509 490
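
The right-hand (pre-fdget) side of the hunks above resolves the cgroup fd with fget_light(), which returns the struct file plus a fput_needed flag saying whether a reference was actually taken; every exit path must then hand that flag back to fput_light(). A hedged sketch of the calling pattern with the two kernel helpers replaced by fake bodies so it compiles standalone; only the shape of the API is meant to match.

#include <stdio.h>

/* Stand-ins for the kernel API used above: fget_light() returns the file for
 * an fd and sets *fput_needed when it took a reference that fput_light()
 * must later drop. The signatures mirror the kernel; the bodies are fakes. */
struct file { int refcount; const void *private_data; };

static struct file files[4] = { { 1, "perf-event-0" } };

static struct file *fget_light(unsigned int fd, int *fput_needed)
{
        if (fd >= 4 || !files[fd].refcount)
                return NULL;
        files[fd].refcount++;           /* single-threaded callers may skip this */
        *fput_needed = 1;
        return &files[fd];
}

static void fput_light(struct file *file, int fput_needed)
{
        if (fput_needed)
                file->refcount--;
}

static int use_fd(unsigned int fd)
{
        int fput_needed = 0;
        struct file *file = fget_light(fd, &fput_needed);

        if (!file)
                return -9;              /* -EBADF in the kernel */

        printf("using %s\n", (const char *)file->private_data);

        fput_light(file, fput_needed);  /* every exit path must do this */
        return 0;
}

int main(void)
{
        use_fd(0);                      /* valid fd */
        use_fd(3);                      /* fails like an unknown fd */
        return 0;
}
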
@@ -830,7 +811,7 @@ static void update_event_times(struct perf_event *event)
830 * here. 811 * here.
831 */ 812 */
832 if (is_cgroup_event(event)) 813 if (is_cgroup_event(event))
833 run_end = perf_cgroup_event_time(event); 814 run_end = perf_event_time(event);
834 else if (ctx->is_active) 815 else if (ctx->is_active)
835 run_end = ctx->time; 816 run_end = ctx->time;
836 else 817 else
@@ -896,9 +877,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
896 if (is_cgroup_event(event)) 877 if (is_cgroup_event(event))
897 ctx->nr_cgroups++; 878 ctx->nr_cgroups++;
898 879
899 if (has_branch_stack(event))
900 ctx->nr_branch_stack++;
901
902 list_add_rcu(&event->event_entry, &ctx->event_list); 880 list_add_rcu(&event->event_entry, &ctx->event_list);
903 if (!ctx->nr_events) 881 if (!ctx->nr_events)
904 perf_pmu_rotate_start(ctx->pmu); 882 perf_pmu_rotate_start(ctx->pmu);
@@ -1038,9 +1016,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1038 cpuctx->cgrp = NULL; 1016 cpuctx->cgrp = NULL;
1039 } 1017 }
1040 1018
1041 if (has_branch_stack(event))
1042 ctx->nr_branch_stack--;
1043
1044 ctx->nr_events--; 1019 ctx->nr_events--;
1045 if (event->attr.inherit_stat) 1020 if (event->attr.inherit_stat)
1046 ctx->nr_stat--; 1021 ctx->nr_stat--;
@@ -1151,8 +1126,6 @@ event_sched_out(struct perf_event *event,
1151 if (!is_software_event(event)) 1126 if (!is_software_event(event))
1152 cpuctx->active_oncpu--; 1127 cpuctx->active_oncpu--;
1153 ctx->nr_active--; 1128 ctx->nr_active--;
1154 if (event->attr.freq && event->attr.sample_freq)
1155 ctx->nr_freq--;
1156 if (event->attr.exclusive || !cpuctx->active_oncpu) 1129 if (event->attr.exclusive || !cpuctx->active_oncpu)
1157 cpuctx->exclusive = 0; 1130 cpuctx->exclusive = 0;
1158} 1131}
@@ -1256,7 +1229,7 @@ retry:
1256/* 1229/*
1257 * Cross CPU call to disable a performance event 1230 * Cross CPU call to disable a performance event
1258 */ 1231 */
1259int __perf_event_disable(void *info) 1232static int __perf_event_disable(void *info)
1260{ 1233{
1261 struct perf_event *event = info; 1234 struct perf_event *event = info;
1262 struct perf_event_context *ctx = event->ctx; 1235 struct perf_event_context *ctx = event->ctx;
@@ -1348,7 +1321,6 @@ retry:
1348 } 1321 }
1349 raw_spin_unlock_irq(&ctx->lock); 1322 raw_spin_unlock_irq(&ctx->lock);
1350} 1323}
1351EXPORT_SYMBOL_GPL(perf_event_disable);
1352 1324
1353static void perf_set_shadow_time(struct perf_event *event, 1325static void perf_set_shadow_time(struct perf_event *event,
1354 struct perf_event_context *ctx, 1326 struct perf_event_context *ctx,
@@ -1430,8 +1402,6 @@ event_sched_in(struct perf_event *event,
1430 if (!is_software_event(event)) 1402 if (!is_software_event(event))
1431 cpuctx->active_oncpu++; 1403 cpuctx->active_oncpu++;
1432 ctx->nr_active++; 1404 ctx->nr_active++;
1433 if (event->attr.freq && event->attr.sample_freq)
1434 ctx->nr_freq++;
1435 1405
1436 if (event->attr.exclusive) 1406 if (event->attr.exclusive)
1437 cpuctx->exclusive = 1; 1407 cpuctx->exclusive = 1;
@@ -1648,8 +1618,6 @@ perf_install_in_context(struct perf_event_context *ctx,
1648 lockdep_assert_held(&ctx->mutex); 1618 lockdep_assert_held(&ctx->mutex);
1649 1619
1650 event->ctx = ctx; 1620 event->ctx = ctx;
1651 if (event->cpu != -1)
1652 event->cpu = cpu;
1653 1621
1654 if (!task) { 1622 if (!task) {
1655 /* 1623 /*
@@ -1690,7 +1658,8 @@ retry:
1690 * Note: this works for group members as well as group leaders 1658 * Note: this works for group members as well as group leaders
1691 * since the non-leader members' sibling_lists will be empty. 1659 * since the non-leader members' sibling_lists will be empty.
1692 */ 1660 */
1693static void __perf_event_mark_enabled(struct perf_event *event) 1661static void __perf_event_mark_enabled(struct perf_event *event,
1662 struct perf_event_context *ctx)
1694{ 1663{
1695 struct perf_event *sub; 1664 struct perf_event *sub;
1696 u64 tstamp = perf_event_time(event); 1665 u64 tstamp = perf_event_time(event);
@@ -1728,7 +1697,7 @@ static int __perf_event_enable(void *info)
1728 */ 1697 */
1729 perf_cgroup_set_timestamp(current, ctx); 1698 perf_cgroup_set_timestamp(current, ctx);
1730 1699
1731 __perf_event_mark_enabled(event); 1700 __perf_event_mark_enabled(event, ctx);
1732 1701
1733 if (!event_filter_match(event)) { 1702 if (!event_filter_match(event)) {
1734 if (is_cgroup_event(event)) 1703 if (is_cgroup_event(event))
@@ -1809,7 +1778,7 @@ void perf_event_enable(struct perf_event *event)
1809 1778
1810retry: 1779retry:
1811 if (!ctx->is_active) { 1780 if (!ctx->is_active) {
1812 __perf_event_mark_enabled(event); 1781 __perf_event_mark_enabled(event, ctx);
1813 goto out; 1782 goto out;
1814 } 1783 }
1815 1784
@@ -1836,7 +1805,6 @@ retry:
1836out: 1805out:
1837 raw_spin_unlock_irq(&ctx->lock); 1806 raw_spin_unlock_irq(&ctx->lock);
1838} 1807}
1839EXPORT_SYMBOL_GPL(perf_event_enable);
1840 1808
1841int perf_event_refresh(struct perf_event *event, int refresh) 1809int perf_event_refresh(struct perf_event *event, int refresh)
1842{ 1810{
@@ -2202,10 +2170,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2202 */ 2170 */
2203 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2171 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2204 2172
2205 if (ctx->nr_events) 2173 perf_event_sched_in(cpuctx, ctx, task);
2206 cpuctx->task_ctx = ctx;
2207 2174
2208 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); 2175 cpuctx->task_ctx = ctx;
2209 2176
2210 perf_pmu_enable(ctx->pmu); 2177 perf_pmu_enable(ctx->pmu);
2211 perf_ctx_unlock(cpuctx, ctx); 2178 perf_ctx_unlock(cpuctx, ctx);
@@ -2218,66 +2185,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2218} 2185}
2219 2186
2220/* 2187/*
2221 * When sampling the branch stack in system-wide, it may be necessary
2222 * to flush the stack on context switch. This happens when the branch
2223 * stack does not tag its entries with the pid of the current task.
2224 * Otherwise it becomes impossible to associate a branch entry with a
2225 * task. This ambiguity is more likely to appear when the branch stack
2226 * supports priv level filtering and the user sets it to monitor only
2227 * at the user level (which could be a useful measurement in system-wide
2228 * mode). In that case, the risk is high of having a branch stack with
2229 * branch from multiple tasks. Flushing may mean dropping the existing
2230 * entries or stashing them somewhere in the PMU specific code layer.
2231 *
2232 * This function provides the context switch callback to the lower code
2233 * layer. It is invoked ONLY when there is at least one system-wide context
2234 * with at least one active event using taken branch sampling.
2235 */
2236static void perf_branch_stack_sched_in(struct task_struct *prev,
2237 struct task_struct *task)
2238{
2239 struct perf_cpu_context *cpuctx;
2240 struct pmu *pmu;
2241 unsigned long flags;
2242
2243 /* no need to flush branch stack if not changing task */
2244 if (prev == task)
2245 return;
2246
2247 local_irq_save(flags);
2248
2249 rcu_read_lock();
2250
2251 list_for_each_entry_rcu(pmu, &pmus, entry) {
2252 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2253
2254 /*
2255 * check if the context has at least one
2256 * event using PERF_SAMPLE_BRANCH_STACK
2257 */
2258 if (cpuctx->ctx.nr_branch_stack > 0
2259 && pmu->flush_branch_stack) {
2260
2261 pmu = cpuctx->ctx.pmu;
2262
2263 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2264
2265 perf_pmu_disable(pmu);
2266
2267 pmu->flush_branch_stack();
2268
2269 perf_pmu_enable(pmu);
2270
2271 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2272 }
2273 }
2274
2275 rcu_read_unlock();
2276
2277 local_irq_restore(flags);
2278}
2279
2280/*
2281 * Called from scheduler to add the events of the current task 2188 * Called from scheduler to add the events of the current task
2282 * with interrupts disabled. 2189 * with interrupts disabled.
2283 * 2190 *
@@ -2308,10 +2215,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2308 */ 2215 */
2309 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2216 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2310 perf_cgroup_sched_in(prev, task); 2217 perf_cgroup_sched_in(prev, task);
2311
2312 /* check for system-wide branch_stack events */
2313 if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2314 perf_branch_stack_sched_in(prev, task);
2315} 2218}
2316 2219
2317static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2220static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2387,10 +2290,7 @@ do { \
2387 return div64_u64(dividend, divisor); 2290 return div64_u64(dividend, divisor);
2388} 2291}
2389 2292
2390static DEFINE_PER_CPU(int, perf_throttled_count); 2293static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
2391static DEFINE_PER_CPU(u64, perf_throttled_seq);
2392
2393static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2394{ 2294{
2395 struct hw_perf_event *hwc = &event->hw; 2295 struct hw_perf_event *hwc = &event->hw;
2396 s64 period, sample_period; 2296 s64 period, sample_period;
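
perf_adjust_period() above recomputes hw.sample_period from how many events (`count`) arrived during the last `nsec` of wall time versus the requested sample_freq: roughly, events per second = count * 1e9 / nsec, and the new period is that rate divided by the target frequency. A small worked sketch of the arithmetic with made-up numbers; the kernel version goes through div64_u64() and extra fixed-point care that this omits.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* New sample period so that `freq` samples/sec are taken when events arrive
 * at the observed rate (count events over nsec nanoseconds). */
static uint64_t adjust_period(uint64_t nsec, uint64_t count, uint64_t freq)
{
        uint64_t rate = count * NSEC_PER_SEC / nsec;    /* events per second */
        return rate / freq;                             /* events per sample */
}

int main(void)
{
        /* 4,000,000 events in one 4 ms tick, with sample_freq = 1000 Hz */
        uint64_t period = adjust_period(4000000ULL, 4000000ULL, 1000ULL);

        printf("new sample_period = %llu events\n",
               (unsigned long long)period);             /* 1000000 */
        return 0;
}
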
@@ -2409,40 +2309,19 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
2409 hwc->sample_period = sample_period; 2309 hwc->sample_period = sample_period;
2410 2310
2411 if (local64_read(&hwc->period_left) > 8*sample_period) { 2311 if (local64_read(&hwc->period_left) > 8*sample_period) {
2412 if (disable) 2312 event->pmu->stop(event, PERF_EF_UPDATE);
2413 event->pmu->stop(event, PERF_EF_UPDATE);
2414
2415 local64_set(&hwc->period_left, 0); 2313 local64_set(&hwc->period_left, 0);
2416 2314 event->pmu->start(event, PERF_EF_RELOAD);
2417 if (disable)
2418 event->pmu->start(event, PERF_EF_RELOAD);
2419 } 2315 }
2420} 2316}
2421 2317
2422/* 2318static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2423 * combine freq adjustment with unthrottling to avoid two passes over the
2424 * events. At the same time, make sure, having freq events does not change
2425 * the rate of unthrottling as that would introduce bias.
2426 */
2427static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2428 int needs_unthr)
2429{ 2319{
2430 struct perf_event *event; 2320 struct perf_event *event;
2431 struct hw_perf_event *hwc; 2321 struct hw_perf_event *hwc;
2432 u64 now, period = TICK_NSEC; 2322 u64 interrupts, now;
2433 s64 delta; 2323 s64 delta;
2434 2324
2435 /*
2436 * only need to iterate over all events iff:
2437 * - context have events in frequency mode (needs freq adjust)
2438 * - there are events to unthrottle on this cpu
2439 */
2440 if (!(ctx->nr_freq || needs_unthr))
2441 return;
2442
2443 raw_spin_lock(&ctx->lock);
2444 perf_pmu_disable(ctx->pmu);
2445
2446 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2325 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2447 if (event->state != PERF_EVENT_STATE_ACTIVE) 2326 if (event->state != PERF_EVENT_STATE_ACTIVE)
2448 continue; 2327 continue;
@@ -2452,8 +2331,13 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2452 2331
2453 hwc = &event->hw; 2332 hwc = &event->hw;
2454 2333
2455 if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { 2334 interrupts = hwc->interrupts;
2456 hwc->interrupts = 0; 2335 hwc->interrupts = 0;
2336
2337 /*
2338 * unthrottle events on the tick
2339 */
2340 if (interrupts == MAX_INTERRUPTS) {
2457 perf_log_throttle(event, 1); 2341 perf_log_throttle(event, 1);
2458 event->pmu->start(event, 0); 2342 event->pmu->start(event, 0);
2459 } 2343 }
@@ -2461,30 +2345,14 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2461 if (!event->attr.freq || !event->attr.sample_freq) 2345 if (!event->attr.freq || !event->attr.sample_freq)
2462 continue; 2346 continue;
2463 2347
2464 /* 2348 event->pmu->read(event);
2465 * stop the event and update event->count
2466 */
2467 event->pmu->stop(event, PERF_EF_UPDATE);
2468
2469 now = local64_read(&event->count); 2349 now = local64_read(&event->count);
2470 delta = now - hwc->freq_count_stamp; 2350 delta = now - hwc->freq_count_stamp;
2471 hwc->freq_count_stamp = now; 2351 hwc->freq_count_stamp = now;
2472 2352
2473 /*
2474 * restart the event
2475 * reload only if value has changed
2476 * we have stopped the event so tell that
2477 * to perf_adjust_period() to avoid stopping it
2478 * twice.
2479 */
2480 if (delta > 0) 2353 if (delta > 0)
2481 perf_adjust_period(event, period, delta, false); 2354 perf_adjust_period(event, period, delta);
2482
2483 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2484 } 2355 }
2485
2486 perf_pmu_enable(ctx->pmu);
2487 raw_spin_unlock(&ctx->lock);
2488} 2356}
2489 2357
2490/* 2358/*
@@ -2507,6 +2375,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
2507 */ 2375 */
2508static void perf_rotate_context(struct perf_cpu_context *cpuctx) 2376static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2509{ 2377{
2378 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
2510 struct perf_event_context *ctx = NULL; 2379 struct perf_event_context *ctx = NULL;
2511 int rotate = 0, remove = 1; 2380 int rotate = 0, remove = 1;
2512 2381
@@ -2523,11 +2392,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2523 rotate = 1; 2392 rotate = 1;
2524 } 2393 }
2525 2394
2526 if (!rotate)
2527 goto done;
2528
2529 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2395 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2530 perf_pmu_disable(cpuctx->ctx.pmu); 2396 perf_pmu_disable(cpuctx->ctx.pmu);
2397 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2398 if (ctx)
2399 perf_ctx_adjust_freq(ctx, interval);
2400
2401 if (!rotate)
2402 goto done;
2531 2403
2532 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2404 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2533 if (ctx) 2405 if (ctx)
@@ -2539,33 +2411,22 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2539 2411
2540 perf_event_sched_in(cpuctx, ctx, current); 2412 perf_event_sched_in(cpuctx, ctx, current);
2541 2413
2542 perf_pmu_enable(cpuctx->ctx.pmu);
2543 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2544done: 2414done:
2545 if (remove) 2415 if (remove)
2546 list_del_init(&cpuctx->rotation_list); 2416 list_del_init(&cpuctx->rotation_list);
2417
2418 perf_pmu_enable(cpuctx->ctx.pmu);
2419 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2547} 2420}
2548 2421
2549void perf_event_task_tick(void) 2422void perf_event_task_tick(void)
2550{ 2423{
2551 struct list_head *head = &__get_cpu_var(rotation_list); 2424 struct list_head *head = &__get_cpu_var(rotation_list);
2552 struct perf_cpu_context *cpuctx, *tmp; 2425 struct perf_cpu_context *cpuctx, *tmp;
2553 struct perf_event_context *ctx;
2554 int throttled;
2555 2426
2556 WARN_ON(!irqs_disabled()); 2427 WARN_ON(!irqs_disabled());
2557 2428
2558 __this_cpu_inc(perf_throttled_seq);
2559 throttled = __this_cpu_xchg(perf_throttled_count, 0);
2560
2561 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { 2429 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2562 ctx = &cpuctx->ctx;
2563 perf_adjust_freq_unthr_context(ctx, throttled);
2564
2565 ctx = cpuctx->task_ctx;
2566 if (ctx)
2567 perf_adjust_freq_unthr_context(ctx, throttled);
2568
2569 if (cpuctx->jiffies_interval == 1 || 2430 if (cpuctx->jiffies_interval == 1 ||
2570 !(jiffies % cpuctx->jiffies_interval)) 2431 !(jiffies % cpuctx->jiffies_interval))
2571 perf_rotate_context(cpuctx); 2432 perf_rotate_context(cpuctx);
@@ -2582,7 +2443,7 @@ static int event_enable_on_exec(struct perf_event *event,
2582 if (event->state >= PERF_EVENT_STATE_INACTIVE) 2443 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2583 return 0; 2444 return 0;
2584 2445
2585 __perf_event_mark_enabled(event); 2446 __perf_event_mark_enabled(event, ctx);
2586 2447
2587 return 1; 2448 return 1;
2588} 2449}
@@ -2614,7 +2475,13 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2614 raw_spin_lock(&ctx->lock); 2475 raw_spin_lock(&ctx->lock);
2615 task_ctx_sched_out(ctx); 2476 task_ctx_sched_out(ctx);
2616 2477
2617 list_for_each_entry(event, &ctx->event_list, event_entry) { 2478 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2479 ret = event_enable_on_exec(event, ctx);
2480 if (ret)
2481 enabled = 1;
2482 }
2483
2484 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2618 ret = event_enable_on_exec(event, ctx); 2485 ret = event_enable_on_exec(event, ctx);
2619 if (ret) 2486 if (ret)
2620 enabled = 1; 2487 enabled = 1;
@@ -2702,6 +2569,215 @@ static u64 perf_event_read(struct perf_event *event)
2702} 2569}
2703 2570
2704/* 2571/*
2572 * Callchain support
2573 */
2574
2575struct callchain_cpus_entries {
2576 struct rcu_head rcu_head;
2577 struct perf_callchain_entry *cpu_entries[0];
2578};
2579
2580static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
2581static atomic_t nr_callchain_events;
2582static DEFINE_MUTEX(callchain_mutex);
2583struct callchain_cpus_entries *callchain_cpus_entries;
2584
2585
2586__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2587 struct pt_regs *regs)
2588{
2589}
2590
2591__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2592 struct pt_regs *regs)
2593{
2594}
2595
2596static void release_callchain_buffers_rcu(struct rcu_head *head)
2597{
2598 struct callchain_cpus_entries *entries;
2599 int cpu;
2600
2601 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2602
2603 for_each_possible_cpu(cpu)
2604 kfree(entries->cpu_entries[cpu]);
2605
2606 kfree(entries);
2607}
2608
2609static void release_callchain_buffers(void)
2610{
2611 struct callchain_cpus_entries *entries;
2612
2613 entries = callchain_cpus_entries;
2614 rcu_assign_pointer(callchain_cpus_entries, NULL);
2615 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2616}
2617
2618static int alloc_callchain_buffers(void)
2619{
2620 int cpu;
2621 int size;
2622 struct callchain_cpus_entries *entries;
2623
2624 /*
2625 * We can't use the percpu allocation API for data that can be
2626 * accessed from NMI. Use a temporary manual per cpu allocation
2627 * until that gets sorted out.
2628 */
2629 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2630
2631 entries = kzalloc(size, GFP_KERNEL);
2632 if (!entries)
2633 return -ENOMEM;
2634
2635 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
2636
2637 for_each_possible_cpu(cpu) {
2638 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2639 cpu_to_node(cpu));
2640 if (!entries->cpu_entries[cpu])
2641 goto fail;
2642 }
2643
2644 rcu_assign_pointer(callchain_cpus_entries, entries);
2645
2646 return 0;
2647
2648fail:
2649 for_each_possible_cpu(cpu)
2650 kfree(entries->cpu_entries[cpu]);
2651 kfree(entries);
2652
2653 return -ENOMEM;
2654}
2655
2656static int get_callchain_buffers(void)
2657{
2658 int err = 0;
2659 int count;
2660
2661 mutex_lock(&callchain_mutex);
2662
2663 count = atomic_inc_return(&nr_callchain_events);
2664 if (WARN_ON_ONCE(count < 1)) {
2665 err = -EINVAL;
2666 goto exit;
2667 }
2668
2669 if (count > 1) {
2670 /* If the allocation failed, give up */
2671 if (!callchain_cpus_entries)
2672 err = -ENOMEM;
2673 goto exit;
2674 }
2675
2676 err = alloc_callchain_buffers();
2677 if (err)
2678 release_callchain_buffers();
2679exit:
2680 mutex_unlock(&callchain_mutex);
2681
2682 return err;
2683}
2684
2685static void put_callchain_buffers(void)
2686{
2687 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2688 release_callchain_buffers();
2689 mutex_unlock(&callchain_mutex);
2690 }
2691}
2692
2693static int get_recursion_context(int *recursion)
2694{
2695 int rctx;
2696
2697 if (in_nmi())
2698 rctx = 3;
2699 else if (in_irq())
2700 rctx = 2;
2701 else if (in_softirq())
2702 rctx = 1;
2703 else
2704 rctx = 0;
2705
2706 if (recursion[rctx])
2707 return -1;
2708
2709 recursion[rctx]++;
2710 barrier();
2711
2712 return rctx;
2713}
2714
2715static inline void put_recursion_context(int *recursion, int rctx)
2716{
2717 barrier();
2718 recursion[rctx]--;
2719}
2720
2721static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2722{
2723 int cpu;
2724 struct callchain_cpus_entries *entries;
2725
2726 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2727 if (*rctx == -1)
2728 return NULL;
2729
2730 entries = rcu_dereference(callchain_cpus_entries);
2731 if (!entries)
2732 return NULL;
2733
2734 cpu = smp_processor_id();
2735
2736 return &entries->cpu_entries[cpu][*rctx];
2737}
2738
2739static void
2740put_callchain_entry(int rctx)
2741{
2742 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2743}
2744
2745static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2746{
2747 int rctx;
2748 struct perf_callchain_entry *entry;
2749
2750
2751 entry = get_callchain_entry(&rctx);
2752 if (rctx == -1)
2753 return NULL;
2754
2755 if (!entry)
2756 goto exit_put;
2757
2758 entry->nr = 0;
2759
2760 if (!user_mode(regs)) {
2761 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2762 perf_callchain_kernel(entry, regs);
2763 if (current->mm)
2764 regs = task_pt_regs(current);
2765 else
2766 regs = NULL;
2767 }
2768
2769 if (regs) {
2770 perf_callchain_store(entry, PERF_CONTEXT_USER);
2771 perf_callchain_user(entry, regs);
2772 }
2773
2774exit_put:
2775 put_callchain_entry(rctx);
2776
2777 return entry;
2778}
2779
2780/*
2705 * Initialize the perf_event context in a task_struct: 2781 * Initialize the perf_event context in a task_struct:
2706 */ 2782 */
2707static void __perf_event_init_context(struct perf_event_context *ctx) 2783static void __perf_event_init_context(struct perf_event_context *ctx)
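
get_recursion_context()/put_recursion_context() above pick one of four per-CPU slots (task, softirq, hardirq, NMI) and refuse to nest within the same slot, so a callchain capture that interrupts another capture at the same level simply bails out. A hedged userspace sketch of that guard, with the in_nmi()/in_irq()/in_softirq() detection replaced by an explicit argument and the per-CPU array reduced to a single global:

#include <stdio.h>

/* Slot 0 = task, 1 = softirq, 2 = hardirq, 3 = NMI, as in the code above. */
static int recursion[4];

static int get_recursion_context(int rctx)
{
        if (recursion[rctx])
                return -1;              /* already capturing at this level */
        recursion[rctx]++;
        return rctx;
}

static void put_recursion_context(int rctx)
{
        recursion[rctx]--;
}

int main(void)
{
        int a = get_recursion_context(2);   /* hardirq capture starts */
        int b = get_recursion_context(2);   /* nested hardirq capture: refused */
        int c = get_recursion_context(3);   /* NMI on top of it: fine */

        printf("a=%d b=%d c=%d\n", a, b, c);  /* a=2 b=-1 c=3 */

        if (c >= 0)
                put_recursion_context(c);
        if (a >= 0)
                put_recursion_context(a);
        return 0;
}
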
@@ -2865,7 +2941,7 @@ static void free_event(struct perf_event *event)
2865 2941
2866 if (!event->parent) { 2942 if (!event->parent) {
2867 if (event->attach_state & PERF_ATTACH_TASK) 2943 if (event->attach_state & PERF_ATTACH_TASK)
2868 static_key_slow_dec_deferred(&perf_sched_events); 2944 jump_label_dec(&perf_sched_events);
2869 if (event->attr.mmap || event->attr.mmap_data) 2945 if (event->attr.mmap || event->attr.mmap_data)
2870 atomic_dec(&nr_mmap_events); 2946 atomic_dec(&nr_mmap_events);
2871 if (event->attr.comm) 2947 if (event->attr.comm)
@@ -2876,15 +2952,7 @@ static void free_event(struct perf_event *event)
2876 put_callchain_buffers(); 2952 put_callchain_buffers();
2877 if (is_cgroup_event(event)) { 2953 if (is_cgroup_event(event)) {
2878 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2954 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2879 static_key_slow_dec_deferred(&perf_sched_events); 2955 jump_label_dec(&perf_sched_events);
2880 }
2881
2882 if (has_branch_stack(event)) {
2883 static_key_slow_dec_deferred(&perf_sched_events);
2884 /* is system-wide event */
2885 if (!(event->attach_state & PERF_ATTACH_TASK))
2886 atomic_dec(&per_cpu(perf_branch_stack_events,
2887 event->cpu));
2888 } 2956 }
2889 } 2957 }
2890 2958
@@ -2938,12 +3006,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2938/* 3006/*
2939 * Called when the last reference to the file is gone. 3007 * Called when the last reference to the file is gone.
2940 */ 3008 */
2941static void put_event(struct perf_event *event) 3009static int perf_release(struct inode *inode, struct file *file)
2942{ 3010{
3011 struct perf_event *event = file->private_data;
2943 struct task_struct *owner; 3012 struct task_struct *owner;
2944 3013
2945 if (!atomic_long_dec_and_test(&event->refcount)) 3014 file->private_data = NULL;
2946 return;
2947 3015
2948 rcu_read_lock(); 3016 rcu_read_lock();
2949 owner = ACCESS_ONCE(event->owner); 3017 owner = ACCESS_ONCE(event->owner);
@@ -2978,13 +3046,7 @@ static void put_event(struct perf_event *event)
2978 put_task_struct(owner); 3046 put_task_struct(owner);
2979 } 3047 }
2980 3048
2981 perf_event_release_kernel(event); 3049 return perf_event_release_kernel(event);
2982}
2983
2984static int perf_release(struct inode *inode, struct file *file)
2985{
2986 put_event(file->private_data);
2987 return 0;
2988} 3050}
2989 3051
2990u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 3052u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -3127,33 +3189,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3127 struct ring_buffer *rb; 3189 struct ring_buffer *rb;
3128 unsigned int events = POLL_HUP; 3190 unsigned int events = POLL_HUP;
3129 3191
3130 /*
3131 * Race between perf_event_set_output() and perf_poll(): perf_poll()
3132 * grabs the rb reference but perf_event_set_output() overrides it.
3133 * Here is the timeline for two threads T1, T2:
3134 * t0: T1, rb = rcu_dereference(event->rb)
3135 * t1: T2, old_rb = event->rb
3136 * t2: T2, event->rb = new rb
3137 * t3: T2, ring_buffer_detach(old_rb)
3138 * t4: T1, ring_buffer_attach(rb1)
3139 * t5: T1, poll_wait(event->waitq)
3140 *
3141 * To avoid this problem, we grab mmap_mutex in perf_poll()
3142 * thereby ensuring that the assignment of the new ring buffer
3143 * and the detachment of the old buffer appear atomic to perf_poll()
3144 */
3145 mutex_lock(&event->mmap_mutex);
3146
3147 rcu_read_lock(); 3192 rcu_read_lock();
3148 rb = rcu_dereference(event->rb); 3193 rb = rcu_dereference(event->rb);
3149 if (rb) { 3194 if (rb)
3150 ring_buffer_attach(event, rb);
3151 events = atomic_xchg(&rb->poll, 0); 3195 events = atomic_xchg(&rb->poll, 0);
3152 }
3153 rcu_read_unlock(); 3196 rcu_read_unlock();
3154 3197
3155 mutex_unlock(&event->mmap_mutex);
3156
3157 poll_wait(file, &event->waitq, wait); 3198 poll_wait(file, &event->waitq, wait);
3158 3199
3159 return events; 3200 return events;
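
The removed comment above spells out the T1/T2 timeline: perf_poll() dereferences event->rb while perf_event_set_output() may swap in a new ring buffer and detach the old one, so the left-hand code takes event->mmap_mutex around the dereference-and-attach to make the swap appear atomic to the poller. A hedged pthreads sketch of that shape (compile with -pthread); the kernel additionally keeps RCU for lockless readers, which this leaves out, and all names are illustrative.

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for the kernel objects: the event publishes a ring-buffer
 * pointer, and mmap_mutex plays the role the removed comment gives it. */
struct ring_buffer { int id; };

struct event {
        struct ring_buffer *rb;
        pthread_mutex_t mmap_mutex;
};

/* Poll side: look up and "attach" to the current buffer under the mutex. */
static void poll_event(struct event *ev)
{
        pthread_mutex_lock(&ev->mmap_mutex);
        if (ev->rb)
                printf("poll: attached to rb %d\n", ev->rb->id);
        pthread_mutex_unlock(&ev->mmap_mutex);
}

/* set_output side: detach the old buffer and publish the new one under the
 * same mutex, so a concurrent poller never attaches to a buffer that is in
 * the middle of being detached (the t0..t5 timeline above). */
static void set_output(struct event *ev, struct ring_buffer *new_rb)
{
        pthread_mutex_lock(&ev->mmap_mutex);
        if (ev->rb)
                printf("set_output: detached rb %d\n", ev->rb->id);
        ev->rb = new_rb;
        pthread_mutex_unlock(&ev->mmap_mutex);
}

int main(void)
{
        struct ring_buffer rb1 = { 1 }, rb2 = { 2 };
        struct event ev = { &rb1, PTHREAD_MUTEX_INITIALIZER };

        poll_event(&ev);        /* in the kernel this races with set_output() */
        set_output(&ev, &rb2);
        poll_event(&ev);
        return 0;
}
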
@@ -3196,8 +3237,9 @@ static void perf_event_for_each(struct perf_event *event,
3196 event = event->group_leader; 3237 event = event->group_leader;
3197 3238
3198 perf_event_for_each_child(event, func); 3239 perf_event_for_each_child(event, func);
3240 func(event);
3199 list_for_each_entry(sibling, &event->sibling_list, group_entry) 3241 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3200 perf_event_for_each_child(sibling, func); 3242 perf_event_for_each_child(event, func);
3201 mutex_unlock(&ctx->mutex); 3243 mutex_unlock(&ctx->mutex);
3202} 3244}
3203 3245
@@ -3236,18 +3278,21 @@ unlock:
3236 3278
3237static const struct file_operations perf_fops; 3279static const struct file_operations perf_fops;
3238 3280
3239static inline int perf_fget_light(int fd, struct fd *p) 3281static struct perf_event *perf_fget_light(int fd, int *fput_needed)
3240{ 3282{
3241 struct fd f = fdget(fd); 3283 struct file *file;
3242 if (!f.file)
3243 return -EBADF;
3244 3284
3245 if (f.file->f_op != &perf_fops) { 3285 file = fget_light(fd, fput_needed);
3246 fdput(f); 3286 if (!file)
3247 return -EBADF; 3287 return ERR_PTR(-EBADF);
3288
3289 if (file->f_op != &perf_fops) {
3290 fput_light(file, *fput_needed);
3291 *fput_needed = 0;
3292 return ERR_PTR(-EBADF);
3248 } 3293 }
3249 *p = f; 3294
3250 return 0; 3295 return file->private_data;
3251} 3296}
3252 3297
3253static int perf_event_set_output(struct perf_event *event, 3298static int perf_event_set_output(struct perf_event *event,
@@ -3279,19 +3324,20 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3279 3324
3280 case PERF_EVENT_IOC_SET_OUTPUT: 3325 case PERF_EVENT_IOC_SET_OUTPUT:
3281 { 3326 {
3327 struct perf_event *output_event = NULL;
3328 int fput_needed = 0;
3282 int ret; 3329 int ret;
3330
3283 if (arg != -1) { 3331 if (arg != -1) {
3284 struct perf_event *output_event; 3332 output_event = perf_fget_light(arg, &fput_needed);
3285 struct fd output; 3333 if (IS_ERR(output_event))
3286 ret = perf_fget_light(arg, &output); 3334 return PTR_ERR(output_event);
3287 if (ret)
3288 return ret;
3289 output_event = output.file->private_data;
3290 ret = perf_event_set_output(event, output_event);
3291 fdput(output);
3292 } else {
3293 ret = perf_event_set_output(event, NULL);
3294 } 3335 }
3336
3337 ret = perf_event_set_output(event, output_event);
3338 if (output_event)
3339 fput_light(output_event->filp, fput_needed);
3340
3295 return ret; 3341 return ret;
3296 } 3342 }
3297 3343
@@ -3334,6 +3380,10 @@ int perf_event_task_disable(void)
3334 return 0; 3380 return 0;
3335} 3381}
3336 3382
3383#ifndef PERF_EVENT_INDEX_OFFSET
3384# define PERF_EVENT_INDEX_OFFSET 0
3385#endif
3386
3337static int perf_event_index(struct perf_event *event) 3387static int perf_event_index(struct perf_event *event)
3338{ 3388{
3339 if (event->hw.state & PERF_HES_STOPPED) 3389 if (event->hw.state & PERF_HES_STOPPED)
@@ -3342,26 +3392,21 @@ static int perf_event_index(struct perf_event *event)
3342 if (event->state != PERF_EVENT_STATE_ACTIVE) 3392 if (event->state != PERF_EVENT_STATE_ACTIVE)
3343 return 0; 3393 return 0;
3344 3394
3345 return event->pmu->event_idx(event); 3395 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3346} 3396}
3347 3397
3348static void calc_timer_values(struct perf_event *event, 3398static void calc_timer_values(struct perf_event *event,
3349 u64 *now,
3350 u64 *enabled, 3399 u64 *enabled,
3351 u64 *running) 3400 u64 *running)
3352{ 3401{
3353 u64 ctx_time; 3402 u64 now, ctx_time;
3354 3403
3355 *now = perf_clock(); 3404 now = perf_clock();
3356 ctx_time = event->shadow_ctx_time + *now; 3405 ctx_time = event->shadow_ctx_time + now;
3357 *enabled = ctx_time - event->tstamp_enabled; 3406 *enabled = ctx_time - event->tstamp_enabled;
3358 *running = ctx_time - event->tstamp_running; 3407 *running = ctx_time - event->tstamp_running;
3359} 3408}
3360 3409
3361void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3362{
3363}
3364
3365/* 3410/*
3366 * Callers need to ensure there can be no nesting of this function, otherwise 3411 * Callers need to ensure there can be no nesting of this function, otherwise
3367 * the seqlock logic goes bad. We can not serialize this because the arch 3412 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3371,7 +3416,7 @@ void perf_event_update_userpage(struct perf_event *event)
3371{ 3416{
3372 struct perf_event_mmap_page *userpg; 3417 struct perf_event_mmap_page *userpg;
3373 struct ring_buffer *rb; 3418 struct ring_buffer *rb;
3374 u64 enabled, running, now; 3419 u64 enabled, running;
3375 3420
3376 rcu_read_lock(); 3421 rcu_read_lock();
3377 /* 3422 /*
@@ -3383,7 +3428,7 @@ void perf_event_update_userpage(struct perf_event *event)
3383 * because of locking issue as we can be called in 3428 * because of locking issue as we can be called in
3384 * NMI context 3429 * NMI context
3385 */ 3430 */
3386 calc_timer_values(event, &now, &enabled, &running); 3431 calc_timer_values(event, &enabled, &running);
3387 rb = rcu_dereference(event->rb); 3432 rb = rcu_dereference(event->rb);
3388 if (!rb) 3433 if (!rb)
3389 goto unlock; 3434 goto unlock;
@@ -3399,7 +3444,7 @@ void perf_event_update_userpage(struct perf_event *event)
3399 barrier(); 3444 barrier();
3400 userpg->index = perf_event_index(event); 3445 userpg->index = perf_event_index(event);
3401 userpg->offset = perf_event_count(event); 3446 userpg->offset = perf_event_count(event);
3402 if (userpg->index) 3447 if (event->state == PERF_EVENT_STATE_ACTIVE)
3403 userpg->offset -= local64_read(&event->hw.prev_count); 3448 userpg->offset -= local64_read(&event->hw.prev_count);
3404 3449
3405 userpg->time_enabled = enabled + 3450 userpg->time_enabled = enabled +
@@ -3408,8 +3453,6 @@ void perf_event_update_userpage(struct perf_event *event)
3408 userpg->time_running = running + 3453 userpg->time_running = running +
3409 atomic64_read(&event->child_total_time_running); 3454 atomic64_read(&event->child_total_time_running);
3410 3455
3411 arch_perf_update_userpage(userpg, now);
3412
3413 barrier(); 3456 barrier();
3414 ++userpg->lock; 3457 ++userpg->lock;
3415 preempt_enable(); 3458 preempt_enable();
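
Aside (not part of the diff): the ++userpg->lock / barrier() pairs in this hunk form a sequence count, so user space can read the mmap'ed page without taking a lock and simply retry if a write was in flight. The user-space analogue below follows the standard C11 seqlock recipe, with atomics and fences standing in for the kernel's barrier() and ordering conventions; the struct and field names are invented, and it assumes a single, non-nesting writer, as the comment in the surrounding function warns.

#include <stdatomic.h>
#include <stdio.h>

/* Single-writer sequence count: the writer bumps 'seq' to an odd value,
 * publishes the payload, then bumps it back to even; readers retry if they
 * saw an odd count or a count that changed while they were reading. */
struct sample_page {
        atomic_uint  seq;
        atomic_ulong index;
        atomic_ulong offset;
};

static void writer_update(struct sample_page *p, unsigned long idx,
                          unsigned long off)
{
        unsigned int s = atomic_load_explicit(&p->seq, memory_order_relaxed);

        atomic_store_explicit(&p->seq, s + 1, memory_order_relaxed); /* odd: update in progress */
        atomic_thread_fence(memory_order_release);

        atomic_store_explicit(&p->index,  idx, memory_order_relaxed);
        atomic_store_explicit(&p->offset, off, memory_order_relaxed);

        atomic_store_explicit(&p->seq, s + 2, memory_order_release); /* even: update complete */
}

static void reader_snapshot(struct sample_page *p,
                            unsigned long *idx, unsigned long *off)
{
        unsigned int s1, s2;

        do {
                s1   = atomic_load_explicit(&p->seq, memory_order_acquire);
                *idx = atomic_load_explicit(&p->index,  memory_order_relaxed);
                *off = atomic_load_explicit(&p->offset, memory_order_relaxed);
                atomic_thread_fence(memory_order_acquire);
                s2   = atomic_load_explicit(&p->seq, memory_order_relaxed);
        } while ((s1 & 1) || s1 != s2);        /* retry on a torn read */
}

int main(void)
{
        struct sample_page p = { 0 };
        unsigned long idx, off;

        writer_update(&p, 3, 4096);
        reader_snapshot(&p, &idx, &off);
        printf("index=%lu offset=%lu\n", idx, off);
        return 0;
}
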
@@ -3452,53 +3495,6 @@ unlock:
3452 return ret; 3495 return ret;
3453} 3496}
3454 3497
3455static void ring_buffer_attach(struct perf_event *event,
3456 struct ring_buffer *rb)
3457{
3458 unsigned long flags;
3459
3460 if (!list_empty(&event->rb_entry))
3461 return;
3462
3463 spin_lock_irqsave(&rb->event_lock, flags);
3464 if (!list_empty(&event->rb_entry))
3465 goto unlock;
3466
3467 list_add(&event->rb_entry, &rb->event_list);
3468unlock:
3469 spin_unlock_irqrestore(&rb->event_lock, flags);
3470}
3471
3472static void ring_buffer_detach(struct perf_event *event,
3473 struct ring_buffer *rb)
3474{
3475 unsigned long flags;
3476
3477 if (list_empty(&event->rb_entry))
3478 return;
3479
3480 spin_lock_irqsave(&rb->event_lock, flags);
3481 list_del_init(&event->rb_entry);
3482 wake_up_all(&event->waitq);
3483 spin_unlock_irqrestore(&rb->event_lock, flags);
3484}
3485
3486static void ring_buffer_wakeup(struct perf_event *event)
3487{
3488 struct ring_buffer *rb;
3489
3490 rcu_read_lock();
3491 rb = rcu_dereference(event->rb);
3492 if (!rb)
3493 goto unlock;
3494
3495 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3496 wake_up_all(&event->waitq);
3497
3498unlock:
3499 rcu_read_unlock();
3500}
3501
3502static void rb_free_rcu(struct rcu_head *rcu_head) 3498static void rb_free_rcu(struct rcu_head *rcu_head)
3503{ 3499{
3504 struct ring_buffer *rb; 3500 struct ring_buffer *rb;
@@ -3524,19 +3520,9 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3524 3520
3525static void ring_buffer_put(struct ring_buffer *rb) 3521static void ring_buffer_put(struct ring_buffer *rb)
3526{ 3522{
3527 struct perf_event *event, *n;
3528 unsigned long flags;
3529
3530 if (!atomic_dec_and_test(&rb->refcount)) 3523 if (!atomic_dec_and_test(&rb->refcount))
3531 return; 3524 return;
3532 3525
3533 spin_lock_irqsave(&rb->event_lock, flags);
3534 list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
3535 list_del_init(&event->rb_entry);
3536 wake_up_all(&event->waitq);
3537 }
3538 spin_unlock_irqrestore(&rb->event_lock, flags);
3539
3540 call_rcu(&rb->rcu_head, rb_free_rcu); 3526 call_rcu(&rb->rcu_head, rb_free_rcu);
3541} 3527}
3542 3528
@@ -3557,9 +3543,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3557 struct ring_buffer *rb = event->rb; 3543 struct ring_buffer *rb = event->rb;
3558 3544
3559 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3545 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3560 vma->vm_mm->pinned_vm -= event->mmap_locked; 3546 vma->vm_mm->locked_vm -= event->mmap_locked;
3561 rcu_assign_pointer(event->rb, NULL); 3547 rcu_assign_pointer(event->rb, NULL);
3562 ring_buffer_detach(event, rb);
3563 mutex_unlock(&event->mmap_mutex); 3548 mutex_unlock(&event->mmap_mutex);
3564 3549
3565 ring_buffer_put(rb); 3550 ring_buffer_put(rb);
@@ -3639,7 +3624,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3639 3624
3640 lock_limit = rlimit(RLIMIT_MEMLOCK); 3625 lock_limit = rlimit(RLIMIT_MEMLOCK);
3641 lock_limit >>= PAGE_SHIFT; 3626 lock_limit >>= PAGE_SHIFT;
3642 locked = vma->vm_mm->pinned_vm + extra; 3627 locked = vma->vm_mm->locked_vm + extra;
3643 3628
3644 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && 3629 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3645 !capable(CAP_IPC_LOCK)) { 3630 !capable(CAP_IPC_LOCK)) {
@@ -3665,16 +3650,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3665 atomic_long_add(user_extra, &user->locked_vm); 3650 atomic_long_add(user_extra, &user->locked_vm);
3666 event->mmap_locked = extra; 3651 event->mmap_locked = extra;
3667 event->mmap_user = get_current_user(); 3652 event->mmap_user = get_current_user();
3668 vma->vm_mm->pinned_vm += event->mmap_locked; 3653 vma->vm_mm->locked_vm += event->mmap_locked;
3669
3670 perf_event_update_userpage(event);
3671 3654
3672unlock: 3655unlock:
3673 if (!ret) 3656 if (!ret)
3674 atomic_inc(&event->mmap_count); 3657 atomic_inc(&event->mmap_count);
3675 mutex_unlock(&event->mmap_mutex); 3658 mutex_unlock(&event->mmap_mutex);
3676 3659
3677 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3660 vma->vm_flags |= VM_RESERVED;
3678 vma->vm_ops = &perf_mmap_vmops; 3661 vma->vm_ops = &perf_mmap_vmops;
3679 3662
3680 return ret; 3663 return ret;
@@ -3716,7 +3699,7 @@ static const struct file_operations perf_fops = {
3716 3699
3717void perf_event_wakeup(struct perf_event *event) 3700void perf_event_wakeup(struct perf_event *event)
3718{ 3701{
3719 ring_buffer_wakeup(event); 3702 wake_up_all(&event->waitq);
3720 3703
3721 if (event->pending_kill) { 3704 if (event->pending_kill) {
3722 kill_fasync(&event->fasync, SIGIO, event->pending_kill); 3705 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
@@ -3761,132 +3744,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3761} 3744}
3762EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 3745EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3763 3746
3764static void
3765perf_output_sample_regs(struct perf_output_handle *handle,
3766 struct pt_regs *regs, u64 mask)
3767{
3768 int bit;
3769
3770 for_each_set_bit(bit, (const unsigned long *) &mask,
3771 sizeof(mask) * BITS_PER_BYTE) {
3772 u64 val;
3773
3774 val = perf_reg_value(regs, bit);
3775 perf_output_put(handle, val);
3776 }
3777}
3778
3779static void perf_sample_regs_user(struct perf_regs_user *regs_user,
3780 struct pt_regs *regs)
3781{
3782 if (!user_mode(regs)) {
3783 if (current->mm)
3784 regs = task_pt_regs(current);
3785 else
3786 regs = NULL;
3787 }
3788
3789 if (regs) {
3790 regs_user->regs = regs;
3791 regs_user->abi = perf_reg_abi(current);
3792 }
3793}
3794
3795/*
3796 * Get remaining task size from user stack pointer.
3797 *
3798 * It'd be better to take stack vma map and limit this more
3799 * precisly, but there's no way to get it safely under interrupt,
3800 * so using TASK_SIZE as limit.
3801 */
3802static u64 perf_ustack_task_size(struct pt_regs *regs)
3803{
3804 unsigned long addr = perf_user_stack_pointer(regs);
3805
3806 if (!addr || addr >= TASK_SIZE)
3807 return 0;
3808
3809 return TASK_SIZE - addr;
3810}
3811
3812static u16
3813perf_sample_ustack_size(u16 stack_size, u16 header_size,
3814 struct pt_regs *regs)
3815{
3816 u64 task_size;
3817
3818 /* No regs, no stack pointer, no dump. */
3819 if (!regs)
3820 return 0;
3821
3822 /*
3823 * Check if we fit in with the requested stack size into the:
3824 * - TASK_SIZE
3825 * If we don't, we limit the size to the TASK_SIZE.
3826 *
3827 * - remaining sample size
3828 * If we don't, we customize the stack size to
3829 * fit in to the remaining sample size.
3830 */
3831
3832 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
3833 stack_size = min(stack_size, (u16) task_size);
3834
3835 /* Current header size plus static size and dynamic size. */
3836 header_size += 2 * sizeof(u64);
3837
3838 /* Do we fit in with the current stack dump size? */
3839 if ((u16) (header_size + stack_size) < header_size) {
3840 /*
3841 * If we overflow the maximum size for the sample,
3842 * we customize the stack dump size to fit in.
3843 */
3844 stack_size = USHRT_MAX - header_size - sizeof(u64);
3845 stack_size = round_up(stack_size, sizeof(u64));
3846 }
3847
3848 return stack_size;
3849}
3850
3851static void
3852perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
3853 struct pt_regs *regs)
3854{
3855 /* Case of a kernel thread, nothing to dump */
3856 if (!regs) {
3857 u64 size = 0;
3858 perf_output_put(handle, size);
3859 } else {
3860 unsigned long sp;
3861 unsigned int rem;
3862 u64 dyn_size;
3863
3864 /*
3865 * We dump:
3866 * static size
3867 * - the size requested by user or the best one we can fit
3868 * in to the sample max size
3869 * data
3870 * - user stack dump data
3871 * dynamic size
3872 * - the actual dumped size
3873 */
3874
3875 /* Static size. */
3876 perf_output_put(handle, dump_size);
3877
3878 /* Data. */
3879 sp = perf_user_stack_pointer(regs);
3880 rem = __output_copy_user(handle, (void *) sp, dump_size);
3881 dyn_size = dump_size - rem;
3882
3883 perf_output_skip(handle, rem);
3884
3885 /* Dynamic size. */
3886 perf_output_put(handle, dyn_size);
3887 }
3888}
3889
3890static void __perf_event_header__init_id(struct perf_event_header *header, 3747static void __perf_event_header__init_id(struct perf_event_header *header,
3891 struct perf_sample_data *data, 3748 struct perf_sample_data *data,
3892 struct perf_event *event) 3749 struct perf_event *event)
@@ -4026,7 +3883,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4026static void perf_output_read(struct perf_output_handle *handle, 3883static void perf_output_read(struct perf_output_handle *handle,
4027 struct perf_event *event) 3884 struct perf_event *event)
4028{ 3885{
4029 u64 enabled = 0, running = 0, now; 3886 u64 enabled = 0, running = 0;
4030 u64 read_format = event->attr.read_format; 3887 u64 read_format = event->attr.read_format;
4031 3888
4032 /* 3889 /*
@@ -4039,7 +3896,7 @@ static void perf_output_read(struct perf_output_handle *handle,
4039 * NMI context 3896 * NMI context
4040 */ 3897 */
4041 if (read_format & PERF_FORMAT_TOTAL_TIMES) 3898 if (read_format & PERF_FORMAT_TOTAL_TIMES)
4042 calc_timer_values(event, &now, &enabled, &running); 3899 calc_timer_values(event, &enabled, &running);
4043 3900
4044 if (event->attr.read_format & PERF_FORMAT_GROUP) 3901 if (event->attr.read_format & PERF_FORMAT_GROUP)
4045 perf_output_read_group(handle, event, enabled, running); 3902 perf_output_read_group(handle, event, enabled, running);
@@ -4129,46 +3986,6 @@ void perf_output_sample(struct perf_output_handle *handle,
4129 } 3986 }
4130 } 3987 }
4131 } 3988 }
4132
4133 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4134 if (data->br_stack) {
4135 size_t size;
4136
4137 size = data->br_stack->nr
4138 * sizeof(struct perf_branch_entry);
4139
4140 perf_output_put(handle, data->br_stack->nr);
4141 perf_output_copy(handle, data->br_stack->entries, size);
4142 } else {
4143 /*
4144 * we always store at least the value of nr
4145 */
4146 u64 nr = 0;
4147 perf_output_put(handle, nr);
4148 }
4149 }
4150
4151 if (sample_type & PERF_SAMPLE_REGS_USER) {
4152 u64 abi = data->regs_user.abi;
4153
4154 /*
4155 * If there are no regs to dump, notice it through
4156 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
4157 */
4158 perf_output_put(handle, abi);
4159
4160 if (abi) {
4161 u64 mask = event->attr.sample_regs_user;
4162 perf_output_sample_regs(handle,
4163 data->regs_user.regs,
4164 mask);
4165 }
4166 }
4167
4168 if (sample_type & PERF_SAMPLE_STACK_USER)
4169 perf_output_sample_ustack(handle,
4170 data->stack_user_size,
4171 data->regs_user.regs);
4172} 3989}
4173 3990
4174void perf_prepare_sample(struct perf_event_header *header, 3991void perf_prepare_sample(struct perf_event_header *header,
@@ -4192,7 +4009,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4192 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 4009 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4193 int size = 1; 4010 int size = 1;
4194 4011
4195 data->callchain = perf_callchain(event, regs); 4012 data->callchain = perf_callchain(regs);
4196 4013
4197 if (data->callchain) 4014 if (data->callchain)
4198 size += data->callchain->nr; 4015 size += data->callchain->nr;
@@ -4211,58 +4028,6 @@ void perf_prepare_sample(struct perf_event_header *header,
4211 WARN_ON_ONCE(size & (sizeof(u64)-1)); 4028 WARN_ON_ONCE(size & (sizeof(u64)-1));
4212 header->size += size; 4029 header->size += size;
4213 } 4030 }
4214
4215 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4216 int size = sizeof(u64); /* nr */
4217 if (data->br_stack) {
4218 size += data->br_stack->nr
4219 * sizeof(struct perf_branch_entry);
4220 }
4221 header->size += size;
4222 }
4223
4224 if (sample_type & PERF_SAMPLE_REGS_USER) {
4225 /* regs dump ABI info */
4226 int size = sizeof(u64);
4227
4228 perf_sample_regs_user(&data->regs_user, regs);
4229
4230 if (data->regs_user.regs) {
4231 u64 mask = event->attr.sample_regs_user;
4232 size += hweight64(mask) * sizeof(u64);
4233 }
4234
4235 header->size += size;
4236 }
4237
4238 if (sample_type & PERF_SAMPLE_STACK_USER) {
4239 /*
4240 * Either we need PERF_SAMPLE_STACK_USER bit to be allways
4241 * processed as the last one or have additional check added
4242 * in case new sample type is added, because we could eat
4243 * up the rest of the sample size.
4244 */
4245 struct perf_regs_user *uregs = &data->regs_user;
4246 u16 stack_size = event->attr.sample_stack_user;
4247 u16 size = sizeof(u64);
4248
4249 if (!uregs->abi)
4250 perf_sample_regs_user(uregs, regs);
4251
4252 stack_size = perf_sample_ustack_size(stack_size, header->size,
4253 uregs->regs);
4254
4255 /*
4256 * If there is something to dump, add space for the dump
4257 * itself and for the field that tells the dynamic size,
4258 * which is how many have been actually dumped.
4259 */
4260 if (stack_size)
4261 size += sizeof(u64) + stack_size;
4262
4263 data->stack_user_size = stack_size;
4264 header->size += size;
4265 }
4266} 4031}
4267 4032
4268static void perf_event_output(struct perf_event *event, 4033static void perf_event_output(struct perf_event *event,
@@ -4415,7 +4180,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
4415 rcu_read_lock(); 4180 rcu_read_lock();
4416 list_for_each_entry_rcu(pmu, &pmus, entry) { 4181 list_for_each_entry_rcu(pmu, &pmus, entry) {
4417 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4182 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4418 if (cpuctx->unique_pmu != pmu) 4183 if (cpuctx->active_pmu != pmu)
4419 goto next; 4184 goto next;
4420 perf_event_task_ctx(&cpuctx->ctx, task_event); 4185 perf_event_task_ctx(&cpuctx->ctx, task_event);
4421 4186
@@ -4561,7 +4326,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
4561 rcu_read_lock(); 4326 rcu_read_lock();
4562 list_for_each_entry_rcu(pmu, &pmus, entry) { 4327 list_for_each_entry_rcu(pmu, &pmus, entry) {
4563 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4328 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4564 if (cpuctx->unique_pmu != pmu) 4329 if (cpuctx->active_pmu != pmu)
4565 goto next; 4330 goto next;
4566 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 4331 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
4567 4332
@@ -4757,7 +4522,7 @@ got_name:
4757 rcu_read_lock(); 4522 rcu_read_lock();
4758 list_for_each_entry_rcu(pmu, &pmus, entry) { 4523 list_for_each_entry_rcu(pmu, &pmus, entry) {
4759 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4524 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4760 if (cpuctx->unique_pmu != pmu) 4525 if (cpuctx->active_pmu != pmu)
4761 goto next; 4526 goto next;
4762 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, 4527 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4763 vma->vm_flags & VM_EXEC); 4528 vma->vm_flags & VM_EXEC);
@@ -4858,7 +4623,6 @@ static int __perf_event_overflow(struct perf_event *event,
4858{ 4623{
4859 int events = atomic_read(&event->event_limit); 4624 int events = atomic_read(&event->event_limit);
4860 struct hw_perf_event *hwc = &event->hw; 4625 struct hw_perf_event *hwc = &event->hw;
4861 u64 seq;
4862 int ret = 0; 4626 int ret = 0;
4863 4627
4864 /* 4628 /*
@@ -4868,20 +4632,14 @@ static int __perf_event_overflow(struct perf_event *event,
4868 if (unlikely(!is_sampling_event(event))) 4632 if (unlikely(!is_sampling_event(event)))
4869 return 0; 4633 return 0;
4870 4634
4871 seq = __this_cpu_read(perf_throttled_seq); 4635 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4872 if (seq != hwc->interrupts_seq) { 4636 if (throttle) {
4873 hwc->interrupts_seq = seq;
4874 hwc->interrupts = 1;
4875 } else {
4876 hwc->interrupts++;
4877 if (unlikely(throttle
4878 && hwc->interrupts >= max_samples_per_tick)) {
4879 __this_cpu_inc(perf_throttled_count);
4880 hwc->interrupts = MAX_INTERRUPTS; 4637 hwc->interrupts = MAX_INTERRUPTS;
4881 perf_log_throttle(event, 0); 4638 perf_log_throttle(event, 0);
4882 ret = 1; 4639 ret = 1;
4883 } 4640 }
4884 } 4641 } else
4642 hwc->interrupts++;
4885 4643
4886 if (event->attr.freq) { 4644 if (event->attr.freq) {
4887 u64 now = perf_clock(); 4645 u64 now = perf_clock();
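
Aside (not part of the diff): the two sides of the overflow hunk above account for throttling differently; one keys off a per-CPU perf_throttled_seq that resets the count each tick, the other simply counts interrupts against max_samples_per_tick. The toy model below shows the counting variant's behaviour, a fixed budget of samples per tick with everything beyond it dropped until the next tick; the budget value and all names are made up.

#include <stdio.h>

#define MAX_SAMPLES_PER_TICK 4

struct hw_counter {
        int interrupts;   /* samples taken in the current tick */
        int throttled;    /* set once the budget is exhausted  */
};

/* Called on every overflow "interrupt"; returns 1 if the sample is kept. */
static int counter_overflow(struct hw_counter *c)
{
        if (c->throttled)
                return 0;

        if (c->interrupts >= MAX_SAMPLES_PER_TICK) {
                c->throttled = 1;       /* plays the role of MAX_INTERRUPTS  */
                return 0;               /* and of perf_log_throttle()        */
        }

        c->interrupts++;
        return 1;
}

/* Called once per timer tick; the budget (and any throttle) is reset. */
static void counter_tick(struct hw_counter *c)
{
        c->interrupts = 0;
        c->throttled  = 0;
}

int main(void)
{
        struct hw_counter c = { 0, 0 };
        int tick, i, kept = 0;

        for (tick = 0; tick < 3; tick++) {
                counter_tick(&c);
                for (i = 0; i < 10; i++)
                        kept += counter_overflow(&c);
        }
        printf("kept %d of 30 samples\n", kept);
        return 0;
}
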
@@ -4890,7 +4648,7 @@ static int __perf_event_overflow(struct perf_event *event,
4890 hwc->freq_time_stamp = now; 4648 hwc->freq_time_stamp = now;
4891 4649
4892 if (delta > 0 && delta < 2*TICK_NSEC) 4650 if (delta > 0 && delta < 2*TICK_NSEC)
4893 perf_adjust_period(event, delta, hwc->last_period, true); 4651 perf_adjust_period(event, delta, hwc->last_period);
4894 } 4652 }
4895 4653
4896 /* 4654 /*
@@ -4978,6 +4736,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4978 struct hw_perf_event *hwc = &event->hw; 4736 struct hw_perf_event *hwc = &event->hw;
4979 int throttle = 0; 4737 int throttle = 0;
4980 4738
4739 data->period = event->hw.last_period;
4981 if (!overflow) 4740 if (!overflow)
4982 overflow = perf_swevent_set_period(event); 4741 overflow = perf_swevent_set_period(event);
4983 4742
@@ -5011,12 +4770,6 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
5011 if (!is_sampling_event(event)) 4770 if (!is_sampling_event(event))
5012 return; 4771 return;
5013 4772
5014 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
5015 data->period = nr;
5016 return perf_swevent_overflow(event, 1, data, regs);
5017 } else
5018 data->period = event->hw.last_period;
5019
5020 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4773 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5021 return perf_swevent_overflow(event, 1, data, regs); 4774 return perf_swevent_overflow(event, 1, data, regs);
5022 4775
@@ -5158,7 +4911,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5158 if (rctx < 0) 4911 if (rctx < 0)
5159 return; 4912 return;
5160 4913
5161 perf_sample_data_init(&data, addr, 0); 4914 perf_sample_data_init(&data, addr);
5162 4915
5163 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); 4916 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5164 4917
@@ -5305,7 +5058,7 @@ fail:
5305 return err; 5058 return err;
5306} 5059}
5307 5060
5308struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5061struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5309 5062
5310static void sw_perf_event_destroy(struct perf_event *event) 5063static void sw_perf_event_destroy(struct perf_event *event)
5311{ 5064{
@@ -5313,7 +5066,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
5313 5066
5314 WARN_ON(event->parent); 5067 WARN_ON(event->parent);
5315 5068
5316 static_key_slow_dec(&perf_swevent_enabled[event_id]); 5069 jump_label_dec(&perf_swevent_enabled[event_id]);
5317 swevent_hlist_put(event); 5070 swevent_hlist_put(event);
5318} 5071}
5319 5072
@@ -5324,12 +5077,6 @@ static int perf_swevent_init(struct perf_event *event)
5324 if (event->attr.type != PERF_TYPE_SOFTWARE) 5077 if (event->attr.type != PERF_TYPE_SOFTWARE)
5325 return -ENOENT; 5078 return -ENOENT;
5326 5079
5327 /*
5328 * no branch sampling for software events
5329 */
5330 if (has_branch_stack(event))
5331 return -EOPNOTSUPP;
5332
5333 switch (event_id) { 5080 switch (event_id) {
5334 case PERF_COUNT_SW_CPU_CLOCK: 5081 case PERF_COUNT_SW_CPU_CLOCK:
5335 case PERF_COUNT_SW_TASK_CLOCK: 5082 case PERF_COUNT_SW_TASK_CLOCK:
@@ -5349,18 +5096,13 @@ static int perf_swevent_init(struct perf_event *event)
5349 if (err) 5096 if (err)
5350 return err; 5097 return err;
5351 5098
5352 static_key_slow_inc(&perf_swevent_enabled[event_id]); 5099 jump_label_inc(&perf_swevent_enabled[event_id]);
5353 event->destroy = sw_perf_event_destroy; 5100 event->destroy = sw_perf_event_destroy;
5354 } 5101 }
5355 5102
5356 return 0; 5103 return 0;
5357} 5104}
5358 5105
5359static int perf_swevent_event_idx(struct perf_event *event)
5360{
5361 return 0;
5362}
5363
5364static struct pmu perf_swevent = { 5106static struct pmu perf_swevent = {
5365 .task_ctx_nr = perf_sw_context, 5107 .task_ctx_nr = perf_sw_context,
5366 5108
@@ -5370,8 +5112,6 @@ static struct pmu perf_swevent = {
5370 .start = perf_swevent_start, 5112 .start = perf_swevent_start,
5371 .stop = perf_swevent_stop, 5113 .stop = perf_swevent_stop,
5372 .read = perf_swevent_read, 5114 .read = perf_swevent_read,
5373
5374 .event_idx = perf_swevent_event_idx,
5375}; 5115};
5376 5116
5377#ifdef CONFIG_EVENT_TRACING 5117#ifdef CONFIG_EVENT_TRACING
@@ -5405,8 +5145,7 @@ static int perf_tp_event_match(struct perf_event *event,
5405} 5145}
5406 5146
5407void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 5147void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5408 struct pt_regs *regs, struct hlist_head *head, int rctx, 5148 struct pt_regs *regs, struct hlist_head *head, int rctx)
5409 struct task_struct *task)
5410{ 5149{
5411 struct perf_sample_data data; 5150 struct perf_sample_data data;
5412 struct perf_event *event; 5151 struct perf_event *event;
@@ -5417,7 +5156,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5417 .data = record, 5156 .data = record,
5418 }; 5157 };
5419 5158
5420 perf_sample_data_init(&data, addr, 0); 5159 perf_sample_data_init(&data, addr);
5421 data.raw = &raw; 5160 data.raw = &raw;
5422 5161
5423 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5162 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
@@ -5425,31 +5164,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5425 perf_swevent_event(event, count, &data, regs); 5164 perf_swevent_event(event, count, &data, regs);
5426 } 5165 }
5427 5166
5428 /*
5429 * If we got specified a target task, also iterate its context and
5430 * deliver this event there too.
5431 */
5432 if (task && task != current) {
5433 struct perf_event_context *ctx;
5434 struct trace_entry *entry = record;
5435
5436 rcu_read_lock();
5437 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
5438 if (!ctx)
5439 goto unlock;
5440
5441 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5442 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5443 continue;
5444 if (event->attr.config != entry->type)
5445 continue;
5446 if (perf_tp_event_match(event, &data, regs))
5447 perf_swevent_event(event, count, &data, regs);
5448 }
5449unlock:
5450 rcu_read_unlock();
5451 }
5452
5453 perf_swevent_put_recursion_context(rctx); 5167 perf_swevent_put_recursion_context(rctx);
5454} 5168}
5455EXPORT_SYMBOL_GPL(perf_tp_event); 5169EXPORT_SYMBOL_GPL(perf_tp_event);
@@ -5466,12 +5180,6 @@ static int perf_tp_event_init(struct perf_event *event)
5466 if (event->attr.type != PERF_TYPE_TRACEPOINT) 5180 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5467 return -ENOENT; 5181 return -ENOENT;
5468 5182
5469 /*
5470 * no branch sampling for tracepoint events
5471 */
5472 if (has_branch_stack(event))
5473 return -EOPNOTSUPP;
5474
5475 err = perf_trace_init(event); 5183 err = perf_trace_init(event);
5476 if (err) 5184 if (err)
5477 return err; 5185 return err;
@@ -5490,8 +5198,6 @@ static struct pmu perf_tracepoint = {
5490 .start = perf_swevent_start, 5198 .start = perf_swevent_start,
5491 .stop = perf_swevent_stop, 5199 .stop = perf_swevent_stop,
5492 .read = perf_swevent_read, 5200 .read = perf_swevent_read,
5493
5494 .event_idx = perf_swevent_event_idx,
5495}; 5201};
5496 5202
5497static inline void perf_tp_register(void) 5203static inline void perf_tp_register(void)
@@ -5545,7 +5251,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5545 struct perf_sample_data sample; 5251 struct perf_sample_data sample;
5546 struct pt_regs *regs = data; 5252 struct pt_regs *regs = data;
5547 5253
5548 perf_sample_data_init(&sample, bp->attr.bp_addr, 0); 5254 perf_sample_data_init(&sample, bp->attr.bp_addr);
5549 5255
5550 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5256 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5551 perf_swevent_event(bp, 1, &sample, regs); 5257 perf_swevent_event(bp, 1, &sample, regs);
@@ -5571,12 +5277,13 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5571 5277
5572 event->pmu->read(event); 5278 event->pmu->read(event);
5573 5279
5574 perf_sample_data_init(&data, 0, event->hw.last_period); 5280 perf_sample_data_init(&data, 0);
5281 data.period = event->hw.last_period;
5575 regs = get_irq_regs(); 5282 regs = get_irq_regs();
5576 5283
5577 if (regs && !perf_exclude_event(event, regs)) { 5284 if (regs && !perf_exclude_event(event, regs)) {
5578 if (!(event->attr.exclude_idle && is_idle_task(current))) 5285 if (!(event->attr.exclude_idle && current->pid == 0))
5579 if (__perf_event_overflow(event, 1, &data, regs)) 5286 if (perf_event_overflow(event, &data, regs))
5580 ret = HRTIMER_NORESTART; 5287 ret = HRTIMER_NORESTART;
5581 } 5288 }
5582 5289
@@ -5696,12 +5403,6 @@ static int cpu_clock_event_init(struct perf_event *event)
5696 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5403 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5697 return -ENOENT; 5404 return -ENOENT;
5698 5405
5699 /*
5700 * no branch sampling for software events
5701 */
5702 if (has_branch_stack(event))
5703 return -EOPNOTSUPP;
5704
5705 perf_swevent_init_hrtimer(event); 5406 perf_swevent_init_hrtimer(event);
5706 5407
5707 return 0; 5408 return 0;
@@ -5716,8 +5417,6 @@ static struct pmu perf_cpu_clock = {
5716 .start = cpu_clock_event_start, 5417 .start = cpu_clock_event_start,
5717 .stop = cpu_clock_event_stop, 5418 .stop = cpu_clock_event_stop,
5718 .read = cpu_clock_event_read, 5419 .read = cpu_clock_event_read,
5719
5720 .event_idx = perf_swevent_event_idx,
5721}; 5420};
5722 5421
5723/* 5422/*
@@ -5776,12 +5475,6 @@ static int task_clock_event_init(struct perf_event *event)
5776 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5475 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5777 return -ENOENT; 5476 return -ENOENT;
5778 5477
5779 /*
5780 * no branch sampling for software events
5781 */
5782 if (has_branch_stack(event))
5783 return -EOPNOTSUPP;
5784
5785 perf_swevent_init_hrtimer(event); 5478 perf_swevent_init_hrtimer(event);
5786 5479
5787 return 0; 5480 return 0;
@@ -5796,8 +5489,6 @@ static struct pmu perf_task_clock = {
5796 .start = task_clock_event_start, 5489 .start = task_clock_event_start,
5797 .stop = task_clock_event_stop, 5490 .stop = task_clock_event_stop,
5798 .read = task_clock_event_read, 5491 .read = task_clock_event_read,
5799
5800 .event_idx = perf_swevent_event_idx,
5801}; 5492};
5802 5493
5803static void perf_pmu_nop_void(struct pmu *pmu) 5494static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5825,11 +5516,6 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
5825 perf_pmu_enable(pmu); 5516 perf_pmu_enable(pmu);
5826} 5517}
5827 5518
5828static int perf_event_idx_default(struct perf_event *event)
5829{
5830 return event->hw.idx + 1;
5831}
5832
5833/* 5519/*
5834 * Ensures all contexts with the same task_ctx_nr have the same 5520 * Ensures all contexts with the same task_ctx_nr have the same
5835 * pmu_cpu_context too. 5521 * pmu_cpu_context too.
@@ -5858,8 +5544,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5858 5544
5859 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5545 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5860 5546
5861 if (cpuctx->unique_pmu == old_pmu) 5547 if (cpuctx->active_pmu == old_pmu)
5862 cpuctx->unique_pmu = pmu; 5548 cpuctx->active_pmu = pmu;
5863 } 5549 }
5864} 5550}
5865 5551
@@ -5916,7 +5602,6 @@ static int pmu_dev_alloc(struct pmu *pmu)
5916 if (!pmu->dev) 5602 if (!pmu->dev)
5917 goto out; 5603 goto out;
5918 5604
5919 pmu->dev->groups = pmu->attr_groups;
5920 device_initialize(pmu->dev); 5605 device_initialize(pmu->dev);
5921 ret = dev_set_name(pmu->dev, "%s", pmu->name); 5606 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5922 if (ret) 5607 if (ret)
@@ -5994,7 +5679,7 @@ skip_type:
5994 cpuctx->ctx.pmu = pmu; 5679 cpuctx->ctx.pmu = pmu;
5995 cpuctx->jiffies_interval = 1; 5680 cpuctx->jiffies_interval = 1;
5996 INIT_LIST_HEAD(&cpuctx->rotation_list); 5681 INIT_LIST_HEAD(&cpuctx->rotation_list);
5997 cpuctx->unique_pmu = pmu; 5682 cpuctx->active_pmu = pmu;
5998 } 5683 }
5999 5684
6000got_cpu_context: 5685got_cpu_context:
@@ -6020,9 +5705,6 @@ got_cpu_context:
6020 pmu->pmu_disable = perf_pmu_nop_void; 5705 pmu->pmu_disable = perf_pmu_nop_void;
6021 } 5706 }
6022 5707
6023 if (!pmu->event_idx)
6024 pmu->event_idx = perf_event_idx_default;
6025
6026 list_add_rcu(&pmu->entry, &pmus); 5708 list_add_rcu(&pmu->entry, &pmus);
6027 ret = 0; 5709 ret = 0;
6028unlock: 5710unlock:
@@ -6076,7 +5758,6 @@ struct pmu *perf_init_event(struct perf_event *event)
6076 pmu = idr_find(&pmu_idr, event->attr.type); 5758 pmu = idr_find(&pmu_idr, event->attr.type);
6077 rcu_read_unlock(); 5759 rcu_read_unlock();
6078 if (pmu) { 5760 if (pmu) {
6079 event->pmu = pmu;
6080 ret = pmu->event_init(event); 5761 ret = pmu->event_init(event);
6081 if (ret) 5762 if (ret)
6082 pmu = ERR_PTR(ret); 5763 pmu = ERR_PTR(ret);
@@ -6084,7 +5765,6 @@ struct pmu *perf_init_event(struct perf_event *event)
6084 } 5765 }
6085 5766
6086 list_for_each_entry_rcu(pmu, &pmus, entry) { 5767 list_for_each_entry_rcu(pmu, &pmus, entry) {
6087 event->pmu = pmu;
6088 ret = pmu->event_init(event); 5768 ret = pmu->event_init(event);
6089 if (!ret) 5769 if (!ret)
6090 goto unlock; 5770 goto unlock;
@@ -6139,14 +5819,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6139 INIT_LIST_HEAD(&event->group_entry); 5819 INIT_LIST_HEAD(&event->group_entry);
6140 INIT_LIST_HEAD(&event->event_entry); 5820 INIT_LIST_HEAD(&event->event_entry);
6141 INIT_LIST_HEAD(&event->sibling_list); 5821 INIT_LIST_HEAD(&event->sibling_list);
6142 INIT_LIST_HEAD(&event->rb_entry);
6143
6144 init_waitqueue_head(&event->waitq); 5822 init_waitqueue_head(&event->waitq);
6145 init_irq_work(&event->pending, perf_pending_event); 5823 init_irq_work(&event->pending, perf_pending_event);
6146 5824
6147 mutex_init(&event->mmap_mutex); 5825 mutex_init(&event->mmap_mutex);
6148 5826
6149 atomic_long_set(&event->refcount, 1);
6150 event->cpu = cpu; 5827 event->cpu = cpu;
6151 event->attr = *attr; 5828 event->attr = *attr;
6152 event->group_leader = group_leader; 5829 event->group_leader = group_leader;
@@ -6155,7 +5832,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6155 5832
6156 event->parent = parent_event; 5833 event->parent = parent_event;
6157 5834
6158 event->ns = get_pid_ns(task_active_pid_ns(current)); 5835 event->ns = get_pid_ns(current->nsproxy->pid_ns);
6159 event->id = atomic64_inc_return(&perf_event_id); 5836 event->id = atomic64_inc_return(&perf_event_id);
6160 5837
6161 event->state = PERF_EVENT_STATE_INACTIVE; 5838 event->state = PERF_EVENT_STATE_INACTIVE;
@@ -6214,9 +5891,11 @@ done:
6214 return ERR_PTR(err); 5891 return ERR_PTR(err);
6215 } 5892 }
6216 5893
5894 event->pmu = pmu;
5895
6217 if (!event->parent) { 5896 if (!event->parent) {
6218 if (event->attach_state & PERF_ATTACH_TASK) 5897 if (event->attach_state & PERF_ATTACH_TASK)
6219 static_key_slow_inc(&perf_sched_events.key); 5898 jump_label_inc(&perf_sched_events);
6220 if (event->attr.mmap || event->attr.mmap_data) 5899 if (event->attr.mmap || event->attr.mmap_data)
6221 atomic_inc(&nr_mmap_events); 5900 atomic_inc(&nr_mmap_events);
6222 if (event->attr.comm) 5901 if (event->attr.comm)
@@ -6230,12 +5909,6 @@ done:
6230 return ERR_PTR(err); 5909 return ERR_PTR(err);
6231 } 5910 }
6232 } 5911 }
6233 if (has_branch_stack(event)) {
6234 static_key_slow_inc(&perf_sched_events.key);
6235 if (!(event->attach_state & PERF_ATTACH_TASK))
6236 atomic_inc(&per_cpu(perf_branch_stack_events,
6237 event->cpu));
6238 }
6239 } 5912 }
6240 5913
6241 return event; 5914 return event;
@@ -6305,62 +5978,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6305 if (attr->read_format & ~(PERF_FORMAT_MAX-1)) 5978 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
6306 return -EINVAL; 5979 return -EINVAL;
6307 5980
6308 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
6309 u64 mask = attr->branch_sample_type;
6310
6311 /* only using defined bits */
6312 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
6313 return -EINVAL;
6314
6315 /* at least one branch bit must be set */
6316 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6317 return -EINVAL;
6318
6319 /* kernel level capture: check permissions */
6320 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6321 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6322 return -EACCES;
6323
6324 /* propagate priv level, when not set for branch */
6325 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6326
6327 /* exclude_kernel checked on syscall entry */
6328 if (!attr->exclude_kernel)
6329 mask |= PERF_SAMPLE_BRANCH_KERNEL;
6330
6331 if (!attr->exclude_user)
6332 mask |= PERF_SAMPLE_BRANCH_USER;
6333
6334 if (!attr->exclude_hv)
6335 mask |= PERF_SAMPLE_BRANCH_HV;
6336 /*
6337 * adjust user setting (for HW filter setup)
6338 */
6339 attr->branch_sample_type = mask;
6340 }
6341 }
6342
6343 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
6344 ret = perf_reg_validate(attr->sample_regs_user);
6345 if (ret)
6346 return ret;
6347 }
6348
6349 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
6350 if (!arch_perf_have_user_stack_dump())
6351 return -ENOSYS;
6352
6353 /*
6354 * We have __u32 type for the size, but so far
6355 * we can only use __u16 as maximum due to the
6356 * __u16 sample size limit.
6357 */
6358 if (attr->sample_stack_user >= USHRT_MAX)
6359 ret = -EINVAL;
6360 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
6361 ret = -EINVAL;
6362 }
6363
6364out: 5981out:
6365 return ret; 5982 return ret;
6366 5983
@@ -6410,8 +6027,6 @@ set:
6410 6027
6411 old_rb = event->rb; 6028 old_rb = event->rb;
6412 rcu_assign_pointer(event->rb, rb); 6029 rcu_assign_pointer(event->rb, rb);
6413 if (old_rb)
6414 ring_buffer_detach(event, old_rb);
6415 ret = 0; 6030 ret = 0;
6416unlock: 6031unlock:
6417 mutex_unlock(&event->mmap_mutex); 6032 mutex_unlock(&event->mmap_mutex);
@@ -6439,11 +6054,12 @@ SYSCALL_DEFINE5(perf_event_open,
6439 struct perf_event_attr attr; 6054 struct perf_event_attr attr;
6440 struct perf_event_context *ctx; 6055 struct perf_event_context *ctx;
6441 struct file *event_file = NULL; 6056 struct file *event_file = NULL;
6442 struct fd group = {NULL, 0}; 6057 struct file *group_file = NULL;
6443 struct task_struct *task = NULL; 6058 struct task_struct *task = NULL;
6444 struct pmu *pmu; 6059 struct pmu *pmu;
6445 int event_fd; 6060 int event_fd;
6446 int move_group = 0; 6061 int move_group = 0;
6062 int fput_needed = 0;
6447 int err; 6063 int err;
6448 6064
6449 /* for future expandability... */ 6065 /* for future expandability... */
@@ -6473,15 +6089,17 @@ SYSCALL_DEFINE5(perf_event_open,
6473 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 6089 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6474 return -EINVAL; 6090 return -EINVAL;
6475 6091
6476 event_fd = get_unused_fd(); 6092 event_fd = get_unused_fd_flags(O_RDWR);
6477 if (event_fd < 0) 6093 if (event_fd < 0)
6478 return event_fd; 6094 return event_fd;
6479 6095
6480 if (group_fd != -1) { 6096 if (group_fd != -1) {
6481 err = perf_fget_light(group_fd, &group); 6097 group_leader = perf_fget_light(group_fd, &fput_needed);
6482 if (err) 6098 if (IS_ERR(group_leader)) {
6099 err = PTR_ERR(group_leader);
6483 goto err_fd; 6100 goto err_fd;
6484 group_leader = group.file->private_data; 6101 }
6102 group_file = group_leader->filp;
6485 if (flags & PERF_FLAG_FD_OUTPUT) 6103 if (flags & PERF_FLAG_FD_OUTPUT)
6486 output_event = group_leader; 6104 output_event = group_leader;
6487 if (flags & PERF_FLAG_FD_NO_GROUP) 6105 if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6496,8 +6114,6 @@ SYSCALL_DEFINE5(perf_event_open,
6496 } 6114 }
6497 } 6115 }
6498 6116
6499 get_online_cpus();
6500
6501 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 6117 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6502 NULL, NULL); 6118 NULL, NULL);
6503 if (IS_ERR(event)) { 6119 if (IS_ERR(event)) {
@@ -6515,7 +6131,7 @@ SYSCALL_DEFINE5(perf_event_open,
6515 * - that may need work on context switch 6131 * - that may need work on context switch
6516 */ 6132 */
6517 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6133 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6518 static_key_slow_inc(&perf_sched_events.key); 6134 jump_label_inc(&perf_sched_events);
6519 } 6135 }
6520 6136
6521 /* 6137 /*
@@ -6550,7 +6166,7 @@ SYSCALL_DEFINE5(perf_event_open,
6550 /* 6166 /*
6551 * Get the target context (task or percpu): 6167 * Get the target context (task or percpu):
6552 */ 6168 */
6553 ctx = find_get_context(pmu, task, event->cpu); 6169 ctx = find_get_context(pmu, task, cpu);
6554 if (IS_ERR(ctx)) { 6170 if (IS_ERR(ctx)) {
6555 err = PTR_ERR(ctx); 6171 err = PTR_ERR(ctx);
6556 goto err_alloc; 6172 goto err_alloc;
@@ -6618,27 +6234,25 @@ SYSCALL_DEFINE5(perf_event_open,
6618 put_ctx(gctx); 6234 put_ctx(gctx);
6619 } 6235 }
6620 6236
6237 event->filp = event_file;
6621 WARN_ON_ONCE(ctx->parent_ctx); 6238 WARN_ON_ONCE(ctx->parent_ctx);
6622 mutex_lock(&ctx->mutex); 6239 mutex_lock(&ctx->mutex);
6623 6240
6624 if (move_group) { 6241 if (move_group) {
6625 synchronize_rcu(); 6242 perf_install_in_context(ctx, group_leader, cpu);
6626 perf_install_in_context(ctx, group_leader, event->cpu);
6627 get_ctx(ctx); 6243 get_ctx(ctx);
6628 list_for_each_entry(sibling, &group_leader->sibling_list, 6244 list_for_each_entry(sibling, &group_leader->sibling_list,
6629 group_entry) { 6245 group_entry) {
6630 perf_install_in_context(ctx, sibling, event->cpu); 6246 perf_install_in_context(ctx, sibling, cpu);
6631 get_ctx(ctx); 6247 get_ctx(ctx);
6632 } 6248 }
6633 } 6249 }
6634 6250
6635 perf_install_in_context(ctx, event, event->cpu); 6251 perf_install_in_context(ctx, event, cpu);
6636 ++ctx->generation; 6252 ++ctx->generation;
6637 perf_unpin_context(ctx); 6253 perf_unpin_context(ctx);
6638 mutex_unlock(&ctx->mutex); 6254 mutex_unlock(&ctx->mutex);
6639 6255
6640 put_online_cpus();
6641
6642 event->owner = current; 6256 event->owner = current;
6643 6257
6644 mutex_lock(&current->perf_event_mutex); 6258 mutex_lock(&current->perf_event_mutex);
@@ -6657,7 +6271,7 @@ SYSCALL_DEFINE5(perf_event_open,
6657 * of the group leader will find the pointer to itself in 6271 * of the group leader will find the pointer to itself in
6658 * perf_group_detach(). 6272 * perf_group_detach().
6659 */ 6273 */
6660 fdput(group); 6274 fput_light(group_file, fput_needed);
6661 fd_install(event_fd, event_file); 6275 fd_install(event_fd, event_file);
6662 return event_fd; 6276 return event_fd;
6663 6277
@@ -6667,11 +6281,10 @@ err_context:
6667err_alloc: 6281err_alloc:
6668 free_event(event); 6282 free_event(event);
6669err_task: 6283err_task:
6670 put_online_cpus();
6671 if (task) 6284 if (task)
6672 put_task_struct(task); 6285 put_task_struct(task);
6673err_group_fd: 6286err_group_fd:
6674 fdput(group); 6287 fput_light(group_file, fput_needed);
6675err_fd: 6288err_fd:
6676 put_unused_fd(event_fd); 6289 put_unused_fd(event_fd);
6677 return err; 6290 return err;
@@ -6711,6 +6324,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6711 goto err_free; 6324 goto err_free;
6712 } 6325 }
6713 6326
6327 event->filp = NULL;
6714 WARN_ON_ONCE(ctx->parent_ctx); 6328 WARN_ON_ONCE(ctx->parent_ctx);
6715 mutex_lock(&ctx->mutex); 6329 mutex_lock(&ctx->mutex);
6716 perf_install_in_context(ctx, event, cpu); 6330 perf_install_in_context(ctx, event, cpu);
@@ -6727,39 +6341,6 @@ err:
6727} 6341}
6728EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 6342EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
6729 6343
6730void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
6731{
6732 struct perf_event_context *src_ctx;
6733 struct perf_event_context *dst_ctx;
6734 struct perf_event *event, *tmp;
6735 LIST_HEAD(events);
6736
6737 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
6738 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
6739
6740 mutex_lock(&src_ctx->mutex);
6741 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
6742 event_entry) {
6743 perf_remove_from_context(event);
6744 put_ctx(src_ctx);
6745 list_add(&event->event_entry, &events);
6746 }
6747 mutex_unlock(&src_ctx->mutex);
6748
6749 synchronize_rcu();
6750
6751 mutex_lock(&dst_ctx->mutex);
6752 list_for_each_entry_safe(event, tmp, &events, event_entry) {
6753 list_del(&event->event_entry);
6754 if (event->state >= PERF_EVENT_STATE_OFF)
6755 event->state = PERF_EVENT_STATE_INACTIVE;
6756 perf_install_in_context(dst_ctx, event, dst_cpu);
6757 get_ctx(dst_ctx);
6758 }
6759 mutex_unlock(&dst_ctx->mutex);
6760}
6761EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
6762
6763static void sync_child_event(struct perf_event *child_event, 6344static void sync_child_event(struct perf_event *child_event,
6764 struct task_struct *child) 6345 struct task_struct *child)
6765{ 6346{
@@ -6792,7 +6373,7 @@ static void sync_child_event(struct perf_event *child_event,
6792 * Release the parent event, if this was the last 6373 * Release the parent event, if this was the last
6793 * reference to it. 6374 * reference to it.
6794 */ 6375 */
6795 put_event(parent_event); 6376 fput(parent_event->filp);
6796} 6377}
6797 6378
6798static void 6379static void
@@ -6868,8 +6449,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6868 * 6449 *
6869 * __perf_event_exit_task() 6450 * __perf_event_exit_task()
6870 * sync_child_event() 6451 * sync_child_event()
6871 * put_event() 6452 * fput(parent_event->filp)
6872 * mutex_lock(&ctx->mutex) 6453 * perf_release()
6454 * mutex_lock(&ctx->mutex)
6873 * 6455 *
6874 * But since its the parent context it won't be the same instance. 6456 * But since its the parent context it won't be the same instance.
6875 */ 6457 */
@@ -6937,7 +6519,7 @@ static void perf_free_event(struct perf_event *event,
6937 list_del_init(&event->child_list); 6519 list_del_init(&event->child_list);
6938 mutex_unlock(&parent->child_mutex); 6520 mutex_unlock(&parent->child_mutex);
6939 6521
6940 put_event(parent); 6522 fput(parent->filp);
6941 6523
6942 perf_group_detach(event); 6524 perf_group_detach(event);
6943 list_del_event(event, ctx); 6525 list_del_event(event, ctx);
@@ -7017,12 +6599,6 @@ inherit_event(struct perf_event *parent_event,
7017 NULL, NULL); 6599 NULL, NULL);
7018 if (IS_ERR(child_event)) 6600 if (IS_ERR(child_event))
7019 return child_event; 6601 return child_event;
7020
7021 if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
7022 free_event(child_event);
7023 return NULL;
7024 }
7025
7026 get_ctx(child_ctx); 6602 get_ctx(child_ctx);
7027 6603
7028 /* 6604 /*
@@ -7064,6 +6640,14 @@ inherit_event(struct perf_event *parent_event,
7064 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 6640 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7065 6641
7066 /* 6642 /*
6643 * Get a reference to the parent filp - we will fput it
6644 * when the child event exits. This is safe to do because
6645 * we are in the parent and we know that the filp still
6646 * exists and has a nonzero count:
6647 */
6648 atomic_long_inc(&parent_event->filp->f_count);
6649
6650 /*
7067 * Link this into the parent event's child list 6651 * Link this into the parent event's child list
7068 */ 6652 */
7069 WARN_ON_ONCE(parent_event->ctx->parent_ctx); 6653 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
@@ -7393,16 +6977,6 @@ void __init perf_event_init(void)
7393 6977
7394 ret = init_hw_breakpoint(); 6978 ret = init_hw_breakpoint();
7395 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6979 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
7396
7397 /* do not patch jump label more than once per second */
7398 jump_label_rate_limit(&perf_sched_events, HZ);
7399
7400 /*
7401 * Build time assertion that we keep the data_head at the intended
7402 * location. IOW, validation we got the __reserved[] size right.
7403 */
7404 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
7405 != 1024);
7406} 6980}
7407 6981
7408static int __init perf_event_sysfs_init(void) 6982static int __init perf_event_sysfs_init(void)
@@ -7434,7 +7008,8 @@ unlock:
7434device_initcall(perf_event_sysfs_init); 7008device_initcall(perf_event_sysfs_init);
7435 7009
7436#ifdef CONFIG_CGROUP_PERF 7010#ifdef CONFIG_CGROUP_PERF
7437static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) 7011static struct cgroup_subsys_state *perf_cgroup_create(
7012 struct cgroup_subsys *ss, struct cgroup *cont)
7438{ 7013{
7439 struct perf_cgroup *jc; 7014 struct perf_cgroup *jc;
7440 7015
@@ -7451,7 +7026,8 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7451 return &jc->css; 7026 return &jc->css;
7452} 7027}
7453 7028
7454static void perf_cgroup_css_free(struct cgroup *cont) 7029static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7030 struct cgroup *cont)
7455{ 7031{
7456 struct perf_cgroup *jc; 7032 struct perf_cgroup *jc;
7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7033 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7467,16 +7043,14 @@ static int __perf_cgroup_move(void *info)
7467 return 0; 7043 return 0;
7468} 7044}
7469 7045
7470static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 7046static void
7047perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
7471{ 7048{
7472 struct task_struct *task; 7049 task_function_call(task, __perf_cgroup_move, task);
7473
7474 cgroup_taskset_for_each(task, cgrp, tset)
7475 task_function_call(task, __perf_cgroup_move, task);
7476} 7050}
7477 7051
7478static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7052static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7479 struct task_struct *task) 7053 struct cgroup *old_cgrp, struct task_struct *task)
7480{ 7054{
7481 /* 7055 /*
7482 * cgroup_exit() is called in the copy_process() failure path. 7056 * cgroup_exit() is called in the copy_process() failure path.
@@ -7486,22 +7060,15 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7486 if (!(task->flags & PF_EXITING)) 7060 if (!(task->flags & PF_EXITING))
7487 return; 7061 return;
7488 7062
7489 task_function_call(task, __perf_cgroup_move, task); 7063 perf_cgroup_attach_task(cgrp, task);
7490} 7064}
7491 7065
7492struct cgroup_subsys perf_subsys = { 7066struct cgroup_subsys perf_subsys = {
7493 .name = "perf_event", 7067 .name = "perf_event",
7494 .subsys_id = perf_subsys_id, 7068 .subsys_id = perf_subsys_id,
7495 .css_alloc = perf_cgroup_css_alloc, 7069 .create = perf_cgroup_create,
7496 .css_free = perf_cgroup_css_free, 7070 .destroy = perf_cgroup_destroy,
7497 .exit = perf_cgroup_exit, 7071 .exit = perf_cgroup_exit,
7498 .attach = perf_cgroup_attach, 7072 .attach_task = perf_cgroup_attach_task,
7499
7500 /*
7501 * perf_event cgroup doesn't handle nesting correctly.
7502 * ctx->nr_cgroups adjustments should be propagated through the
7503 * cgroup hierarchy. Fix it and remove the following.
7504 */
7505 .broken_hierarchy = true,
7506}; 7073};
7507#endif /* CONFIG_CGROUP_PERF */ 7074#endif /* CONFIG_CGROUP_PERF */
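
Aside (not part of the diff): several hunks in this file trade static_key_slow_inc()/dec() for the older jump_label_inc()/dec(). Both are counted switches that keep rarely needed instrumentation out of hot paths. The user-space stand-in below captures only the counting semantics, an atomic counter consulted on the fast path, whereas the kernel primitives patch the branch in the instruction stream; every name here is invented.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int sched_events_enabled;     /* counted enable for the hook */

static void event_hook_inc(void) { atomic_fetch_add(&sched_events_enabled, 1); }
static void event_hook_dec(void) { atomic_fetch_sub(&sched_events_enabled, 1); }

static void context_switch(int from, int to)
{
        /* Fast path: skip the instrumentation entirely while no one cares. */
        if (atomic_load(&sched_events_enabled) > 0)
                printf("perf hook: switch %d -> %d\n", from, to);
}

int main(void)
{
        context_switch(1, 2);   /* hook disabled: prints nothing */
        event_hook_inc();       /* first user enables the hook   */
        context_switch(2, 3);   /* hook enabled: prints          */
        event_hook_dec();       /* last user disables it again   */
        context_switch(3, 1);
        return 0;
}
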
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index fe8a916507e..b7971d6f38b 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -111,16 +111,14 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
111 * Count the number of breakpoints of the same type and same task. 111 * Count the number of breakpoints of the same type and same task.
112 * The given event must be not on the list. 112 * The given event must be not on the list.
113 */ 113 */
114static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) 114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
115{ 115{
116 struct task_struct *tsk = bp->hw.bp_target; 116 struct task_struct *tsk = bp->hw.bp_target;
117 struct perf_event *iter; 117 struct perf_event *iter;
118 int count = 0; 118 int count = 0;
119 119
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->hw.bp_target == tsk && 121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
122 find_slot_idx(iter) == type &&
123 cpu == iter->cpu)
124 count += hw_breakpoint_weight(iter); 122 count += hw_breakpoint_weight(iter);
125 } 123 }
126 124
@@ -143,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
143 if (!tsk) 141 if (!tsk)
144 slots->pinned += max_task_bp_pinned(cpu, type); 142 slots->pinned += max_task_bp_pinned(cpu, type);
145 else 143 else
146 slots->pinned += task_bp_pinned(cpu, bp, type); 144 slots->pinned += task_bp_pinned(bp, type);
147 slots->flexible = per_cpu(nr_bp_flexible[type], cpu); 145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
148 146
149 return; 147 return;
@@ -156,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
156 if (!tsk) 154 if (!tsk)
157 nr += max_task_bp_pinned(cpu, type); 155 nr += max_task_bp_pinned(cpu, type);
158 else 156 else
159 nr += task_bp_pinned(cpu, bp, type); 157 nr += task_bp_pinned(bp, type);
160 158
161 if (nr > slots->pinned) 159 if (nr > slots->pinned)
162 slots->pinned = nr; 160 slots->pinned = nr;
@@ -190,7 +188,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
190 int old_idx = 0; 188 int old_idx = 0;
191 int idx = 0; 189 int idx = 0;
192 190
193 old_count = task_bp_pinned(cpu, bp, type); 191 old_count = task_bp_pinned(bp, type);
194 old_idx = old_count - 1; 192 old_idx = old_count - 1;
195 idx = old_idx + weight; 193 idx = old_idx + weight;
196 194
@@ -455,16 +453,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
455 int old_type = bp->attr.bp_type; 453 int old_type = bp->attr.bp_type;
456 int err = 0; 454 int err = 0;
457 455
458 /* 456 perf_event_disable(bp);
459 * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
460 * will not be possible to raise IPIs that invoke __perf_event_disable.
461 * So call the function directly after making sure we are targeting the
462 * current task.
463 */
464 if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
465 __perf_event_disable(bp);
466 else
467 perf_event_disable(bp);
468 457
469 bp->attr.bp_addr = attr->bp_addr; 458 bp->attr.bp_addr = attr->bp_addr;
470 bp->attr.bp_type = attr->bp_type; 459 bp->attr.bp_type = attr->bp_type;
@@ -592,12 +581,6 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
592 if (bp->attr.type != PERF_TYPE_BREAKPOINT) 581 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
593 return -ENOENT; 582 return -ENOENT;
594 583
595 /*
596 * no branch sampling for breakpoint events
597 */
598 if (has_branch_stack(bp))
599 return -EOPNOTSUPP;
600
601 err = register_perf_hw_breakpoint(bp); 584 err = register_perf_hw_breakpoint(bp);
602 if (err) 585 if (err)
603 return err; 586 return err;
@@ -630,11 +613,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
630 bp->hw.state = PERF_HES_STOPPED; 613 bp->hw.state = PERF_HES_STOPPED;
631} 614}
632 615
633static int hw_breakpoint_event_idx(struct perf_event *bp)
634{
635 return 0;
636}
637
638static struct pmu perf_breakpoint = { 616static struct pmu perf_breakpoint = {
639 .task_ctx_nr = perf_sw_context, /* could eventually get its own */ 617 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
640 618
@@ -644,8 +622,6 @@ static struct pmu perf_breakpoint = {
644 .start = hw_breakpoint_start, 622 .start = hw_breakpoint_start,
645 .stop = hw_breakpoint_stop, 623 .stop = hw_breakpoint_stop,
646 .read = hw_breakpoint_pmu_read, 624 .read = hw_breakpoint_pmu_read,
647
648 .event_idx = hw_breakpoint_event_idx,
649}; 625};
650 626
651int __init init_hw_breakpoint(void) 627int __init init_hw_breakpoint(void)
@@ -675,10 +651,10 @@ int __init init_hw_breakpoint(void)
675 651
676 err_alloc: 652 err_alloc:
677 for_each_possible_cpu(err_cpu) { 653 for_each_possible_cpu(err_cpu) {
678 for (i = 0; i < TYPE_MAX; i++)
679 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
680 if (err_cpu == cpu) 654 if (err_cpu == cpu)
681 break; 655 break;
656 for (i = 0; i < TYPE_MAX; i++)
657 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
682 } 658 }
683 659
684 return -ENOMEM; 660 return -ENOMEM;
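
Aside (not part of the diff): the err_alloc loop at the end of the hw_breakpoint hunk is the common unwind-what-was-allocated-so-far pattern for per-CPU allocations; the two sides of that hunk differ only in whether the failing CPU's partial allocations are also freed before breaking out. A stand-alone version of the pattern, with malloc()/free() and an arbitrary slot count, is sketched below.

#include <stdio.h>
#include <stdlib.h>

#define NR_SLOTS 8

static void *slot_buf[NR_SLOTS];

static int alloc_all(size_t size)
{
        int i, j;

        for (i = 0; i < NR_SLOTS; i++) {
                slot_buf[i] = malloc(size);
                if (!slot_buf[i])
                        goto err_alloc;
        }
        return 0;

err_alloc:
        /* Undo only the slots that were successfully allocated (0 .. i-1);
         * whether the failing slot itself also gets cleaned up is the
         * detail the two sides of the hunk above handle differently. */
        for (j = 0; j < i; j++) {
                free(slot_buf[j]);
                slot_buf[j] = NULL;
        }
        return -1;
}

int main(void)
{
        int i;

        if (alloc_all(64) == 0) {
                printf("all %d slots allocated\n", NR_SLOTS);
                for (i = 0; i < NR_SLOTS; i++)
                        free(slot_buf[i]);
        }
        return 0;
}
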
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8..09097dd8116 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,11 +1,6 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H 1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H 2#define _KERNEL_EVENTS_INTERNAL_H
3 3
4#include <linux/hardirq.h>
5#include <linux/uaccess.h>
6
7/* Buffer handling */
8
9#define RING_BUFFER_WRITABLE 0x01 4#define RING_BUFFER_WRITABLE 0x01
10 5
11struct ring_buffer { 6struct ring_buffer {
@@ -27,9 +22,6 @@ struct ring_buffer {
27 local_t lost; /* nr records lost */ 22 local_t lost; /* nr records lost */
28 23
29 long watermark; /* wakeup watermark */ 24 long watermark; /* wakeup watermark */
30 /* poll crap */
31 spinlock_t event_lock;
32 struct list_head event_list;
33 25
34 struct perf_event_mmap_page *user_page; 26 struct perf_event_mmap_page *user_page;
35 void *data_pages[0]; 27 void *data_pages[0];
@@ -72,106 +64,33 @@ static inline int page_order(struct ring_buffer *rb)
72} 64}
73#endif 65#endif
74 66
75static inline unsigned long perf_data_size(struct ring_buffer *rb) 67static unsigned long perf_data_size(struct ring_buffer *rb)
76{ 68{
77 return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); 69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
78} 70}
79 71
80#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ 72static inline void
81static inline unsigned int \ 73__output_copy(struct perf_output_handle *handle,
82func_name(struct perf_output_handle *handle, \ 74 const void *buf, unsigned int len)
83 const void *buf, unsigned int len) \
84{ \
85 unsigned long size, written; \
86 \
87 do { \
88 size = min_t(unsigned long, handle->size, len); \
89 \
90 written = memcpy_func(handle->addr, buf, size); \
91 \
92 len -= written; \
93 handle->addr += written; \
94 buf += written; \
95 handle->size -= written; \
96 if (!handle->size) { \
97 struct ring_buffer *rb = handle->rb; \
98 \
99 handle->page++; \
100 handle->page &= rb->nr_pages - 1; \
101 handle->addr = rb->data_pages[handle->page]; \
102 handle->size = PAGE_SIZE << page_order(rb); \
103 } \
104 } while (len && written == size); \
105 \
106 return len; \
107}
108
109static inline int memcpy_common(void *dst, const void *src, size_t n)
110{ 75{
111 memcpy(dst, src, n); 76 do {
112 return n; 77 unsigned long size = min_t(unsigned long, handle->size, len);
78
79 memcpy(handle->addr, buf, size);
80
81 len -= size;
82 handle->addr += size;
83 buf += size;
84 handle->size -= size;
85 if (!handle->size) {
86 struct ring_buffer *rb = handle->rb;
87
88 handle->page++;
89 handle->page &= rb->nr_pages - 1;
90 handle->addr = rb->data_pages[handle->page];
91 handle->size = PAGE_SIZE << page_order(rb);
92 }
93 } while (len);
113} 94}
114 95
115DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
116
117#define MEMCPY_SKIP(dst, src, n) (n)
118
119DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
120
121#ifndef arch_perf_out_copy_user
122#define arch_perf_out_copy_user __copy_from_user_inatomic
123#endif
124
125DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
126
127/* Callchain handling */
128extern struct perf_callchain_entry *
129perf_callchain(struct perf_event *event, struct pt_regs *regs);
130extern int get_callchain_buffers(void);
131extern void put_callchain_buffers(void);
132
133static inline int get_recursion_context(int *recursion)
134{
135 int rctx;
136
137 if (in_nmi())
138 rctx = 3;
139 else if (in_irq())
140 rctx = 2;
141 else if (in_softirq())
142 rctx = 1;
143 else
144 rctx = 0;
145
146 if (recursion[rctx])
147 return -1;
148
149 recursion[rctx]++;
150 barrier();
151
152 return rctx;
153}
154
155static inline void put_recursion_context(int *recursion, int rctx)
156{
157 barrier();
158 recursion[rctx]--;
159}
160
161#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
162static inline bool arch_perf_have_user_stack_dump(void)
163{
164 return true;
165}
166
167#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
168#else
169static inline bool arch_perf_have_user_stack_dump(void)
170{
171 return false;
172}
173
174#define perf_user_stack_pointer(regs) 0
175#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
176
177#endif /* _KERNEL_EVENTS_INTERNAL_H */ 96#endif /* _KERNEL_EVENTS_INTERNAL_H */
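
The internal.h hunk above trades the DEFINE_OUTPUT_COPY() macro family for a single open-coded __output_copy(). Either way the copy walks the ring buffer's data_pages[] array and wraps the page index with a mask, which only works because nr_pages is a power of two. A standalone sketch of that wrap-around copy, using invented demo_* types in place of the kernel's ring_buffer and perf_output_handle:

    #include <stddef.h>
    #include <string.h>

    struct demo_rb {
            void   **data_pages;   /* nr_pages entries; nr_pages is a power of two */
            size_t   nr_pages;
            size_t   page_size;
    };

    struct demo_handle {
            struct demo_rb *rb;
            size_t page;           /* index of the page being written */
            char  *addr;           /* write cursor inside that page */
            size_t size;           /* bytes left in that page */
    };

    /* Copy len bytes, advancing page by page and wrapping with a mask,
     * mirroring the loop in __output_copy() above. */
    static void demo_output_copy(struct demo_handle *h, const void *buf, size_t len)
    {
            do {
                    size_t chunk = h->size < len ? h->size : len;

                    memcpy(h->addr, buf, chunk);
                    len     -= chunk;
                    buf      = (const char *)buf + chunk;
                    h->addr += chunk;
                    h->size -= chunk;

                    if (!h->size) {        /* current page exhausted: move on, wrap */
                            h->page = (h->page + 1) & (h->rb->nr_pages - 1);
                            h->addr = h->rb->data_pages[h->page];
                            h->size = h->rb->page_size;
                    }
            } while (len);
    }
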
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff397..a2a29205cc0 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -182,16 +182,10 @@ out:
182 return -ENOSPC; 182 return -ENOSPC;
183} 183}
184 184
185unsigned int perf_output_copy(struct perf_output_handle *handle, 185void perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len) 186 const void *buf, unsigned int len)
187{ 187{
188 return __output_copy(handle, buf, len); 188 __output_copy(handle, buf, len);
189}
190
191unsigned int perf_output_skip(struct perf_output_handle *handle,
192 unsigned int len)
193{
194 return __output_skip(handle, NULL, len);
195} 189}
196 190
197void perf_output_end(struct perf_output_handle *handle) 191void perf_output_end(struct perf_output_handle *handle)
@@ -215,9 +209,6 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
215 rb->writable = 1; 209 rb->writable = 1;
216 210
217 atomic_set(&rb->refcount, 1); 211 atomic_set(&rb->refcount, 1);
218
219 INIT_LIST_HEAD(&rb->event_list);
220 spin_lock_init(&rb->event_lock);
221} 212}
222 213
223#ifndef CONFIG_PERF_USE_VMALLOC 214#ifndef CONFIG_PERF_USE_VMALLOC
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
deleted file mode 100644
index dea7acfbb07..00000000000
--- a/kernel/events/uprobes.c
+++ /dev/null
@@ -1,1627 +0,0 @@
1/*
2 * User-space Probes (UProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2008-2012
19 * Authors:
20 * Srikar Dronamraju
21 * Jim Keniston
22 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
23 */
24
25#include <linux/kernel.h>
26#include <linux/highmem.h>
27#include <linux/pagemap.h> /* read_mapping_page */
28#include <linux/slab.h>
29#include <linux/sched.h>
30#include <linux/rmap.h> /* anon_vma_prepare */
31#include <linux/mmu_notifier.h> /* set_pte_at_notify */
32#include <linux/swap.h> /* try_to_free_swap */
33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */
36#include <linux/percpu-rwsem.h>
37
38#include <linux/uprobes.h>
39
40#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
41#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
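
For scale: assuming a 4 KiB page and the 128-byte UPROBE_XOL_SLOT_BYTES that x86 used around this time (the slot size is per-architecture, so treat 128 as an assumption), UINSNS_PER_PAGE works out to 4096 / 128 = 32, i.e. 32 out-of-line instruction slots per XOL page.
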
42
43static struct rb_root uprobes_tree = RB_ROOT;
44
45static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
46
47#define UPROBES_HASH_SZ 13
48
49/*
50 * We need separate register/unregister and mmap/munmap lock hashes because
51 * of mmap_sem nesting.
52 *
53 * uprobe_register() needs to install probes on (potentially) all processes
 54 * and thus needs to acquire multiple mmap_sems (consecutively, not
55 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
56 * for the particular process doing the mmap.
57 *
58 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
59 * because of lock order against i_mmap_mutex. This means there's a hole in
60 * the register vma iteration where a mmap() can happen.
61 *
62 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
63 * install a probe where one is already installed.
64 */
65
66/* serialize (un)register */
67static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
68
69#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
70
71/* serialize uprobe->pending_list */
72static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
73#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
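
The two hash macros above simply fold a pointer value into one of UPROBES_HASH_SZ mutexes, so unrelated inodes may land in the same bucket; a collision only costs extra serialization, never correctness. A minimal standalone sketch of the same pattern (user-space pthread names, purely illustrative, not kernel code):

    #include <pthread.h>

    #define DEMO_HASH_SZ 13

    static pthread_mutex_t demo_lock[DEMO_HASH_SZ];

    static void demo_locks_init(void)
    {
            int i;

            for (i = 0; i < DEMO_HASH_SZ; i++)
                    pthread_mutex_init(&demo_lock[i], NULL);
    }

    /* Pick a bucket from an opaque key, as uprobes_hash() does with an inode pointer. */
    static pthread_mutex_t *demo_hash(const void *key)
    {
            return &demo_lock[(unsigned long)key % DEMO_HASH_SZ];
    }

    /* All register/unregister work for keys hashing to the same bucket is serialized. */
    static void demo_register(const void *inode_like)
    {
            pthread_mutex_t *m = demo_hash(inode_like);

            pthread_mutex_lock(m);
            /* ... install or remove probes for this inode ... */
            pthread_mutex_unlock(m);
    }
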
74
75static struct percpu_rw_semaphore dup_mmap_sem;
76
77/*
78 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
 79 * events active at this time. Probably a fine-grained per-inode count is
80 * better?
81 */
82static atomic_t uprobe_events = ATOMIC_INIT(0);
83
84/* Have a copy of original instruction */
85#define UPROBE_COPY_INSN 0
 86/* Don't run handlers while the first register / last unregister is in progress */
87#define UPROBE_RUN_HANDLER 1
88/* Can skip singlestep */
89#define UPROBE_SKIP_SSTEP 2
90
91struct uprobe {
92 struct rb_node rb_node; /* node in the rb tree */
93 atomic_t ref;
94 struct rw_semaphore consumer_rwsem;
95 struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
96 struct list_head pending_list;
97 struct uprobe_consumer *consumers;
98 struct inode *inode; /* Also hold a ref to inode */
99 loff_t offset;
100 unsigned long flags;
101 struct arch_uprobe arch;
102};
103
104/*
105 * valid_vma: Verify if the specified vma is an executable vma
106 * Relax restrictions while unregistering: vm_flags might have
107 * changed after breakpoint was inserted.
108 * - is_register: indicates if we are in register context.
109 * - Return 1 if the specified virtual address is in an
110 * executable vma.
111 */
112static bool valid_vma(struct vm_area_struct *vma, bool is_register)
113{
114 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
115
116 if (is_register)
117 flags |= VM_WRITE;
118
119 return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
120}
121
122static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
123{
124 return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
125}
126
127static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
128{
129 return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
130}
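
A quick worked example with made-up numbers: for a vma with vm_start = 0x400000 and vm_pgoff = 2 on 4 KiB pages, the mapping begins at file offset 2 << 12 = 0x2000, so offset_to_vaddr() turns file offset 0x2350 into 0x400000 + 0x2350 - 0x2000 = 0x400350, and vaddr_to_offset() inverts that back to 0x2350.
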
131
132/**
133 * __replace_page - replace page in vma by new page.
134 * based on replace_page in mm/ksm.c
135 *
136 * @vma: vma that holds the pte pointing to page
137 * @addr: address the old @page is mapped at
 138 * @page: the COWed page we are replacing by kpage
139 * @kpage: the modified page we replace page by
140 *
141 * Returns 0 on success, -EFAULT on failure.
142 */
143static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
144 struct page *page, struct page *kpage)
145{
146 struct mm_struct *mm = vma->vm_mm;
147 spinlock_t *ptl;
148 pte_t *ptep;
149 int err;
150 /* For mmu_notifiers */
151 const unsigned long mmun_start = addr;
152 const unsigned long mmun_end = addr + PAGE_SIZE;
153
154 /* For try_to_free_swap() and munlock_vma_page() below */
155 lock_page(page);
156
157 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
158 err = -EAGAIN;
159 ptep = page_check_address(page, mm, addr, &ptl, 0);
160 if (!ptep)
161 goto unlock;
162
163 get_page(kpage);
164 page_add_new_anon_rmap(kpage, vma, addr);
165
166 if (!PageAnon(page)) {
167 dec_mm_counter(mm, MM_FILEPAGES);
168 inc_mm_counter(mm, MM_ANONPAGES);
169 }
170
171 flush_cache_page(vma, addr, pte_pfn(*ptep));
172 ptep_clear_flush(vma, addr, ptep);
173 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
174
175 page_remove_rmap(page);
176 if (!page_mapped(page))
177 try_to_free_swap(page);
178 pte_unmap_unlock(ptep, ptl);
179
180 if (vma->vm_flags & VM_LOCKED)
181 munlock_vma_page(page);
182 put_page(page);
183
184 err = 0;
185 unlock:
186 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
187 unlock_page(page);
188 return err;
189}
190
191/**
192 * is_swbp_insn - check if instruction is breakpoint instruction.
193 * @insn: instruction to be checked.
194 * Default implementation of is_swbp_insn
195 * Returns true if @insn is a breakpoint instruction.
196 */
197bool __weak is_swbp_insn(uprobe_opcode_t *insn)
198{
199 return *insn == UPROBE_SWBP_INSN;
200}
201
202static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
203{
204 void *kaddr = kmap_atomic(page);
205 memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
206 kunmap_atomic(kaddr);
207}
208
209static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
210{
211 uprobe_opcode_t old_opcode;
212 bool is_swbp;
213
214 copy_opcode(page, vaddr, &old_opcode);
215 is_swbp = is_swbp_insn(&old_opcode);
216
217 if (is_swbp_insn(new_opcode)) {
218 if (is_swbp) /* register: already installed? */
219 return 0;
220 } else {
221 if (!is_swbp) /* unregister: was it changed by us? */
222 return 0;
223 }
224
225 return 1;
226}
227
228/*
229 * NOTE:
230 * Expect the breakpoint instruction to be the smallest size instruction for
 231 * the architecture. If an arch has variable-length instructions and the
 232 * breakpoint instruction is not the smallest-length instruction
 233 * supported by that architecture, then we need to modify is_swbp_at_addr and
 234 * write_opcode accordingly. This would never be a problem for archs that
 235 * have fixed-length instructions.
236 */
237
238/*
239 * write_opcode - write the opcode at a given virtual address.
240 * @mm: the probed process address space.
241 * @vaddr: the virtual address to store the opcode.
242 * @opcode: opcode to be written at @vaddr.
243 *
244 * Called with mm->mmap_sem held (for read and with a reference to
245 * mm).
246 *
247 * For mm @mm, write the opcode at @vaddr.
248 * Return 0 (success) or a negative errno.
249 */
250static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
251 uprobe_opcode_t opcode)
252{
253 struct page *old_page, *new_page;
254 void *vaddr_old, *vaddr_new;
255 struct vm_area_struct *vma;
256 int ret;
257
258retry:
259 /* Read the page with vaddr into memory */
260 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
261 if (ret <= 0)
262 return ret;
263
264 ret = verify_opcode(old_page, vaddr, &opcode);
265 if (ret <= 0)
266 goto put_old;
267
268 ret = -ENOMEM;
269 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
270 if (!new_page)
271 goto put_old;
272
273 __SetPageUptodate(new_page);
274
275 /* copy the page now that we've got it stable */
276 vaddr_old = kmap_atomic(old_page);
277 vaddr_new = kmap_atomic(new_page);
278
279 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
280 memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
281
282 kunmap_atomic(vaddr_new);
283 kunmap_atomic(vaddr_old);
284
285 ret = anon_vma_prepare(vma);
286 if (ret)
287 goto put_new;
288
289 ret = __replace_page(vma, vaddr, old_page, new_page);
290
291put_new:
292 page_cache_release(new_page);
293put_old:
294 put_page(old_page);
295
296 if (unlikely(ret == -EAGAIN))
297 goto retry;
298 return ret;
299}
300
301/**
302 * set_swbp - store breakpoint at a given address.
303 * @auprobe: arch specific probepoint information.
304 * @mm: the probed process address space.
305 * @vaddr: the virtual address to insert the opcode.
306 *
307 * For mm @mm, store the breakpoint instruction at @vaddr.
308 * Return 0 (success) or a negative errno.
309 */
310int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
311{
312 return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
313}
314
315/**
316 * set_orig_insn - Restore the original instruction.
317 * @mm: the probed process address space.
318 * @auprobe: arch specific probepoint information.
319 * @vaddr: the virtual address to insert the opcode.
320 *
321 * For mm @mm, restore the original opcode (opcode) at @vaddr.
322 * Return 0 (success) or a negative errno.
323 */
324int __weak
325set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
326{
327 return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
328}
329
330static int match_uprobe(struct uprobe *l, struct uprobe *r)
331{
332 if (l->inode < r->inode)
333 return -1;
334
335 if (l->inode > r->inode)
336 return 1;
337
338 if (l->offset < r->offset)
339 return -1;
340
341 if (l->offset > r->offset)
342 return 1;
343
344 return 0;
345}
346
347static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
348{
349 struct uprobe u = { .inode = inode, .offset = offset };
350 struct rb_node *n = uprobes_tree.rb_node;
351 struct uprobe *uprobe;
352 int match;
353
354 while (n) {
355 uprobe = rb_entry(n, struct uprobe, rb_node);
356 match = match_uprobe(&u, uprobe);
357 if (!match) {
358 atomic_inc(&uprobe->ref);
359 return uprobe;
360 }
361
362 if (match < 0)
363 n = n->rb_left;
364 else
365 n = n->rb_right;
366 }
367 return NULL;
368}
369
370/*
371 * Find a uprobe corresponding to a given inode:offset
372 * Acquires uprobes_treelock
373 */
374static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
375{
376 struct uprobe *uprobe;
377
378 spin_lock(&uprobes_treelock);
379 uprobe = __find_uprobe(inode, offset);
380 spin_unlock(&uprobes_treelock);
381
382 return uprobe;
383}
384
385static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
386{
387 struct rb_node **p = &uprobes_tree.rb_node;
388 struct rb_node *parent = NULL;
389 struct uprobe *u;
390 int match;
391
392 while (*p) {
393 parent = *p;
394 u = rb_entry(parent, struct uprobe, rb_node);
395 match = match_uprobe(uprobe, u);
396 if (!match) {
397 atomic_inc(&u->ref);
398 return u;
399 }
400
401 if (match < 0)
402 p = &parent->rb_left;
403 else
404 p = &parent->rb_right;
405
406 }
407
408 u = NULL;
409 rb_link_node(&uprobe->rb_node, parent, p);
410 rb_insert_color(&uprobe->rb_node, &uprobes_tree);
411 /* get access + creation ref */
412 atomic_set(&uprobe->ref, 2);
413
414 return u;
415}
416
417/*
418 * Acquire uprobes_treelock.
419 * Matching uprobe already exists in rbtree;
420 * increment (access refcount) and return the matching uprobe.
421 *
422 * No matching uprobe; insert the uprobe in rb_tree;
423 * get a double refcount (access + creation) and return NULL.
424 */
425static struct uprobe *insert_uprobe(struct uprobe *uprobe)
426{
427 struct uprobe *u;
428
429 spin_lock(&uprobes_treelock);
430 u = __insert_uprobe(uprobe);
431 spin_unlock(&uprobes_treelock);
432
433 /* For now assume that the instruction need not be single-stepped */
434 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
435
436 return u;
437}
438
439static void put_uprobe(struct uprobe *uprobe)
440{
441 if (atomic_dec_and_test(&uprobe->ref))
442 kfree(uprobe);
443}
444
445static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
446{
447 struct uprobe *uprobe, *cur_uprobe;
448
449 uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
450 if (!uprobe)
451 return NULL;
452
453 uprobe->inode = igrab(inode);
454 uprobe->offset = offset;
455 init_rwsem(&uprobe->consumer_rwsem);
456 mutex_init(&uprobe->copy_mutex);
457
458 /* add to uprobes_tree, sorted on inode:offset */
459 cur_uprobe = insert_uprobe(uprobe);
460
461 /* a uprobe exists for this inode:offset combination */
462 if (cur_uprobe) {
463 kfree(uprobe);
464 uprobe = cur_uprobe;
465 iput(inode);
466 } else {
467 atomic_inc(&uprobe_events);
468 }
469
470 return uprobe;
471}
472
473static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
474{
475 struct uprobe_consumer *uc;
476
477 if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
478 return;
479
480 down_read(&uprobe->consumer_rwsem);
481 for (uc = uprobe->consumers; uc; uc = uc->next) {
482 if (!uc->filter || uc->filter(uc, current))
483 uc->handler(uc, regs);
484 }
485 up_read(&uprobe->consumer_rwsem);
486}
487
488/* Returns the previous consumer */
489static struct uprobe_consumer *
490consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
491{
492 down_write(&uprobe->consumer_rwsem);
493 uc->next = uprobe->consumers;
494 uprobe->consumers = uc;
495 up_write(&uprobe->consumer_rwsem);
496
497 return uc->next;
498}
499
500/*
501 * For uprobe @uprobe, delete the consumer @uc.
502 * Return true if the @uc is deleted successfully
503 * or return false.
504 */
505static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
506{
507 struct uprobe_consumer **con;
508 bool ret = false;
509
510 down_write(&uprobe->consumer_rwsem);
511 for (con = &uprobe->consumers; *con; con = &(*con)->next) {
512 if (*con == uc) {
513 *con = uc->next;
514 ret = true;
515 break;
516 }
517 }
518 up_write(&uprobe->consumer_rwsem);
519
520 return ret;
521}
522
523static int
524__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
525 unsigned long nbytes, loff_t offset)
526{
527 struct page *page;
528 void *vaddr;
529 unsigned long off;
530 pgoff_t idx;
531
532 if (!filp)
533 return -EINVAL;
534
535 if (!mapping->a_ops->readpage)
536 return -EIO;
537
538 idx = offset >> PAGE_CACHE_SHIFT;
539 off = offset & ~PAGE_MASK;
540
541 /*
542 * Ensure that the page that has the original instruction is
543 * populated and in page-cache.
544 */
545 page = read_mapping_page(mapping, idx, filp);
546 if (IS_ERR(page))
547 return PTR_ERR(page);
548
549 vaddr = kmap_atomic(page);
550 memcpy(insn, vaddr + off, nbytes);
551 kunmap_atomic(vaddr);
552 page_cache_release(page);
553
554 return 0;
555}
556
557static int copy_insn(struct uprobe *uprobe, struct file *filp)
558{
559 struct address_space *mapping;
560 unsigned long nbytes;
561 int bytes;
562
563 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
564 mapping = uprobe->inode->i_mapping;
565
566 /* Instruction at end of binary; copy only available bytes */
567 if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
568 bytes = uprobe->inode->i_size - uprobe->offset;
569 else
570 bytes = MAX_UINSN_BYTES;
571
572 /* Instruction at the page-boundary; copy bytes in second page */
573 if (nbytes < bytes) {
574 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
575 bytes - nbytes, uprobe->offset + nbytes);
576 if (err)
577 return err;
578 bytes = nbytes;
579 }
580 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
581}
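
To make the page-boundary case in copy_insn() concrete (illustrative numbers, assuming 4 KiB pages and MAX_UINSN_BYTES = 16 as on x86): with uprobe->offset = 0xff8, nbytes = 0x1000 - 0xff8 = 8, which is less than 16, so the trailing 8 bytes are copied first from the second page into arch.insn + 8, then bytes is clamped to 8 and the leading 8 bytes are copied from the first page.
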
582
583static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
584 struct mm_struct *mm, unsigned long vaddr)
585{
586 int ret = 0;
587
588 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
589 return ret;
590
591 mutex_lock(&uprobe->copy_mutex);
592 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
593 goto out;
594
595 ret = copy_insn(uprobe, file);
596 if (ret)
597 goto out;
598
599 ret = -ENOTSUPP;
600 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
601 goto out;
602
603 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
604 if (ret)
605 goto out;
606
607 /* write_opcode() assumes we don't cross page boundary */
608 BUG_ON((uprobe->offset & ~PAGE_MASK) +
609 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
610
611 smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
612 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
613
614 out:
615 mutex_unlock(&uprobe->copy_mutex);
616
617 return ret;
618}
619
620static int
621install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
622 struct vm_area_struct *vma, unsigned long vaddr)
623{
624 bool first_uprobe;
625 int ret;
626
627 /*
 628 * If the probe is being deleted, the unregistering thread could already be
 629 * done with its vma-rmap walk-through. Adding a probe now can be fatal since
 630 * nobody would be able to clean it up. Also we could be on the fork or
 631 * mremap path, where the probe might have already been inserted.
 632 * Hence behave as if the probe already existed.
633 */
634 if (!uprobe->consumers)
635 return 0;
636
637 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
638 if (ret)
639 return ret;
640
641 /*
642 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
643 * the task can hit this breakpoint right after __replace_page().
644 */
645 first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
646 if (first_uprobe)
647 set_bit(MMF_HAS_UPROBES, &mm->flags);
648
649 ret = set_swbp(&uprobe->arch, mm, vaddr);
650 if (!ret)
651 clear_bit(MMF_RECALC_UPROBES, &mm->flags);
652 else if (first_uprobe)
653 clear_bit(MMF_HAS_UPROBES, &mm->flags);
654
655 return ret;
656}
657
658static int
659remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
660{
661 /* can happen if uprobe_register() fails */
662 if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
663 return 0;
664
665 set_bit(MMF_RECALC_UPROBES, &mm->flags);
666 return set_orig_insn(&uprobe->arch, mm, vaddr);
667}
668
669/*
670 * There could be threads that have already hit the breakpoint. They
671 * will recheck the current insn and restart if find_uprobe() fails.
672 * See find_active_uprobe().
673 */
674static void delete_uprobe(struct uprobe *uprobe)
675{
676 spin_lock(&uprobes_treelock);
677 rb_erase(&uprobe->rb_node, &uprobes_tree);
678 spin_unlock(&uprobes_treelock);
679 iput(uprobe->inode);
680 put_uprobe(uprobe);
681 atomic_dec(&uprobe_events);
682}
683
684struct map_info {
685 struct map_info *next;
686 struct mm_struct *mm;
687 unsigned long vaddr;
688};
689
690static inline struct map_info *free_map_info(struct map_info *info)
691{
692 struct map_info *next = info->next;
693 kfree(info);
694 return next;
695}
696
697static struct map_info *
698build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
699{
700 unsigned long pgoff = offset >> PAGE_SHIFT;
701 struct vm_area_struct *vma;
702 struct map_info *curr = NULL;
703 struct map_info *prev = NULL;
704 struct map_info *info;
705 int more = 0;
706
707 again:
708 mutex_lock(&mapping->i_mmap_mutex);
709 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
710 if (!valid_vma(vma, is_register))
711 continue;
712
713 if (!prev && !more) {
714 /*
715 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
716 * reclaim. This is optimistic, no harm done if it fails.
717 */
718 prev = kmalloc(sizeof(struct map_info),
719 GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
720 if (prev)
721 prev->next = NULL;
722 }
723 if (!prev) {
724 more++;
725 continue;
726 }
727
728 if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
729 continue;
730
731 info = prev;
732 prev = prev->next;
733 info->next = curr;
734 curr = info;
735
736 info->mm = vma->vm_mm;
737 info->vaddr = offset_to_vaddr(vma, offset);
738 }
739 mutex_unlock(&mapping->i_mmap_mutex);
740
741 if (!more)
742 goto out;
743
744 prev = curr;
745 while (curr) {
746 mmput(curr->mm);
747 curr = curr->next;
748 }
749
750 do {
751 info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
752 if (!info) {
753 curr = ERR_PTR(-ENOMEM);
754 goto out;
755 }
756 info->next = prev;
757 prev = info;
758 } while (--more);
759
760 goto again;
761 out:
762 while (prev)
763 prev = free_map_info(prev);
764 return curr;
765}
766
767static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
768{
769 struct map_info *info;
770 int err = 0;
771
772 percpu_down_write(&dup_mmap_sem);
773 info = build_map_info(uprobe->inode->i_mapping,
774 uprobe->offset, is_register);
775 if (IS_ERR(info)) {
776 err = PTR_ERR(info);
777 goto out;
778 }
779
780 while (info) {
781 struct mm_struct *mm = info->mm;
782 struct vm_area_struct *vma;
783
784 if (err && is_register)
785 goto free;
786
787 down_write(&mm->mmap_sem);
788 vma = find_vma(mm, info->vaddr);
789 if (!vma || !valid_vma(vma, is_register) ||
790 vma->vm_file->f_mapping->host != uprobe->inode)
791 goto unlock;
792
793 if (vma->vm_start > info->vaddr ||
794 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
795 goto unlock;
796
797 if (is_register)
798 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
799 else
800 err |= remove_breakpoint(uprobe, mm, info->vaddr);
801
802 unlock:
803 up_write(&mm->mmap_sem);
804 free:
805 mmput(mm);
806 info = free_map_info(info);
807 }
808 out:
809 percpu_up_write(&dup_mmap_sem);
810 return err;
811}
812
813static int __uprobe_register(struct uprobe *uprobe)
814{
815 return register_for_each_vma(uprobe, true);
816}
817
818static void __uprobe_unregister(struct uprobe *uprobe)
819{
820 if (!register_for_each_vma(uprobe, false))
821 delete_uprobe(uprobe);
822
 823 /* TODO: can't unregister? schedule a worker thread */
824}
825
826/*
827 * uprobe_register - register a probe
828 * @inode: the file in which the probe has to be placed.
829 * @offset: offset from the start of the file.
 830 * @uc: information on how to handle the probe.
831 *
832 * Apart from the access refcount, uprobe_register() takes a creation
 833 * refcount (through alloc_uprobe) if and only if this @uprobe is getting
 834 * inserted into the rbtree (i.e. first consumer for a @inode:@offset
835 * tuple). Creation refcount stops uprobe_unregister from freeing the
836 * @uprobe even before the register operation is complete. Creation
837 * refcount is released when the last @uc for the @uprobe
838 * unregisters.
839 *
 840 * Return errno if it cannot successfully install probes
841 * else return 0 (success)
842 */
843int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
844{
845 struct uprobe *uprobe;
846 int ret;
847
848 if (!inode || !uc || uc->next)
849 return -EINVAL;
850
851 if (offset > i_size_read(inode))
852 return -EINVAL;
853
854 ret = 0;
855 mutex_lock(uprobes_hash(inode));
856 uprobe = alloc_uprobe(inode, offset);
857
858 if (!uprobe) {
859 ret = -ENOMEM;
860 } else if (!consumer_add(uprobe, uc)) {
861 ret = __uprobe_register(uprobe);
862 if (ret) {
863 uprobe->consumers = NULL;
864 __uprobe_unregister(uprobe);
865 } else {
866 set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
867 }
868 }
869
870 mutex_unlock(uprobes_hash(inode));
871 if (uprobe)
872 put_uprobe(uprobe);
873
874 return ret;
875}
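
A hedged sketch of how an in-kernel client might drive this register/unregister pair. The uprobe_consumer layout matches the uc->handler()/uc->filter() calls in handler_chain() above, but the module, the probed file, and the probe offset are all invented for illustration:

    #include <linux/module.h>
    #include <linux/fs.h>
    #include <linux/namei.h>
    #include <linux/ptrace.h>
    #include <linux/uprobes.h>

    static int demo_handler(struct uprobe_consumer *self, struct pt_regs *regs)
    {
            pr_info("uprobe hit at ip=%lx\n", instruction_pointer(regs));
            return 0;
    }

    static struct uprobe_consumer demo_uc = {
            .handler = demo_handler,
            /* .filter left NULL: the handler runs for every task */
    };

    static struct inode *demo_inode;
    static loff_t demo_offset = 0x400;     /* hypothetical offset of the probed insn */

    static int __init demo_init(void)
    {
            struct path path;
            int ret = kern_path("/bin/true", LOOKUP_FOLLOW, &path);

            if (ret)
                    return ret;
            demo_inode = igrab(path.dentry->d_inode);
            path_put(&path);

            ret = uprobe_register(demo_inode, demo_offset, &demo_uc);
            if (ret)
                    iput(demo_inode);
            return ret;
    }

    static void __exit demo_exit(void)
    {
            uprobe_unregister(demo_inode, demo_offset, &demo_uc);
            iput(demo_inode);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
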
876
877/*
 878 * uprobe_unregister - unregister an already registered probe.
879 * @inode: the file in which the probe has to be removed.
880 * @offset: offset from the start of the file.
881 * @uc: identify which probe if multiple probes are colocated.
882 */
883void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
884{
885 struct uprobe *uprobe;
886
887 if (!inode || !uc)
888 return;
889
890 uprobe = find_uprobe(inode, offset);
891 if (!uprobe)
892 return;
893
894 mutex_lock(uprobes_hash(inode));
895
896 if (consumer_del(uprobe, uc)) {
897 if (!uprobe->consumers) {
898 __uprobe_unregister(uprobe);
899 clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
900 }
901 }
902
903 mutex_unlock(uprobes_hash(inode));
904 if (uprobe)
905 put_uprobe(uprobe);
906}
907
908static struct rb_node *
909find_node_in_range(struct inode *inode, loff_t min, loff_t max)
910{
911 struct rb_node *n = uprobes_tree.rb_node;
912
913 while (n) {
914 struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
915
916 if (inode < u->inode) {
917 n = n->rb_left;
918 } else if (inode > u->inode) {
919 n = n->rb_right;
920 } else {
921 if (max < u->offset)
922 n = n->rb_left;
923 else if (min > u->offset)
924 n = n->rb_right;
925 else
926 break;
927 }
928 }
929
930 return n;
931}
932
933/*
934 * For a given range in vma, build a list of probes that need to be inserted.
935 */
936static void build_probe_list(struct inode *inode,
937 struct vm_area_struct *vma,
938 unsigned long start, unsigned long end,
939 struct list_head *head)
940{
941 loff_t min, max;
942 struct rb_node *n, *t;
943 struct uprobe *u;
944
945 INIT_LIST_HEAD(head);
946 min = vaddr_to_offset(vma, start);
947 max = min + (end - start) - 1;
948
949 spin_lock(&uprobes_treelock);
950 n = find_node_in_range(inode, min, max);
951 if (n) {
952 for (t = n; t; t = rb_prev(t)) {
953 u = rb_entry(t, struct uprobe, rb_node);
954 if (u->inode != inode || u->offset < min)
955 break;
956 list_add(&u->pending_list, head);
957 atomic_inc(&u->ref);
958 }
959 for (t = n; (t = rb_next(t)); ) {
960 u = rb_entry(t, struct uprobe, rb_node);
961 if (u->inode != inode || u->offset > max)
962 break;
963 list_add(&u->pending_list, head);
964 atomic_inc(&u->ref);
965 }
966 }
967 spin_unlock(&uprobes_treelock);
968}
969
970/*
971 * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
972 *
973 * Currently we ignore all errors and always return 0, the callers
974 * can't handle the failure anyway.
975 */
976int uprobe_mmap(struct vm_area_struct *vma)
977{
978 struct list_head tmp_list;
979 struct uprobe *uprobe, *u;
980 struct inode *inode;
981
982 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
983 return 0;
984
985 inode = vma->vm_file->f_mapping->host;
986 if (!inode)
987 return 0;
988
989 mutex_lock(uprobes_mmap_hash(inode));
990 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
991
992 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
993 if (!fatal_signal_pending(current)) {
994 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
995 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
996 }
997 put_uprobe(uprobe);
998 }
999 mutex_unlock(uprobes_mmap_hash(inode));
1000
1001 return 0;
1002}
1003
1004static bool
1005vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1006{
1007 loff_t min, max;
1008 struct inode *inode;
1009 struct rb_node *n;
1010
1011 inode = vma->vm_file->f_mapping->host;
1012
1013 min = vaddr_to_offset(vma, start);
1014 max = min + (end - start) - 1;
1015
1016 spin_lock(&uprobes_treelock);
1017 n = find_node_in_range(inode, min, max);
1018 spin_unlock(&uprobes_treelock);
1019
1020 return !!n;
1021}
1022
1023/*
1024 * Called in context of a munmap of a vma.
1025 */
1026void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1027{
1028 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
1029 return;
1030
1031 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1032 return;
1033
1034 if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
1035 test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
1036 return;
1037
1038 if (vma_has_uprobes(vma, start, end))
1039 set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
1040}
1041
1042/* Slot allocation for XOL */
1043static int xol_add_vma(struct xol_area *area)
1044{
1045 struct mm_struct *mm;
1046 int ret;
1047
1048 area->page = alloc_page(GFP_HIGHUSER);
1049 if (!area->page)
1050 return -ENOMEM;
1051
1052 ret = -EALREADY;
1053 mm = current->mm;
1054
1055 down_write(&mm->mmap_sem);
1056 if (mm->uprobes_state.xol_area)
1057 goto fail;
1058
1059 ret = -ENOMEM;
1060
1061 /* Try to map as high as possible, this is only a hint. */
1062 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
1063 if (area->vaddr & ~PAGE_MASK) {
1064 ret = area->vaddr;
1065 goto fail;
1066 }
1067
1068 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1069 VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
1070 if (ret)
1071 goto fail;
1072
1073 smp_wmb(); /* pairs with get_xol_area() */
1074 mm->uprobes_state.xol_area = area;
1075 ret = 0;
1076
1077fail:
1078 up_write(&mm->mmap_sem);
1079 if (ret)
1080 __free_page(area->page);
1081
1082 return ret;
1083}
1084
1085static struct xol_area *get_xol_area(struct mm_struct *mm)
1086{
1087 struct xol_area *area;
1088
1089 area = mm->uprobes_state.xol_area;
1090 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1091
1092 return area;
1093}
1094
1095/*
1096 * xol_alloc_area - Allocate process's xol_area.
1097 * This area will be used for storing instructions for execution out of
1098 * line.
1099 *
1100 * Returns the allocated area or NULL.
1101 */
1102static struct xol_area *xol_alloc_area(void)
1103{
1104 struct xol_area *area;
1105
1106 area = kzalloc(sizeof(*area), GFP_KERNEL);
1107 if (unlikely(!area))
1108 return NULL;
1109
1110 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1111
1112 if (!area->bitmap)
1113 goto fail;
1114
1115 init_waitqueue_head(&area->wq);
1116 if (!xol_add_vma(area))
1117 return area;
1118
1119fail:
1120 kfree(area->bitmap);
1121 kfree(area);
1122
1123 return get_xol_area(current->mm);
1124}
1125
1126/*
1127 * uprobe_clear_state - Free the area allocated for slots.
1128 */
1129void uprobe_clear_state(struct mm_struct *mm)
1130{
1131 struct xol_area *area = mm->uprobes_state.xol_area;
1132
1133 if (!area)
1134 return;
1135
1136 put_page(area->page);
1137 kfree(area->bitmap);
1138 kfree(area);
1139}
1140
1141void uprobe_start_dup_mmap(void)
1142{
1143 percpu_down_read(&dup_mmap_sem);
1144}
1145
1146void uprobe_end_dup_mmap(void)
1147{
1148 percpu_up_read(&dup_mmap_sem);
1149}
1150
1151void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1152{
1153 newmm->uprobes_state.xol_area = NULL;
1154
1155 if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
1156 set_bit(MMF_HAS_UPROBES, &newmm->flags);
1157 /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
1158 set_bit(MMF_RECALC_UPROBES, &newmm->flags);
1159 }
1160}
1161
1162/*
1163 * - search for a free slot.
1164 */
1165static unsigned long xol_take_insn_slot(struct xol_area *area)
1166{
1167 unsigned long slot_addr;
1168 int slot_nr;
1169
1170 do {
1171 slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
1172 if (slot_nr < UINSNS_PER_PAGE) {
1173 if (!test_and_set_bit(slot_nr, area->bitmap))
1174 break;
1175
1176 slot_nr = UINSNS_PER_PAGE;
1177 continue;
1178 }
1179 wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
1180 } while (slot_nr >= UINSNS_PER_PAGE);
1181
1182 slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
1183 atomic_inc(&area->slot_count);
1184
1185 return slot_addr;
1186}
1187
1188/*
 1189 * xol_get_insn_slot - if the task has not been allocated a slot yet,
 1190 * allocate one.
1191 * Returns the allocated slot address or 0.
1192 */
1193static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
1194{
1195 struct xol_area *area;
1196 unsigned long offset;
1197 void *vaddr;
1198
1199 area = get_xol_area(current->mm);
1200 if (!area) {
1201 area = xol_alloc_area();
1202 if (!area)
1203 return 0;
1204 }
1205 current->utask->xol_vaddr = xol_take_insn_slot(area);
1206
1207 /*
 1208 * Initialize the slot if xol_vaddr points to a valid
1209 * instruction slot.
1210 */
1211 if (unlikely(!current->utask->xol_vaddr))
1212 return 0;
1213
1214 current->utask->vaddr = slot_addr;
1215 offset = current->utask->xol_vaddr & ~PAGE_MASK;
1216 vaddr = kmap_atomic(area->page);
1217 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1218 kunmap_atomic(vaddr);
1219 /*
1220 * We probably need flush_icache_user_range() but it needs vma.
1221 * This should work on supported architectures too.
1222 */
1223 flush_dcache_page(area->page);
1224
1225 return current->utask->xol_vaddr;
1226}
1227
1228/*
1229 * xol_free_insn_slot - If slot was earlier allocated by
1230 * @xol_get_insn_slot(), make the slot available for
1231 * subsequent requests.
1232 */
1233static void xol_free_insn_slot(struct task_struct *tsk)
1234{
1235 struct xol_area *area;
1236 unsigned long vma_end;
1237 unsigned long slot_addr;
1238
1239 if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
1240 return;
1241
1242 slot_addr = tsk->utask->xol_vaddr;
1243
1244 if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
1245 return;
1246
1247 area = tsk->mm->uprobes_state.xol_area;
1248 vma_end = area->vaddr + PAGE_SIZE;
1249 if (area->vaddr <= slot_addr && slot_addr < vma_end) {
1250 unsigned long offset;
1251 int slot_nr;
1252
1253 offset = slot_addr - area->vaddr;
1254 slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
1255 if (slot_nr >= UINSNS_PER_PAGE)
1256 return;
1257
1258 clear_bit(slot_nr, area->bitmap);
1259 atomic_dec(&area->slot_count);
1260 if (waitqueue_active(&area->wq))
1261 wake_up(&area->wq);
1262
1263 tsk->utask->xol_vaddr = 0;
1264 }
1265}
1266
1267/**
1268 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1269 * @regs: Reflects the saved state of the task after it has hit a breakpoint
1270 * instruction.
1271 * Return the address of the breakpoint instruction.
1272 */
1273unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1274{
1275 return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1276}
1277
1278/*
1279 * Called with no locks held.
 1280 * Called in context of an exiting or an exec-ing thread.
1281 */
1282void uprobe_free_utask(struct task_struct *t)
1283{
1284 struct uprobe_task *utask = t->utask;
1285
1286 if (!utask)
1287 return;
1288
1289 if (utask->active_uprobe)
1290 put_uprobe(utask->active_uprobe);
1291
1292 xol_free_insn_slot(t);
1293 kfree(utask);
1294 t->utask = NULL;
1295}
1296
1297/*
1298 * Called in context of a new clone/fork from copy_process.
1299 */
1300void uprobe_copy_process(struct task_struct *t)
1301{
1302 t->utask = NULL;
1303}
1304
1305/*
1306 * Allocate a uprobe_task object for the task.
1307 * Called when the thread hits a breakpoint for the first time.
1308 *
1309 * Returns:
1310 * - pointer to new uprobe_task on success
1311 * - NULL otherwise
1312 */
1313static struct uprobe_task *add_utask(void)
1314{
1315 struct uprobe_task *utask;
1316
1317 utask = kzalloc(sizeof *utask, GFP_KERNEL);
1318 if (unlikely(!utask))
1319 return NULL;
1320
1321 current->utask = utask;
1322 return utask;
1323}
1324
1325/* Prepare to single-step probed instruction out of line. */
1326static int
1327pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
1328{
1329 if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
1330 return 0;
1331
1332 return -EFAULT;
1333}
1334
1335/*
1336 * If we are singlestepping, then ensure this thread is not connected to
1337 * non-fatal signals until completion of singlestep. When xol insn itself
1338 * triggers the signal, restart the original insn even if the task is
1339 * already SIGKILL'ed (since coredump should report the correct ip). This
 1340 * is even more important if the task has a handler for SIGSEGV/etc. The
1341 * _same_ instruction should be repeated again after return from the signal
1342 * handler, and SSTEP can never finish in this case.
1343 */
1344bool uprobe_deny_signal(void)
1345{
1346 struct task_struct *t = current;
1347 struct uprobe_task *utask = t->utask;
1348
1349 if (likely(!utask || !utask->active_uprobe))
1350 return false;
1351
1352 WARN_ON_ONCE(utask->state != UTASK_SSTEP);
1353
1354 if (signal_pending(t)) {
1355 spin_lock_irq(&t->sighand->siglock);
1356 clear_tsk_thread_flag(t, TIF_SIGPENDING);
1357 spin_unlock_irq(&t->sighand->siglock);
1358
1359 if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
1360 utask->state = UTASK_SSTEP_TRAPPED;
1361 set_tsk_thread_flag(t, TIF_UPROBE);
1362 set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
1363 }
1364 }
1365
1366 return true;
1367}
1368
1369/*
1370 * Avoid singlestepping the original instruction if the original instruction
1371 * is a NOP or can be emulated.
1372 */
1373static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1374{
1375 if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
1376 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1377 return true;
1378 clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
1379 }
1380 return false;
1381}
1382
1383static void mmf_recalc_uprobes(struct mm_struct *mm)
1384{
1385 struct vm_area_struct *vma;
1386
1387 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1388 if (!valid_vma(vma, false))
1389 continue;
1390 /*
1391 * This is not strictly accurate, we can race with
1392 * uprobe_unregister() and see the already removed
1393 * uprobe if delete_uprobe() was not yet called.
1394 */
1395 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1396 return;
1397 }
1398
1399 clear_bit(MMF_HAS_UPROBES, &mm->flags);
1400}
1401
1402static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
1403{
1404 struct page *page;
1405 uprobe_opcode_t opcode;
1406 int result;
1407
1408 pagefault_disable();
1409 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
1410 sizeof(opcode));
1411 pagefault_enable();
1412
1413 if (likely(result == 0))
1414 goto out;
1415
1416 result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
1417 if (result < 0)
1418 return result;
1419
1420 copy_opcode(page, vaddr, &opcode);
1421 put_page(page);
1422 out:
1423 return is_swbp_insn(&opcode);
1424}
1425
1426static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1427{
1428 struct mm_struct *mm = current->mm;
1429 struct uprobe *uprobe = NULL;
1430 struct vm_area_struct *vma;
1431
1432 down_read(&mm->mmap_sem);
1433 vma = find_vma(mm, bp_vaddr);
1434 if (vma && vma->vm_start <= bp_vaddr) {
1435 if (valid_vma(vma, false)) {
1436 struct inode *inode = vma->vm_file->f_mapping->host;
1437 loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1438
1439 uprobe = find_uprobe(inode, offset);
1440 }
1441
1442 if (!uprobe)
1443 *is_swbp = is_swbp_at_addr(mm, bp_vaddr);
1444 } else {
1445 *is_swbp = -EFAULT;
1446 }
1447
1448 if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
1449 mmf_recalc_uprobes(mm);
1450 up_read(&mm->mmap_sem);
1451
1452 return uprobe;
1453}
1454
1455/*
1456 * Run handler and ask thread to singlestep.
1457 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1458 */
1459static void handle_swbp(struct pt_regs *regs)
1460{
1461 struct uprobe_task *utask;
1462 struct uprobe *uprobe;
1463 unsigned long bp_vaddr;
1464 int uninitialized_var(is_swbp);
1465
1466 bp_vaddr = uprobe_get_swbp_addr(regs);
1467 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1468
1469 if (!uprobe) {
1470 if (is_swbp > 0) {
1471 /* No matching uprobe; signal SIGTRAP. */
1472 send_sig(SIGTRAP, current, 0);
1473 } else {
1474 /*
1475 * Either we raced with uprobe_unregister() or we can't
1476 * access this memory. The latter is only possible if
1477 * another thread plays with our ->mm. In both cases
1478 * we can simply restart. If this vma was unmapped we
1479 * can pretend this insn was not executed yet and get
1480 * the (correct) SIGSEGV after restart.
1481 */
1482 instruction_pointer_set(regs, bp_vaddr);
1483 }
1484 return;
1485 }
1486 /*
1487 * TODO: move copy_insn/etc into _register and remove this hack.
1488 * After we hit the bp, _unregister + _register can install the
1489 * new and not-yet-analyzed uprobe at the same address, restart.
1490 */
1491 smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1492 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1493 goto restart;
1494
1495 utask = current->utask;
1496 if (!utask) {
1497 utask = add_utask();
1498 /* Cannot allocate; re-execute the instruction. */
1499 if (!utask)
1500 goto restart;
1501 }
1502
1503 handler_chain(uprobe, regs);
1504 if (can_skip_sstep(uprobe, regs))
1505 goto out;
1506
1507 if (!pre_ssout(uprobe, regs, bp_vaddr)) {
1508 utask->active_uprobe = uprobe;
1509 utask->state = UTASK_SSTEP;
1510 return;
1511 }
1512
1513restart:
1514 /*
1515 * cannot singlestep; cannot skip instruction;
1516 * re-execute the instruction.
1517 */
1518 instruction_pointer_set(regs, bp_vaddr);
1519out:
1520 put_uprobe(uprobe);
1521}
1522
1523/*
1524 * Perform required fix-ups and disable singlestep.
1525 * Allow pending signals to take effect.
1526 */
1527static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1528{
1529 struct uprobe *uprobe;
1530
1531 uprobe = utask->active_uprobe;
1532 if (utask->state == UTASK_SSTEP_ACK)
1533 arch_uprobe_post_xol(&uprobe->arch, regs);
1534 else if (utask->state == UTASK_SSTEP_TRAPPED)
1535 arch_uprobe_abort_xol(&uprobe->arch, regs);
1536 else
1537 WARN_ON_ONCE(1);
1538
1539 put_uprobe(uprobe);
1540 utask->active_uprobe = NULL;
1541 utask->state = UTASK_RUNNING;
1542 xol_free_insn_slot(current);
1543
1544 spin_lock_irq(&current->sighand->siglock);
1545 recalc_sigpending(); /* see uprobe_deny_signal() */
1546 spin_unlock_irq(&current->sighand->siglock);
1547}
1548
1549/*
1550 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
1551 * allows the thread to return from interrupt. After that handle_swbp()
1552 * sets utask->active_uprobe.
1553 *
1554 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
1555 * and allows the thread to return from interrupt.
1556 *
1557 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
1558 * uprobe_notify_resume().
1559 */
1560void uprobe_notify_resume(struct pt_regs *regs)
1561{
1562 struct uprobe_task *utask;
1563
1564 clear_thread_flag(TIF_UPROBE);
1565
1566 utask = current->utask;
1567 if (utask && utask->active_uprobe)
1568 handle_singlestep(utask, regs);
1569 else
1570 handle_swbp(regs);
1571}
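
The architecture side of that hand-off is small. Roughly, as a condensed paraphrase of the x86-style glue (not code from this patch), the return-to-user path checks the flag and calls back in:

    /* Sketch of an arch return-to-user hook; details and signature vary per arch. */
    void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
    {
            if (thread_info_flags & _TIF_UPROBE)
                    uprobe_notify_resume(regs);   /* handle_swbp() / handle_singlestep() */

            /* ... signal delivery, TIF_NOTIFY_RESUME work, and so on ... */
    }
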
1572
1573/*
 1574 * uprobe_pre_sstep_notifier gets called from interrupt context as part of the
 1575 * notifier mechanism. Set the TIF_UPROBE flag and indicate a breakpoint hit.
1576 */
1577int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1578{
1579 if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
1580 return 0;
1581
1582 set_thread_flag(TIF_UPROBE);
1583 return 1;
1584}
1585
1586/*
 1587 * uprobe_post_sstep_notifier gets called in interrupt context as part of the
 1588 * notifier mechanism. Set the TIF_UPROBE flag and indicate completion of the singlestep.
1589 */
1590int uprobe_post_sstep_notifier(struct pt_regs *regs)
1591{
1592 struct uprobe_task *utask = current->utask;
1593
1594 if (!current->mm || !utask || !utask->active_uprobe)
1595 /* task is currently not uprobed */
1596 return 0;
1597
1598 utask->state = UTASK_SSTEP_ACK;
1599 set_thread_flag(TIF_UPROBE);
1600 return 1;
1601}
1602
1603static struct notifier_block uprobe_exception_nb = {
1604 .notifier_call = arch_uprobe_exception_notify,
1605 .priority = INT_MAX-1, /* notified after kprobes, kgdb */
1606};
1607
1608static int __init init_uprobes(void)
1609{
1610 int i;
1611
1612 for (i = 0; i < UPROBES_HASH_SZ; i++) {
1613 mutex_init(&uprobes_mutex[i]);
1614 mutex_init(&uprobes_mmap_mutex[i]);
1615 }
1616
1617 if (percpu_init_rwsem(&dup_mmap_sem))
1618 return -ENOMEM;
1619
1620 return register_die_notifier(&uprobe_exception_nb);
1621}
1622module_init(init_uprobes);
1623
1624static void __exit exit_uprobes(void)
1625{
1626}
1627module_exit(exit_uprobes);
diff --git a/kernel/exit.c b/kernel/exit.c
index b4df2193721..9e316ae4984 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,8 +51,6 @@
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h>
55#include <linux/shm.h>
56 54
57#include <asm/uaccess.h> 55#include <asm/uaccess.h>
58#include <asm/unistd.h> 56#include <asm/unistd.h>
@@ -123,9 +121,9 @@ static void __exit_signal(struct task_struct *tsk)
123 * We won't ever get here for the group leader, since it 121 * We won't ever get here for the group leader, since it
124 * will have been the last reference on the signal_struct. 122 * will have been the last reference on the signal_struct.
125 */ 123 */
126 sig->utime += tsk->utime; 124 sig->utime = cputime_add(sig->utime, tsk->utime);
127 sig->stime += tsk->stime; 125 sig->stime = cputime_add(sig->stime, tsk->stime);
128 sig->gtime += tsk->gtime; 126 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
129 sig->min_flt += tsk->min_flt; 127 sig->min_flt += tsk->min_flt;
130 sig->maj_flt += tsk->maj_flt; 128 sig->maj_flt += tsk->maj_flt;
131 sig->nvcsw += tsk->nvcsw; 129 sig->nvcsw += tsk->nvcsw;
@@ -310,6 +308,43 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
310 } 308 }
311} 309}
312 310
311/**
312 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
313 *
314 * If a kernel thread is launched as a result of a system call, or if
315 * it ever exits, it should generally reparent itself to kthreadd so it
316 * isn't in the way of other processes and is correctly cleaned up on exit.
317 *
318 * The various task state such as scheduling policy and priority may have
319 * been inherited from a user process, so we reset them to sane values here.
320 *
321 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
322 */
323static void reparent_to_kthreadd(void)
324{
325 write_lock_irq(&tasklist_lock);
326
327 ptrace_unlink(current);
328 /* Reparent to init */
329 current->real_parent = current->parent = kthreadd_task;
330 list_move_tail(&current->sibling, &current->real_parent->children);
331
332 /* Set the exit signal to SIGCHLD so we signal init on exit */
333 current->exit_signal = SIGCHLD;
334
335 if (task_nice(current) < 0)
336 set_user_nice(current, 0);
337 /* cpus_allowed? */
338 /* rt_priority? */
339 /* signals? */
340 memcpy(current->signal->rlim, init_task.signal->rlim,
341 sizeof(current->signal->rlim));
342
343 atomic_inc(&init_cred.usage);
344 commit_creds(&init_cred);
345 write_unlock_irq(&tasklist_lock);
346}
347
313void __set_special_pids(struct pid *pid) 348void __set_special_pids(struct pid *pid)
314{ 349{
315 struct task_struct *curr = current->group_leader; 350 struct task_struct *curr = current->group_leader;
@@ -321,6 +356,13 @@ void __set_special_pids(struct pid *pid)
321 change_pid(curr, PIDTYPE_PGID, pid); 356 change_pid(curr, PIDTYPE_PGID, pid);
322} 357}
323 358
359static void set_special_pids(struct pid *pid)
360{
361 write_lock_irq(&tasklist_lock);
362 __set_special_pids(pid);
363 write_unlock_irq(&tasklist_lock);
364}
365
324/* 366/*
325 * Let kernel threads use this to say that they allow a certain signal. 367 * Let kernel threads use this to say that they allow a certain signal.
326 * Must not be used if kthread was cloned with CLONE_SIGHAND. 368 * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -360,6 +402,149 @@ int disallow_signal(int sig)
360 402
361EXPORT_SYMBOL(disallow_signal); 403EXPORT_SYMBOL(disallow_signal);
362 404
405/*
406 * Put all the gunge required to become a kernel thread without
407 * attached user resources in one place where it belongs.
408 */
409
410void daemonize(const char *name, ...)
411{
412 va_list args;
413 sigset_t blocked;
414
415 va_start(args, name);
416 vsnprintf(current->comm, sizeof(current->comm), name, args);
417 va_end(args);
418
419 /*
420 * If we were started as result of loading a module, close all of the
421 * user space pages. We don't need them, and if we didn't close them
422 * they would be locked into memory.
423 */
424 exit_mm(current);
425 /*
426 * We don't want to have TIF_FREEZE set if the system-wide hibernation
427 * or suspend transition begins right now.
428 */
429 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
430
431 if (current->nsproxy != &init_nsproxy) {
432 get_nsproxy(&init_nsproxy);
433 switch_task_namespaces(current, &init_nsproxy);
434 }
435 set_special_pids(&init_struct_pid);
436 proc_clear_tty(current);
437
438 /* Block and flush all signals */
439 sigfillset(&blocked);
440 sigprocmask(SIG_BLOCK, &blocked, NULL);
441 flush_signals(current);
442
443 /* Become as one with the init task */
444
445 daemonize_fs_struct();
446 exit_files(current);
447 current->files = init_task.files;
448 atomic_inc(&current->files->count);
449
450 reparent_to_kthreadd();
451}
452
453EXPORT_SYMBOL(daemonize);
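
For reference, the calling pattern daemonize() serves looked roughly like this before kthread_run() made it obsolete; the worker below is invented for illustration and assumes the usual allow_signal()/schedule_timeout_interruptible() helpers:

    #include <linux/sched.h>
    #include <linux/signal.h>

    static int demo_worker(void *unused)
    {
            daemonize("demo-worker");      /* shed user-space state, reparent to kthreadd */
            allow_signal(SIGTERM);

            while (!signal_pending(current)) {
                    /* ... periodic work ... */
                    schedule_timeout_interruptible(HZ);
            }
            return 0;
    }

    /* typically started with:
     *     kernel_thread(demo_worker, NULL, CLONE_FS | CLONE_FILES);
     */
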
454
455static void close_files(struct files_struct * files)
456{
457 int i, j;
458 struct fdtable *fdt;
459
460 j = 0;
461
462 /*
463 * It is safe to dereference the fd table without RCU or
464 * ->file_lock because this is the last reference to the
465 * files structure. But use RCU to shut RCU-lockdep up.
466 */
467 rcu_read_lock();
468 fdt = files_fdtable(files);
469 rcu_read_unlock();
470 for (;;) {
471 unsigned long set;
472 i = j * __NFDBITS;
473 if (i >= fdt->max_fds)
474 break;
475 set = fdt->open_fds->fds_bits[j++];
476 while (set) {
477 if (set & 1) {
478 struct file * file = xchg(&fdt->fd[i], NULL);
479 if (file) {
480 filp_close(file, files);
481 cond_resched();
482 }
483 }
484 i++;
485 set >>= 1;
486 }
487 }
488}
489
490struct files_struct *get_files_struct(struct task_struct *task)
491{
492 struct files_struct *files;
493
494 task_lock(task);
495 files = task->files;
496 if (files)
497 atomic_inc(&files->count);
498 task_unlock(task);
499
500 return files;
501}
502
503void put_files_struct(struct files_struct *files)
504{
505 struct fdtable *fdt;
506
507 if (atomic_dec_and_test(&files->count)) {
508 close_files(files);
509 /*
510 * Free the fd and fdset arrays if we expanded them.
511 * If the fdtable was embedded, pass files for freeing
512 * at the end of the RCU grace period. Otherwise,
513 * you can free files immediately.
514 */
515 rcu_read_lock();
516 fdt = files_fdtable(files);
517 if (fdt != &files->fdtab)
518 kmem_cache_free(files_cachep, files);
519 free_fdtable(fdt);
520 rcu_read_unlock();
521 }
522}
523
524void reset_files_struct(struct files_struct *files)
525{
526 struct task_struct *tsk = current;
527 struct files_struct *old;
528
529 old = tsk->files;
530 task_lock(tsk);
531 tsk->files = files;
532 task_unlock(tsk);
533 put_files_struct(old);
534}
535
536void exit_files(struct task_struct *tsk)
537{
538 struct files_struct * files = tsk->files;
539
540 if (files) {
541 task_lock(tsk);
542 tsk->files = NULL;
543 task_unlock(tsk);
544 put_files_struct(files);
545 }
546}
547
363#ifdef CONFIG_MM_OWNER 548#ifdef CONFIG_MM_OWNER
364/* 549/*
365 * A task is exiting. If it owned this mm, find a new owner for the mm. 550 * A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -456,7 +641,6 @@ static void exit_mm(struct task_struct * tsk)
456 mm_release(tsk, mm); 641 mm_release(tsk, mm);
457 if (!mm) 642 if (!mm)
458 return; 643 return;
459 sync_mm_rss(mm);
460 /* 644 /*
461 * Serialize with any possible pending coredump. 645 * Serialize with any possible pending coredump.
462 * We must hold mmap_sem around checking core_state 646 * We must hold mmap_sem around checking core_state
@@ -495,17 +679,21 @@ static void exit_mm(struct task_struct * tsk)
495 tsk->mm = NULL; 679 tsk->mm = NULL;
496 up_read(&mm->mmap_sem); 680 up_read(&mm->mmap_sem);
497 enter_lazy_tlb(mm, current); 681 enter_lazy_tlb(mm, current);
682 /* We don't want this task to be frozen prematurely */
683 clear_freeze_flag(tsk);
684 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
685 atomic_dec(&mm->oom_disable_count);
498 task_unlock(tsk); 686 task_unlock(tsk);
499 mm_update_next_owner(mm); 687 mm_update_next_owner(mm);
500 mmput(mm); 688 mmput(mm);
501} 689}
502 690
503/* 691/*
504 * When we die, we re-parent all our children, and try to: 692 * When we die, we re-parent all our children.
505 * 1. give them to another thread in our thread group, if such a member exists 693 * Try to give them to another thread in our thread
506 * 2. give it to the first ancestor process which prctl'd itself as a 694 * group, and if no such member exists, give it to
507 * child_subreaper for its children (like a service manager) 695 * the child reaper process (ie "init") in our pid
508 * 3. give it to the init process (PID 1) in our pid namespace 696 * space.
509 */ 697 */
510static struct task_struct *find_new_reaper(struct task_struct *father) 698static struct task_struct *find_new_reaper(struct task_struct *father)
511 __releases(&tasklist_lock) 699 __releases(&tasklist_lock)
@@ -525,37 +713,17 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
525 713
526 if (unlikely(pid_ns->child_reaper == father)) { 714 if (unlikely(pid_ns->child_reaper == father)) {
527 write_unlock_irq(&tasklist_lock); 715 write_unlock_irq(&tasklist_lock);
528 if (unlikely(pid_ns == &init_pid_ns)) { 716 if (unlikely(pid_ns == &init_pid_ns))
529 panic("Attempted to kill init! exitcode=0x%08x\n", 717 panic("Attempted to kill init!");
530 father->signal->group_exit_code ?:
531 father->exit_code);
532 }
533 718
534 zap_pid_ns_processes(pid_ns); 719 zap_pid_ns_processes(pid_ns);
535 write_lock_irq(&tasklist_lock); 720 write_lock_irq(&tasklist_lock);
536 } else if (father->signal->has_child_subreaper) {
537 struct task_struct *reaper;
538
539 /* 721 /*
540 * Find the first ancestor marked as child_subreaper. 722 * We can not clear ->child_reaper or leave it alone.
541 * Note that the code below checks same_thread_group(reaper, 723 * There may be stealth EXIT_DEAD tasks on ->children,
542 * pid_ns->child_reaper). This is what we need to DTRT in a 724 * forget_original_parent() must move them somewhere.
543 * PID namespace. However we still need the check above, see
544 * http://marc.info/?l=linux-kernel&m=131385460420380
545 */ 725 */
546 for (reaper = father->real_parent; 726 pid_ns->child_reaper = init_pid_ns.child_reaper;
547 reaper != &init_task;
548 reaper = reaper->real_parent) {
549 if (same_thread_group(reaper, pid_ns->child_reaper))
550 break;
551 if (!reaper->signal->is_child_subreaper)
552 continue;
553 thread = reaper;
554 do {
555 if (!(thread->flags & PF_EXITING))
556 return reaper;
557 } while_each_thread(reaper, thread);
558 }
559 } 727 }
560 728
561 return pid_ns->child_reaper; 729 return pid_ns->child_reaper;
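The removed branch above implemented the child_subreaper walk set up by prctl(PR_SET_CHILD_SUBREAPER); the replacement falls straight back to the pid namespace's child reaper. The runnable sketch below shows the userspace-visible effect of the subreaper mechanism on kernels that have it (3.4 and later): an orphaned grandchild is reparented to the nearest subreaper instead of init.

/* Demonstrates the reparenting behaviour discussed in find_new_reaper():
 * with PR_SET_CHILD_SUBREAPER set (Linux >= 3.4), an orphaned grandchild
 * is reparented to this process instead of to the pid-1 reaper. */
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER 36
#endif

int main(void)
{
	pid_t child;

	/* Mark ourselves as a subreaper for all descendants. */
	if (prctl(PR_SET_CHILD_SUBREAPER, 1) != 0)
		perror("prctl");	/* older kernel: grandchild goes to init */

	child = fork();
	if (child == 0) {
		/* Child: create a grandchild, then exit to orphan it. */
		if (fork() == 0) {
			sleep(1);	/* give the kernel time to reparent us */
			printf("grandchild: new parent is pid %d\n", getppid());
			_exit(0);
		}
		_exit(0);
	}

	waitpid(child, NULL, 0);	/* reap the middle process */
	sleep(2);			/* let the grandchild report and exit */

	/* The orphaned grandchild is now our child; reap it too. */
	while (waitpid(-1, NULL, 0) > 0)
		;
	return 0;
}
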
@@ -653,6 +821,25 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
653 if (group_dead) 821 if (group_dead)
654 kill_orphaned_pgrp(tsk->group_leader, NULL); 822 kill_orphaned_pgrp(tsk->group_leader, NULL);
655 823
824 /* Let father know we died
825 *
826 * Thread signals are configurable, but you aren't going to use
827 * that to send signals to arbitrary processes.
828 * That stops right now.
829 *
830 * If the parent exec id doesn't match the exec id we saved
831 * when we started then we know the parent has changed security
832 * domain.
833 *
834 * If our self_exec id doesn't match our parent_exec_id then
835 * we have changed execution domain as these two values started
836 * the same after a fork.
837 */
838 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
839 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
840 tsk->self_exec_id != tsk->parent_exec_id))
841 tsk->exit_signal = SIGCHLD;
842
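The block added above forces exit_signal back to SIGCHLD when the parent's exec id no longer matches, because whatever signal a child requested at clone() time is otherwise delivered to the parent verbatim. The sketch below shows that knob from userspace, creating a child with SIGUSR1 as its termination signal; it only illustrates what exit_signal controls, not the exec-id check itself.

/* Shows what tsk->exit_signal controls: clone() lets the child pick the
 * signal its parent receives on exit (SIGUSR1 here instead of SIGCHLD).
 * This only illustrates the knob, not the parent_exec_id check above. */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static volatile sig_atomic_t got_usr1;

static void on_usr1(int sig)
{
	(void)sig;
	got_usr1 = 1;
}

static int child_fn(void *arg)
{
	(void)arg;
	return 0;		/* exit immediately; parent gets SIGUSR1 */
}

int main(void)
{
	char *stack = malloc(64 * 1024);
	pid_t pid;

	signal(SIGUSR1, on_usr1);

	/* The termination signal is the low byte of the flags argument. */
	pid = clone(child_fn, stack + 64 * 1024, SIGUSR1, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}

	/* Children with a non-SIGCHLD exit signal need __WALL to be waited. */
	waitpid(pid, NULL, __WALL);
	printf("parent: got SIGUSR1 = %d\n", (int)got_usr1);
	free(stack);
	return 0;
}
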
656 if (unlikely(tsk->ptrace)) { 843 if (unlikely(tsk->ptrace)) {
657 int sig = thread_group_leader(tsk) && 844 int sig = thread_group_leader(tsk) &&
658 thread_group_empty(tsk) && 845 thread_group_empty(tsk) &&
@@ -692,9 +879,9 @@ static void check_stack_usage(void)
692 879
693 spin_lock(&low_water_lock); 880 spin_lock(&low_water_lock);
694 if (free < lowest_to_date) { 881 if (free < lowest_to_date) {
695 printk(KERN_WARNING "%s (%d) used greatest stack depth: " 882 printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
696 "%lu bytes left\n", 883 "left\n",
697 current->comm, task_pid_nr(current), free); 884 current->comm, free);
698 lowest_to_date = free; 885 lowest_to_date = free;
699 } 886 }
700 spin_unlock(&low_water_lock); 887 spin_unlock(&low_water_lock);
@@ -703,7 +890,7 @@ static void check_stack_usage(void)
703static inline void check_stack_usage(void) {} 890static inline void check_stack_usage(void) {}
704#endif 891#endif
705 892
706void do_exit(long code) 893NORET_TYPE void do_exit(long code)
707{ 894{
708 struct task_struct *tsk = current; 895 struct task_struct *tsk = current;
709 int group_dead; 896 int group_dead;
@@ -751,6 +938,8 @@ void do_exit(long code)
751 schedule(); 938 schedule();
752 } 939 }
753 940
941 exit_irq_thread();
942
754 exit_signals(tsk); /* sets PF_EXITING */ 943 exit_signals(tsk); /* sets PF_EXITING */
755 /* 944 /*
756 * tsk->flags are checked in the futex code to protect against 945 * tsk->flags are checked in the futex code to protect against
@@ -767,7 +956,7 @@ void do_exit(long code)
767 acct_update_integrals(tsk); 956 acct_update_integrals(tsk);
768 /* sync mm's RSS info before statistics gathering */ 957 /* sync mm's RSS info before statistics gathering */
769 if (tsk->mm) 958 if (tsk->mm)
770 sync_mm_rss(tsk->mm); 959 sync_mm_rss(tsk, tsk->mm);
771 group_dead = atomic_dec_and_test(&tsk->signal->live); 960 group_dead = atomic_dec_and_test(&tsk->signal->live);
772 if (group_dead) { 961 if (group_dead) {
773 hrtimer_cancel(&tsk->signal->real_timer); 962 hrtimer_cancel(&tsk->signal->real_timer);
@@ -778,7 +967,8 @@ void do_exit(long code)
778 acct_collect(code, group_dead); 967 acct_collect(code, group_dead);
779 if (group_dead) 968 if (group_dead)
780 tty_audit_exit(); 969 tty_audit_exit();
781 audit_free(tsk); 970 if (unlikely(tsk->audit_context))
971 audit_free(tsk);
782 972
783 tsk->exit_code = code; 973 tsk->exit_code = code;
784 taskstats_exit(tsk, group_dead); 974 taskstats_exit(tsk, group_dead);
@@ -793,7 +983,6 @@ void do_exit(long code)
793 exit_shm(tsk); 983 exit_shm(tsk);
794 exit_files(tsk); 984 exit_files(tsk);
795 exit_fs(tsk); 985 exit_fs(tsk);
796 exit_task_work(tsk);
797 check_stack_usage(); 986 check_stack_usage();
798 exit_thread(); 987 exit_thread();
799 988
@@ -847,34 +1036,12 @@ void do_exit(long code)
847 if (tsk->splice_pipe) 1036 if (tsk->splice_pipe)
848 __free_pipe_info(tsk->splice_pipe); 1037 __free_pipe_info(tsk->splice_pipe);
849 1038
850 if (tsk->task_frag.page)
851 put_page(tsk->task_frag.page);
852
853 validate_creds_for_do_exit(tsk); 1039 validate_creds_for_do_exit(tsk);
854 1040
855 preempt_disable(); 1041 preempt_disable();
856 if (tsk->nr_dirtied)
857 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
858 exit_rcu(); 1042 exit_rcu();
859
860 /*
861 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
862 * when the following two conditions become true.
863 * - There is a race condition on mmap_sem (it is acquired by
864 * exit_mm()), and
865 * SMI occurs before setting TASK_RUNNING.
866 * (or the hypervisor of a virtual machine switches to another guest)
867 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
868 *
869 * To avoid it, we have to wait for releasing tsk->pi_lock which
870 * is held by try_to_wake_up()
871 */
872 smp_mb();
873 raw_spin_unlock_wait(&tsk->pi_lock);
874
875 /* causes final put_task_struct in finish_task_switch(). */ 1043 /* causes final put_task_struct in finish_task_switch(). */
876 tsk->state = TASK_DEAD; 1044 tsk->state = TASK_DEAD;
877 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
878 schedule(); 1045 schedule();
879 BUG(); 1046 BUG();
880 /* Avoid "noreturn function does return". */ 1047 /* Avoid "noreturn function does return". */
@@ -884,7 +1051,7 @@ void do_exit(long code)
884 1051
885EXPORT_SYMBOL_GPL(do_exit); 1052EXPORT_SYMBOL_GPL(do_exit);
886 1053
887void complete_and_exit(struct completion *comp, long code) 1054NORET_TYPE void complete_and_exit(struct completion *comp, long code)
888{ 1055{
889 if (comp) 1056 if (comp)
890 complete(comp); 1057 complete(comp);
@@ -903,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
903 * Take down every thread in the group. This is called by fatal signals 1070 * Take down every thread in the group. This is called by fatal signals
904 * as well as by sys_exit_group (below). 1071 * as well as by sys_exit_group (below).
905 */ 1072 */
906void 1073NORET_TYPE void
907do_group_exit(int exit_code) 1074do_group_exit(int exit_code)
908{ 1075{
909 struct signal_struct *sig = current->signal; 1076 struct signal_struct *sig = current->signal;
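As the comment above says, do_group_exit() takes down every thread in the group, which is exactly what a single thread calling the exit_group system call does from userspace. A minimal sketch follows; build it with -pthread, run it and check the shell's exit status: it is 7, and the main thread never gets past its first couple of iterations.

/* One thread calling exit_group() terminates the whole thread group,
 * which is what do_group_exit() implements. Build with -pthread and
 * run "./a.out; echo $?" to see exit status 7. */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *killer(void *arg)
{
	(void)arg;
	sleep(1);
	syscall(SYS_exit_group, 7);	/* ends every thread, status 7 */
	return NULL;			/* never reached */
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, killer, NULL);
	for (;;) {
		printf("main thread still running\n");
		sleep(1);
	}
	/* never reached: the whole group exits with status 7 */
}
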
@@ -1024,7 +1191,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1024 unsigned long state; 1191 unsigned long state;
1025 int retval, status, traced; 1192 int retval, status, traced;
1026 pid_t pid = task_pid_vnr(p); 1193 pid_t pid = task_pid_vnr(p);
1027 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1194 uid_t uid = __task_cred(p)->uid;
1028 struct siginfo __user *infop; 1195 struct siginfo __user *infop;
1029 1196
1030 if (!likely(wo->wo_flags & WEXITED)) 1197 if (!likely(wo->wo_flags & WEXITED))
@@ -1082,17 +1249,27 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1082 * as other threads in the parent group can be right 1249 * as other threads in the parent group can be right
1083 * here reaping other children at the same time. 1250 * here reaping other children at the same time.
1084 * 1251 *
1085 * We use thread_group_cputime_adjusted() to get times for the thread 1252 * We use thread_group_times() to get times for the thread
1086 * group, which consolidates times for all threads in the 1253 * group, which consolidates times for all threads in the
1087 * group including the group leader. 1254 * group including the group leader.
1088 */ 1255 */
1089 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1256 thread_group_times(p, &tgutime, &tgstime);
1090 spin_lock_irq(&p->real_parent->sighand->siglock); 1257 spin_lock_irq(&p->real_parent->sighand->siglock);
1091 psig = p->real_parent->signal; 1258 psig = p->real_parent->signal;
1092 sig = p->signal; 1259 sig = p->signal;
1093 psig->cutime += tgutime + sig->cutime; 1260 psig->cutime =
1094 psig->cstime += tgstime + sig->cstime; 1261 cputime_add(psig->cutime,
1095 psig->cgtime += p->gtime + sig->gtime + sig->cgtime; 1262 cputime_add(tgutime,
1263 sig->cutime));
1264 psig->cstime =
1265 cputime_add(psig->cstime,
1266 cputime_add(tgstime,
1267 sig->cstime));
1268 psig->cgtime =
1269 cputime_add(psig->cgtime,
1270 cputime_add(p->gtime,
1271 cputime_add(sig->gtime,
1272 sig->cgtime)));
1096 psig->cmin_flt += 1273 psig->cmin_flt +=
1097 p->min_flt + sig->min_flt + sig->cmin_flt; 1274 p->min_flt + sig->min_flt + sig->cmin_flt;
1098 psig->cmaj_flt += 1275 psig->cmaj_flt +=
@@ -1237,7 +1414,7 @@ static int wait_task_stopped(struct wait_opts *wo,
1237 if (!unlikely(wo->wo_flags & WNOWAIT)) 1414 if (!unlikely(wo->wo_flags & WNOWAIT))
1238 *p_code = 0; 1415 *p_code = 0;
1239 1416
1240 uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1417 uid = task_uid(p);
1241unlock_sig: 1418unlock_sig:
1242 spin_unlock_irq(&p->sighand->siglock); 1419 spin_unlock_irq(&p->sighand->siglock);
1243 if (!exit_code) 1420 if (!exit_code)
@@ -1310,7 +1487,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1310 } 1487 }
1311 if (!unlikely(wo->wo_flags & WNOWAIT)) 1488 if (!unlikely(wo->wo_flags & WNOWAIT))
1312 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1489 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1313 uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1490 uid = task_uid(p);
1314 spin_unlock_irq(&p->sighand->siglock); 1491 spin_unlock_irq(&p->sighand->siglock);
1315 1492
1316 pid = task_pid_vnr(p); 1493 pid = task_pid_vnr(p);
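The wait_task_zombie() hunk above folds the dead child's user and system time into the parent's signal struct (psig->cutime/cstime, via the older opaque cputime_add() API on the right-hand column). Userspace observes that accumulation through getrusage(RUSAGE_CHILDREN) after reaping the child; a small fork-and-reap demo, purely illustrative:

/* The cutime/cstime accumulation done in wait_task_zombie() is what
 * getrusage(RUSAGE_CHILDREN) reports after the child has been reaped.
 * The child burns a little CPU; the parent reaps it and prints the
 * accumulated user time. */
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct rusage ru;
	pid_t pid = fork();

	if (pid == 0) {
		/* Child: spin for a while to accumulate user time. */
		volatile unsigned long x = 0;
		for (unsigned long i = 0; i < 200000000UL; i++)
			x += i;
		_exit(0);
	}

	waitpid(pid, NULL, 0);			/* reap: times are folded in */
	getrusage(RUSAGE_CHILDREN, &ru);	/* read the accumulated values */
	printf("children user time: %ld.%06ld s\n",
	       (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec);
	return 0;
}
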
diff --git a/kernel/extable.c b/kernel/extable.c
index fe35a634bf7..5339705b824 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -35,16 +35,10 @@ DEFINE_MUTEX(text_mutex);
35extern struct exception_table_entry __start___ex_table[]; 35extern struct exception_table_entry __start___ex_table[];
36extern struct exception_table_entry __stop___ex_table[]; 36extern struct exception_table_entry __stop___ex_table[];
37 37
38/* Cleared by build time tools if the table is already sorted. */
39u32 __initdata main_extable_sort_needed = 1;
40
41/* Sort the kernel's built-in exception table */ 38/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 39void __init sort_main_extable(void)
43{ 40{
44 if (main_extable_sort_needed) 41 sort_extable(__start___ex_table, __stop___ex_table);
45 sort_extable(__start___ex_table, __stop___ex_table);
46 else
47 pr_notice("__ex_table already sorted, skipping sort\n");
48} 42}
49 43
50/* Given an address, look for it in the exception tables. */ 44/* Given an address, look for it in the exception tables. */
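sort_main_extable() orders the built-in exception table once at boot so that later faulting-address lookups can binary-search it; the removed lines merely let a build-time tool skip the sort when the table is already ordered. The userspace sketch below mirrors the sort-then-search idea with qsort()/bsearch() on a small table of {fault address, fixup} pairs; the entry layout is invented for the demo.

/* Userspace analogue of sort_main_extable() plus the fixup lookup:
 * sort a table of {fault address, fixup} entries once, then
 * binary-search it. The entry layout here is invented. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct ex_entry {
	uintptr_t insn;		/* faulting instruction address */
	uintptr_t fixup;	/* where to resume */
};

static int cmp_entry(const void *a, const void *b)
{
	const struct ex_entry *ea = a, *eb = b;

	return (ea->insn > eb->insn) - (ea->insn < eb->insn);
}

static int cmp_key(const void *key, const void *elt)
{
	uintptr_t addr = *(const uintptr_t *)key;
	const struct ex_entry *e = elt;

	return (addr > e->insn) - (addr < e->insn);
}

int main(void)
{
	struct ex_entry table[] = {
		{ 0x4000, 0x5000 }, { 0x1000, 0x2000 }, { 0x3000, 0x4500 },
	};
	size_t n = sizeof(table) / sizeof(table[0]);
	uintptr_t fault = 0x3000;
	struct ex_entry *hit;

	/* sort_main_extable() step: order by faulting address. */
	qsort(table, n, sizeof(table[0]), cmp_entry);

	/* search_exception_tables() step: binary search by address. */
	hit = bsearch(&fault, table, n, sizeof(table[0]), cmp_key);
	if (hit)
		printf("fault at %#lx -> fixup %#lx\n",
		       (unsigned long)fault, (unsigned long)hit->fixup);
	return 0;
}
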
diff --git a/kernel/fork.c b/kernel/fork.c
index 65ca6d27f24..f65fa0627c0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,7 +34,6 @@
34#include <linux/cgroup.h> 34#include <linux/cgroup.h>
35#include <linux/security.h> 35#include <linux/security.h>
36#include <linux/hugetlb.h> 36#include <linux/hugetlb.h>
37#include <linux/seccomp.h>
38#include <linux/swap.h> 37#include <linux/swap.h>
39#include <linux/syscalls.h> 38#include <linux/syscalls.h>
40#include <linux/jiffies.h> 39#include <linux/jiffies.h>
@@ -48,7 +47,6 @@
48#include <linux/audit.h> 47#include <linux/audit.h>
49#include <linux/memcontrol.h> 48#include <linux/memcontrol.h>
50#include <linux/ftrace.h> 49#include <linux/ftrace.h>
51#include <linux/proc_fs.h>
52#include <linux/profile.h> 50#include <linux/profile.h>
53#include <linux/rmap.h> 51#include <linux/rmap.h>
54#include <linux/ksm.h> 52#include <linux/ksm.h>
@@ -68,8 +66,6 @@
68#include <linux/user-return-notifier.h> 66#include <linux/user-return-notifier.h>
69#include <linux/oom.h> 67#include <linux/oom.h>
70#include <linux/khugepaged.h> 68#include <linux/khugepaged.h>
71#include <linux/signalfd.h>
72#include <linux/uprobes.h>
73 69
74#include <asm/pgtable.h> 70#include <asm/pgtable.h>
75#include <asm/pgalloc.h> 71#include <asm/pgalloc.h>
@@ -80,9 +76,6 @@
80 76
81#include <trace/events/sched.h> 77#include <trace/events/sched.h>
82 78
83#define CREATE_TRACE_POINTS
84#include <trace/events/task.h>
85
86/* 79/*
87 * Protected counters by write_lock_irq(&tasklist_lock) 80 * Protected counters by write_lock_irq(&tasklist_lock)
88 */ 81 */
@@ -114,69 +107,32 @@ int nr_processes(void)
114 return total; 107 return total;
115} 108}
116 109
117void __weak arch_release_task_struct(struct task_struct *tsk) 110#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
118{ 111# define alloc_task_struct_node(node) \
119} 112 kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
120 113# define free_task_struct(tsk) \
121#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 114 kmem_cache_free(task_struct_cachep, (tsk))
122static struct kmem_cache *task_struct_cachep; 115static struct kmem_cache *task_struct_cachep;
123
124static inline struct task_struct *alloc_task_struct_node(int node)
125{
126 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
127}
128
129static inline void free_task_struct(struct task_struct *tsk)
130{
131 kmem_cache_free(task_struct_cachep, tsk);
132}
133#endif 116#endif
134 117
135void __weak arch_release_thread_info(struct thread_info *ti) 118#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
136{
137}
138
139#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
140
141/*
142 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
143 * kmemcache based allocator.
144 */
145# if THREAD_SIZE >= PAGE_SIZE
146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 119static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
147 int node) 120 int node)
148{ 121{
149 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, 122#ifdef CONFIG_DEBUG_STACK_USAGE
150 THREAD_SIZE_ORDER); 123 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
124#else
125 gfp_t mask = GFP_KERNEL;
126#endif
127 struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
151 128
152 return page ? page_address(page) : NULL; 129 return page ? page_address(page) : NULL;
153} 130}
154 131
155static inline void free_thread_info(struct thread_info *ti) 132static inline void free_thread_info(struct thread_info *ti)
156{ 133{
157 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); 134 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
158}
159# else
160static struct kmem_cache *thread_info_cache;
161
162static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
163 int node)
164{
165 return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
166}
167
168static void free_thread_info(struct thread_info *ti)
169{
170 kmem_cache_free(thread_info_cache, ti);
171} 135}
172
173void thread_info_cache_init(void)
174{
175 thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
176 THREAD_SIZE, 0, NULL);
177 BUG_ON(thread_info_cache == NULL);
178}
179# endif
180#endif 136#endif
181 137
182/* SLAB cache for signal_struct structures (tsk->signal) */ 138/* SLAB cache for signal_struct structures (tsk->signal) */
@@ -197,6 +153,9 @@ struct kmem_cache *vm_area_cachep;
197/* SLAB cache for mm_struct structures (tsk->mm) */ 153/* SLAB cache for mm_struct structures (tsk->mm) */
198static struct kmem_cache *mm_cachep; 154static struct kmem_cache *mm_cachep;
199 155
156/* Notifier list called when a task struct is freed */
157static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
158
200static void account_kernel_stack(struct thread_info *ti, int account) 159static void account_kernel_stack(struct thread_info *ti, int account)
201{ 160{
202 struct zone *zone = page_zone(virt_to_page(ti)); 161 struct zone *zone = page_zone(virt_to_page(ti));
@@ -206,13 +165,11 @@ static void account_kernel_stack(struct thread_info *ti, int account)
206 165
207void free_task(struct task_struct *tsk) 166void free_task(struct task_struct *tsk)
208{ 167{
168 prop_local_destroy_single(&tsk->dirties);
209 account_kernel_stack(tsk->stack, -1); 169 account_kernel_stack(tsk->stack, -1);
210 arch_release_thread_info(tsk->stack);
211 free_thread_info(tsk->stack); 170 free_thread_info(tsk->stack);
212 rt_mutex_debug_task_free(tsk); 171 rt_mutex_debug_task_free(tsk);
213 ftrace_graph_exit_task(tsk); 172 ftrace_graph_exit_task(tsk);
214 put_seccomp_filter(tsk);
215 arch_release_task_struct(tsk);
216 free_task_struct(tsk); 173 free_task_struct(tsk);
217} 174}
218EXPORT_SYMBOL(free_task); 175EXPORT_SYMBOL(free_task);
@@ -230,27 +187,45 @@ static inline void put_signal_struct(struct signal_struct *sig)
230 free_signal_struct(sig); 187 free_signal_struct(sig);
231} 188}
232 189
190int task_free_register(struct notifier_block *n)
191{
192 return atomic_notifier_chain_register(&task_free_notifier, n);
193}
194EXPORT_SYMBOL(task_free_register);
195
196int task_free_unregister(struct notifier_block *n)
197{
198 return atomic_notifier_chain_unregister(&task_free_notifier, n);
199}
200EXPORT_SYMBOL(task_free_unregister);
201
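The lines added to fork.c introduce a task_free_notifier chain so that other code (the Android/Tegra trees rely on hooks like this) can be called back from __put_task_struct() via task_free_register(). The sketch below shows the general notifier-chain shape (register, unregister, call) in plain single-threaded C; the names are invented, and the kernel's atomic notifier chain additionally handles concurrency, which this demo does not.

/* A tiny, single-threaded sketch of the notifier-chain pattern that
 * task_free_register()/task_free_unregister() expose: callbacks live
 * on a list and all of them run when the chain is called. */
#include <stdio.h>

struct notifier_block_demo {
	int (*call)(struct notifier_block_demo *nb, void *data);
	struct notifier_block_demo *next;
};

static struct notifier_block_demo *chain_head;

static void chain_register(struct notifier_block_demo *nb)
{
	nb->next = chain_head;
	chain_head = nb;
}

static void chain_unregister(struct notifier_block_demo *nb)
{
	struct notifier_block_demo **p = &chain_head;

	while (*p && *p != nb)
		p = &(*p)->next;
	if (*p)
		*p = nb->next;
}

static void chain_call(void *data)
{
	struct notifier_block_demo *nb;

	for (nb = chain_head; nb; nb = nb->next)
		nb->call(nb, data);
}

static int on_task_free(struct notifier_block_demo *nb, void *data)
{
	(void)nb;
	printf("task %p is being freed\n", data);
	return 0;
}

int main(void)
{
	struct notifier_block_demo nb = { .call = on_task_free };
	int fake_task;

	chain_register(&nb);
	chain_call(&fake_task);		/* fires on_task_free */
	chain_unregister(&nb);
	chain_call(&fake_task);		/* nothing registered any more */
	return 0;
}
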
233void __put_task_struct(struct task_struct *tsk) 202void __put_task_struct(struct task_struct *tsk)
234{ 203{
235 WARN_ON(!tsk->exit_state); 204 WARN_ON(!tsk->exit_state);
236 WARN_ON(atomic_read(&tsk->usage)); 205 WARN_ON(atomic_read(&tsk->usage));
237 WARN_ON(tsk == current); 206 WARN_ON(tsk == current);
238 207
239 security_task_free(tsk);
240 exit_creds(tsk); 208 exit_creds(tsk);
241 delayacct_tsk_free(tsk); 209 delayacct_tsk_free(tsk);
242 put_signal_struct(tsk->signal); 210 put_signal_struct(tsk->signal);
243 211
212 atomic_notifier_call_chain(&task_free_notifier, 0, tsk);
244 if (!profile_handoff_task(tsk)) 213 if (!profile_handoff_task(tsk))
245 free_task(tsk); 214 free_task(tsk);
246} 215}
247EXPORT_SYMBOL_GPL(__put_task_struct); 216EXPORT_SYMBOL_GPL(__put_task_struct);
248 217
249void __init __weak arch_task_cache_init(void) { } 218/*
219 * macro override instead of weak attribute alias, to workaround
220 * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
221 */
222#ifndef arch_task_cache_init
223#define arch_task_cache_init()
224#endif
250 225
251void __init fork_init(unsigned long mempages) 226void __init fork_init(unsigned long mempages)
252{ 227{
253#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 228#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
254#ifndef ARCH_MIN_TASKALIGN 229#ifndef ARCH_MIN_TASKALIGN
255#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES 230#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
256#endif 231#endif
@@ -297,20 +272,28 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
297 int node = tsk_fork_get_node(orig); 272 int node = tsk_fork_get_node(orig);
298 int err; 273 int err;
299 274
275 prepare_to_copy(orig);
276
300 tsk = alloc_task_struct_node(node); 277 tsk = alloc_task_struct_node(node);
301 if (!tsk) 278 if (!tsk)
302 return NULL; 279 return NULL;
303 280
304 ti = alloc_thread_info_node(tsk, node); 281 ti = alloc_thread_info_node(tsk, node);
305 if (!ti) 282 if (!ti) {
306 goto free_tsk; 283 free_task_struct(tsk);
284 return NULL;
285 }
307 286
308 err = arch_dup_task_struct(tsk, orig); 287 err = arch_dup_task_struct(tsk, orig);
309 if (err) 288 if (err)
310 goto free_ti; 289 goto out;
311 290
312 tsk->stack = ti; 291 tsk->stack = ti;
313 292
293 err = prop_local_init_single(&tsk->dirties);
294 if (err)
295 goto out;
296
314 setup_thread_stack(tsk, orig); 297 setup_thread_stack(tsk, orig);
315 clear_user_return_notifier(tsk); 298 clear_user_return_notifier(tsk);
316 clear_tsk_need_resched(tsk); 299 clear_tsk_need_resched(tsk);
@@ -330,15 +313,13 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
330 tsk->btrace_seq = 0; 313 tsk->btrace_seq = 0;
331#endif 314#endif
332 tsk->splice_pipe = NULL; 315 tsk->splice_pipe = NULL;
333 tsk->task_frag.page = NULL;
334 316
335 account_kernel_stack(ti, 1); 317 account_kernel_stack(ti, 1);
336 318
337 return tsk; 319 return tsk;
338 320
339free_ti: 321out:
340 free_thread_info(ti); 322 free_thread_info(ti);
341free_tsk:
342 free_task_struct(tsk); 323 free_task_struct(tsk);
343 return NULL; 324 return NULL;
344} 325}
@@ -352,10 +333,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
352 unsigned long charge; 333 unsigned long charge;
353 struct mempolicy *pol; 334 struct mempolicy *pol;
354 335
355 uprobe_start_dup_mmap();
356 down_write(&oldmm->mmap_sem); 336 down_write(&oldmm->mmap_sem);
357 flush_cache_dup_mm(oldmm); 337 flush_cache_dup_mm(oldmm);
358 uprobe_dup_mmap(oldmm, mm);
359 /* 338 /*
360 * Not linked in yet - no deadlock potential: 339 * Not linked in yet - no deadlock potential:
361 */ 340 */
@@ -384,15 +363,16 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
384 struct file *file; 363 struct file *file;
385 364
386 if (mpnt->vm_flags & VM_DONTCOPY) { 365 if (mpnt->vm_flags & VM_DONTCOPY) {
366 long pages = vma_pages(mpnt);
367 mm->total_vm -= pages;
387 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 368 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
388 -vma_pages(mpnt)); 369 -pages);
389 continue; 370 continue;
390 } 371 }
391 charge = 0; 372 charge = 0;
392 if (mpnt->vm_flags & VM_ACCOUNT) { 373 if (mpnt->vm_flags & VM_ACCOUNT) {
393 unsigned long len = vma_pages(mpnt); 374 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
394 375 if (security_vm_enough_memory(len))
395 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
396 goto fail_nomem; 376 goto fail_nomem;
397 charge = len; 377 charge = len;
398 } 378 }
@@ -424,12 +404,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
424 mapping->i_mmap_writable++; 404 mapping->i_mmap_writable++;
425 flush_dcache_mmap_lock(mapping); 405 flush_dcache_mmap_lock(mapping);
426 /* insert tmp into the share list, just after mpnt */ 406 /* insert tmp into the share list, just after mpnt */
427 if (unlikely(tmp->vm_flags & VM_NONLINEAR)) 407 vma_prio_tree_add(tmp, mpnt);
428 vma_nonlinear_insert(tmp,
429 &mapping->i_mmap_nonlinear);
430 else
431 vma_interval_tree_insert_after(tmp, mpnt,
432 &mapping->i_mmap);
433 flush_dcache_mmap_unlock(mapping); 408 flush_dcache_mmap_unlock(mapping);
434 mutex_unlock(&mapping->i_mmap_mutex); 409 mutex_unlock(&mapping->i_mmap_mutex);
435 } 410 }
@@ -470,7 +445,6 @@ out:
470 up_write(&mm->mmap_sem); 445 up_write(&mm->mmap_sem);
471 flush_tlb_mm(oldmm); 446 flush_tlb_mm(oldmm);
472 up_write(&oldmm->mmap_sem); 447 up_write(&oldmm->mmap_sem);
473 uprobe_end_dup_mmap();
474 return retval; 448 return retval;
475fail_nomem_anon_vma_fork: 449fail_nomem_anon_vma_fork:
476 mpol_put(pol); 450 mpol_put(pol);
@@ -543,6 +517,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
543 mm->cached_hole_size = ~0UL; 517 mm->cached_hole_size = ~0UL;
544 mm_init_aio(mm); 518 mm_init_aio(mm);
545 mm_init_owner(mm, p); 519 mm_init_owner(mm, p);
520 atomic_set(&mm->oom_disable_count, 0);
546 521
547 if (likely(!mm_alloc_pgd(mm))) { 522 if (likely(!mm_alloc_pgd(mm))) {
548 mm->def_flags = 0; 523 mm->def_flags = 0;
@@ -554,23 +529,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
554 return NULL; 529 return NULL;
555} 530}
556 531
557static void check_mm(struct mm_struct *mm)
558{
559 int i;
560
561 for (i = 0; i < NR_MM_COUNTERS; i++) {
562 long x = atomic_long_read(&mm->rss_stat.count[i]);
563
564 if (unlikely(x))
565 printk(KERN_ALERT "BUG: Bad rss-counter state "
566 "mm:%p idx:%d val:%ld\n", mm, i, x);
567 }
568
569#ifdef CONFIG_TRANSPARENT_HUGEPAGE
570 VM_BUG_ON(mm->pmd_huge_pte);
571#endif
572}
573
574/* 532/*
575 * Allocate and initialize an mm_struct. 533 * Allocate and initialize an mm_struct.
576 */ 534 */
@@ -598,7 +556,9 @@ void __mmdrop(struct mm_struct *mm)
598 mm_free_pgd(mm); 556 mm_free_pgd(mm);
599 destroy_context(mm); 557 destroy_context(mm);
600 mmu_notifier_mm_destroy(mm); 558 mmu_notifier_mm_destroy(mm);
601 check_mm(mm); 559#ifdef CONFIG_TRANSPARENT_HUGEPAGE
560 VM_BUG_ON(mm->pmd_huge_pte);
561#endif
602 free_mm(mm); 562 free_mm(mm);
603} 563}
604EXPORT_SYMBOL_GPL(__mmdrop); 564EXPORT_SYMBOL_GPL(__mmdrop);
@@ -611,7 +571,6 @@ void mmput(struct mm_struct *mm)
611 might_sleep(); 571 might_sleep();
612 572
613 if (atomic_dec_and_test(&mm->mm_users)) { 573 if (atomic_dec_and_test(&mm->mm_users)) {
614 uprobe_clear_state(mm);
615 exit_aio(mm); 574 exit_aio(mm);
616 ksm_exit(mm); 575 ksm_exit(mm);
617 khugepaged_exit(mm); /* must run before exit_mmap */ 576 khugepaged_exit(mm); /* must run before exit_mmap */
@@ -622,6 +581,7 @@ void mmput(struct mm_struct *mm)
622 list_del(&mm->mmlist); 581 list_del(&mm->mmlist);
623 spin_unlock(&mmlist_lock); 582 spin_unlock(&mmlist_lock);
624 } 583 }
584 put_swap_token(mm);
625 if (mm->binfmt) 585 if (mm->binfmt)
626 module_put(mm->binfmt->module); 586 module_put(mm->binfmt->module);
627 mmdrop(mm); 587 mmdrop(mm);
@@ -629,6 +589,26 @@ void mmput(struct mm_struct *mm)
629} 589}
630EXPORT_SYMBOL_GPL(mmput); 590EXPORT_SYMBOL_GPL(mmput);
631 591
592/*
593 * We added or removed a vma mapping the executable. The vmas are only mapped
594 * during exec and are not mapped with the mmap system call.
595 * Callers must hold down_write() on the mm's mmap_sem for these
596 */
597void added_exe_file_vma(struct mm_struct *mm)
598{
599 mm->num_exe_file_vmas++;
600}
601
602void removed_exe_file_vma(struct mm_struct *mm)
603{
604 mm->num_exe_file_vmas--;
605 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
606 fput(mm->exe_file);
607 mm->exe_file = NULL;
608 }
609
610}
611
632void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) 612void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
633{ 613{
634 if (new_exe_file) 614 if (new_exe_file)
@@ -636,13 +616,15 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
636 if (mm->exe_file) 616 if (mm->exe_file)
637 fput(mm->exe_file); 617 fput(mm->exe_file);
638 mm->exe_file = new_exe_file; 618 mm->exe_file = new_exe_file;
619 mm->num_exe_file_vmas = 0;
639} 620}
640 621
641struct file *get_mm_exe_file(struct mm_struct *mm) 622struct file *get_mm_exe_file(struct mm_struct *mm)
642{ 623{
643 struct file *exe_file; 624 struct file *exe_file;
644 625
645 /* We need mmap_sem to protect against races with removal of exe_file */ 626 /* We need mmap_sem to protect against races with removal of
627 * VM_EXECUTABLE vmas */
646 down_read(&mm->mmap_sem); 628 down_read(&mm->mmap_sem);
647 exe_file = mm->exe_file; 629 exe_file = mm->exe_file;
648 if (exe_file) 630 if (exe_file)
@@ -684,58 +666,6 @@ struct mm_struct *get_task_mm(struct task_struct *task)
684} 666}
685EXPORT_SYMBOL_GPL(get_task_mm); 667EXPORT_SYMBOL_GPL(get_task_mm);
686 668
687struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
688{
689 struct mm_struct *mm;
690 int err;
691
692 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
693 if (err)
694 return ERR_PTR(err);
695
696 mm = get_task_mm(task);
697 if (mm && mm != current->mm &&
698 !ptrace_may_access(task, mode)) {
699 mmput(mm);
700 mm = ERR_PTR(-EACCES);
701 }
702 mutex_unlock(&task->signal->cred_guard_mutex);
703
704 return mm;
705}
706
707static void complete_vfork_done(struct task_struct *tsk)
708{
709 struct completion *vfork;
710
711 task_lock(tsk);
712 vfork = tsk->vfork_done;
713 if (likely(vfork)) {
714 tsk->vfork_done = NULL;
715 complete(vfork);
716 }
717 task_unlock(tsk);
718}
719
720static int wait_for_vfork_done(struct task_struct *child,
721 struct completion *vfork)
722{
723 int killed;
724
725 freezer_do_not_count();
726 killed = wait_for_completion_killable(vfork);
727 freezer_count();
728
729 if (killed) {
730 task_lock(child);
731 child->vfork_done = NULL;
732 task_unlock(child);
733 }
734
735 put_task_struct(child);
736 return killed;
737}
738
739/* Please note the differences between mmput and mm_release. 669/* Please note the differences between mmput and mm_release.
740 * mmput is called whenever we stop holding onto a mm_struct, 670 * mmput is called whenever we stop holding onto a mm_struct,
741 * error success whatever. 671 * error success whatever.
@@ -751,6 +681,8 @@ static int wait_for_vfork_done(struct task_struct *child,
751 */ 681 */
752void mm_release(struct task_struct *tsk, struct mm_struct *mm) 682void mm_release(struct task_struct *tsk, struct mm_struct *mm)
753{ 683{
684 struct completion *vfork_done = tsk->vfork_done;
685
754 /* Get rid of any futexes when releasing the mm */ 686 /* Get rid of any futexes when releasing the mm */
755#ifdef CONFIG_FUTEX 687#ifdef CONFIG_FUTEX
756 if (unlikely(tsk->robust_list)) { 688 if (unlikely(tsk->robust_list)) {
@@ -767,17 +699,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
767 exit_pi_state_list(tsk); 699 exit_pi_state_list(tsk);
768#endif 700#endif
769 701
770 uprobe_free_utask(tsk);
771
772 /* Get rid of any cached register state */ 702 /* Get rid of any cached register state */
773 deactivate_mm(tsk, mm); 703 deactivate_mm(tsk, mm);
774 704
705 /* notify parent sleeping on vfork() */
706 if (vfork_done) {
707 tsk->vfork_done = NULL;
708 complete(vfork_done);
709 }
710
775 /* 711 /*
776 * If we're exiting normally, clear a user-space tid field if 712 * If we're exiting normally, clear a user-space tid field if
777 * requested. We leave this alone when dying by signal, to leave 713 * requested. We leave this alone when dying by signal, to leave
778 * the value intact in a core dump, and to save the unnecessary 714 * the value intact in a core dump, and to save the unnecessary
779 * trouble, say, a killed vfork parent shouldn't touch this mm. 715 * trouble otherwise. Userland only wants this done for a sys_exit.
780 * Userland only wants this done for a sys_exit.
781 */ 716 */
782 if (tsk->clear_child_tid) { 717 if (tsk->clear_child_tid) {
783 if (!(tsk->flags & PF_SIGNALED) && 718 if (!(tsk->flags & PF_SIGNALED) &&
@@ -792,13 +727,6 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
792 } 727 }
793 tsk->clear_child_tid = NULL; 728 tsk->clear_child_tid = NULL;
794 } 729 }
795
796 /*
797 * All done, finally we can wake up parent and return this mm to him.
798 * Also kthread_stop() uses this completion for synchronization.
799 */
800 if (tsk->vfork_done)
801 complete_vfork_done(tsk);
802} 730}
803 731
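mm_release() is also where clear_child_tid is honoured: on a normal exit the kernel zeroes the user-supplied TID word and issues a futex wake, which is the primitive pthread_join() is built on (the clearing is only observable, and only done, when the address space is shared). Below is a raw-syscall sketch of that contract using CLONE_VM | CLONE_CHILD_CLEARTID and FUTEX_WAIT directly; treat it as illustrative, not production code.

/* The clear_child_tid contract handled in mm_release(): the child is
 * created with CLONE_CHILD_CLEARTID pointing at `ctid`; when it exits,
 * the kernel stores 0 there and futex-wakes any waiter. CLONE_VM is
 * needed so the parent sees the cleared word. Raw syscalls, demo only. */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

static pid_t ctid;	/* kernel writes the child TID here, then clears it */

static int child_fn(void *arg)
{
	(void)arg;
	sleep(1);
	return 0;	/* on exit the kernel zeroes ctid and wakes waiters */
}

int main(void)
{
	char *stack = malloc(64 * 1024);
	pid_t pid;

	pid = clone(child_fn, stack + 64 * 1024,
		    CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD,
		    NULL, NULL, NULL, &ctid);
	if (pid < 0) {
		perror("clone");
		return 1;
	}

	/* Wait until the kernel clears ctid on child exit (futex wake). */
	while (ctid != 0)
		syscall(SYS_futex, &ctid, FUTEX_WAIT, ctid, NULL, NULL, 0);

	printf("child %d exited, ctid cleared by the kernel\n", (int)pid);
	waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}
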
804/* 732/*
@@ -820,12 +748,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
820 memcpy(mm, oldmm, sizeof(*mm)); 748 memcpy(mm, oldmm, sizeof(*mm));
821 mm_init_cpumask(mm); 749 mm_init_cpumask(mm);
822 750
751 /* Initializing for Swap token stuff */
752 mm->token_priority = 0;
753 mm->last_interval = 0;
754
823#ifdef CONFIG_TRANSPARENT_HUGEPAGE 755#ifdef CONFIG_TRANSPARENT_HUGEPAGE
824 mm->pmd_huge_pte = NULL; 756 mm->pmd_huge_pte = NULL;
825#endif 757#endif
826#ifdef CONFIG_NUMA_BALANCING 758
827 mm->first_nid = NUMA_PTE_SCAN_INIT;
828#endif
829 if (!mm_init(mm, tsk)) 759 if (!mm_init(mm, tsk))
830 goto fail_nomem; 760 goto fail_nomem;
831 761
@@ -899,6 +829,12 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
899 goto fail_nomem; 829 goto fail_nomem;
900 830
901good_mm: 831good_mm:
832 /* Initializing for Swap token stuff */
833 mm->token_priority = 0;
834 mm->last_interval = 0;
835 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
836 atomic_inc(&mm->oom_disable_count);
837
902 tsk->mm = mm; 838 tsk->mm = mm;
903 tsk->active_mm = mm; 839 tsk->active_mm = mm;
904 return 0; 840 return 0;
@@ -958,7 +894,6 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
958{ 894{
959#ifdef CONFIG_BLOCK 895#ifdef CONFIG_BLOCK
960 struct io_context *ioc = current->io_context; 896 struct io_context *ioc = current->io_context;
961 struct io_context *new_ioc;
962 897
963 if (!ioc) 898 if (!ioc)
964 return 0; 899 return 0;
@@ -966,15 +901,15 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
966 * Share io context with parent, if CLONE_IO is set 901 * Share io context with parent, if CLONE_IO is set
967 */ 902 */
968 if (clone_flags & CLONE_IO) { 903 if (clone_flags & CLONE_IO) {
969 ioc_task_link(ioc); 904 tsk->io_context = ioc_task_link(ioc);
970 tsk->io_context = ioc; 905 if (unlikely(!tsk->io_context))
906 return -ENOMEM;
971 } else if (ioprio_valid(ioc->ioprio)) { 907 } else if (ioprio_valid(ioc->ioprio)) {
972 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); 908 tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
973 if (unlikely(!new_ioc)) 909 if (unlikely(!tsk->io_context))
974 return -ENOMEM; 910 return -ENOMEM;
975 911
976 new_ioc->ioprio = ioc->ioprio; 912 tsk->io_context->ioprio = ioc->ioprio;
977 put_io_context(new_ioc);
978 } 913 }
979#endif 914#endif
980 return 0; 915 return 0;
@@ -999,10 +934,8 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
999 934
1000void __cleanup_sighand(struct sighand_struct *sighand) 935void __cleanup_sighand(struct sighand_struct *sighand)
1001{ 936{
1002 if (atomic_dec_and_test(&sighand->count)) { 937 if (atomic_dec_and_test(&sighand->count))
1003 signalfd_cleanup(sighand);
1004 kmem_cache_free(sighand_cachep, sighand); 938 kmem_cache_free(sighand_cachep, sighand);
1005 }
1006} 939}
1007 940
1008 941
@@ -1044,6 +977,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1044 atomic_set(&sig->live, 1); 977 atomic_set(&sig->live, 1);
1045 atomic_set(&sig->sigcnt, 1); 978 atomic_set(&sig->sigcnt, 1);
1046 init_waitqueue_head(&sig->wait_chldexit); 979 init_waitqueue_head(&sig->wait_chldexit);
980 if (clone_flags & CLONE_NEWPID)
981 sig->flags |= SIGNAL_UNKILLABLE;
1047 sig->curr_target = tsk; 982 sig->curr_target = tsk;
1048 init_sigpending(&sig->shared_pending); 983 init_sigpending(&sig->shared_pending);
1049 INIT_LIST_HEAD(&sig->posix_timers); 984 INIT_LIST_HEAD(&sig->posix_timers);
@@ -1061,15 +996,13 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1061 sched_autogroup_fork(sig); 996 sched_autogroup_fork(sig);
1062 997
1063#ifdef CONFIG_CGROUPS 998#ifdef CONFIG_CGROUPS
1064 init_rwsem(&sig->group_rwsem); 999 init_rwsem(&sig->threadgroup_fork_lock);
1065#endif 1000#endif
1066 1001
1002 sig->oom_adj = current->signal->oom_adj;
1067 sig->oom_score_adj = current->signal->oom_score_adj; 1003 sig->oom_score_adj = current->signal->oom_score_adj;
1068 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1004 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1069 1005
1070 sig->has_child_subreaper = current->signal->has_child_subreaper ||
1071 current->signal->is_child_subreaper;
1072
1073 mutex_init(&sig->cred_guard_mutex); 1006 mutex_init(&sig->cred_guard_mutex);
1074 1007
1075 return 0; 1008 return 0;
@@ -1081,7 +1014,9 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
1081 1014
1082 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); 1015 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
1083 new_flags |= PF_FORKNOEXEC; 1016 new_flags |= PF_FORKNOEXEC;
1017 new_flags |= PF_STARTING;
1084 p->flags = new_flags; 1018 p->flags = new_flags;
1019 clear_freeze_flag(p);
1085} 1020}
1086 1021
1087SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1022SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1112,8 +1047,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1112 */ 1047 */
1113static void posix_cpu_timers_init(struct task_struct *tsk) 1048static void posix_cpu_timers_init(struct task_struct *tsk)
1114{ 1049{
1115 tsk->cputime_expires.prof_exp = 0; 1050 tsk->cputime_expires.prof_exp = cputime_zero;
1116 tsk->cputime_expires.virt_exp = 0; 1051 tsk->cputime_expires.virt_exp = cputime_zero;
1117 tsk->cputime_expires.sched_exp = 0; 1052 tsk->cputime_expires.sched_exp = 0;
1118 INIT_LIST_HEAD(&tsk->cpu_timers[0]); 1053 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
1119 INIT_LIST_HEAD(&tsk->cpu_timers[1]); 1054 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1130,6 +1065,7 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
1130 */ 1065 */
1131static struct task_struct *copy_process(unsigned long clone_flags, 1066static struct task_struct *copy_process(unsigned long clone_flags,
1132 unsigned long stack_start, 1067 unsigned long stack_start,
1068 struct pt_regs *regs,
1133 unsigned long stack_size, 1069 unsigned long stack_size,
1134 int __user *child_tidptr, 1070 int __user *child_tidptr,
1135 struct pid *pid, 1071 struct pid *pid,
@@ -1137,6 +1073,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1137{ 1073{
1138 int retval; 1074 int retval;
1139 struct task_struct *p; 1075 struct task_struct *p;
1076 int cgroup_callbacks_done = 0;
1140 1077
1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1078 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1142 return ERR_PTR(-EINVAL); 1079 return ERR_PTR(-EINVAL);
@@ -1166,14 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1166 current->signal->flags & SIGNAL_UNKILLABLE) 1103 current->signal->flags & SIGNAL_UNKILLABLE)
1167 return ERR_PTR(-EINVAL); 1104 return ERR_PTR(-EINVAL);
1168 1105
1169 /*
1170 * If the new process will be in a different pid namespace
1171 * don't allow the creation of threads.
1172 */
1173 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
1174 (task_active_pid_ns(current) != current->nsproxy->pid_ns))
1175 return ERR_PTR(-EINVAL);
1176
1177 retval = security_task_create(clone_flags); 1106 retval = security_task_create(clone_flags);
1178 if (retval) 1107 if (retval)
1179 goto fork_out; 1108 goto fork_out;
@@ -1184,7 +1113,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1184 goto fork_out; 1113 goto fork_out;
1185 1114
1186 ftrace_graph_init_task(p); 1115 ftrace_graph_init_task(p);
1187 get_seccomp_filter(p);
1188 1116
1189 rt_mutex_init_task(p); 1117 rt_mutex_init_task(p);
1190 1118
@@ -1228,10 +1156,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1228 1156
1229 init_sigpending(&p->pending); 1157 init_sigpending(&p->pending);
1230 1158
1231 p->utime = p->stime = p->gtime = 0; 1159 p->utime = cputime_zero;
1232 p->utimescaled = p->stimescaled = 0; 1160 p->stime = cputime_zero;
1161 p->gtime = cputime_zero;
1162 p->utimescaled = cputime_zero;
1163 p->stimescaled = cputime_zero;
1233#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1164#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1234 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1165 p->prev_utime = cputime_zero;
1166 p->prev_stime = cputime_zero;
1235#endif 1167#endif
1236#if defined(SPLIT_RSS_COUNTING) 1168#if defined(SPLIT_RSS_COUNTING)
1237 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1169 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1250,7 +1182,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1250 p->io_context = NULL; 1182 p->io_context = NULL;
1251 p->audit_context = NULL; 1183 p->audit_context = NULL;
1252 if (clone_flags & CLONE_THREAD) 1184 if (clone_flags & CLONE_THREAD)
1253 threadgroup_change_begin(current); 1185 threadgroup_fork_read_lock(current);
1254 cgroup_fork(p); 1186 cgroup_fork(p);
1255#ifdef CONFIG_NUMA 1187#ifdef CONFIG_NUMA
1256 p->mempolicy = mpol_dup(p->mempolicy); 1188 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1264,11 +1196,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1264#ifdef CONFIG_CPUSETS 1196#ifdef CONFIG_CPUSETS
1265 p->cpuset_mem_spread_rotor = NUMA_NO_NODE; 1197 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1266 p->cpuset_slab_spread_rotor = NUMA_NO_NODE; 1198 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1267 seqcount_init(&p->mems_allowed_seq);
1268#endif 1199#endif
1269#ifdef CONFIG_TRACE_IRQFLAGS 1200#ifdef CONFIG_TRACE_IRQFLAGS
1270 p->irq_events = 0; 1201 p->irq_events = 0;
1202#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1203 p->hardirqs_enabled = 1;
1204#else
1271 p->hardirqs_enabled = 0; 1205 p->hardirqs_enabled = 0;
1206#endif
1272 p->hardirq_enable_ip = 0; 1207 p->hardirq_enable_ip = 0;
1273 p->hardirq_enable_event = 0; 1208 p->hardirq_enable_event = 0;
1274 p->hardirq_disable_ip = _THIS_IP_; 1209 p->hardirq_disable_ip = _THIS_IP_;
@@ -1290,7 +1225,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1290#ifdef CONFIG_DEBUG_MUTEXES 1225#ifdef CONFIG_DEBUG_MUTEXES
1291 p->blocked_on = NULL; /* not blocked yet */ 1226 p->blocked_on = NULL; /* not blocked yet */
1292#endif 1227#endif
1293#ifdef CONFIG_MEMCG 1228#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1294 p->memcg_batch.do_batch = 0; 1229 p->memcg_batch.do_batch = 0;
1295 p->memcg_batch.memcg = NULL; 1230 p->memcg_batch.memcg = NULL;
1296#endif 1231#endif
@@ -1329,7 +1264,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1329 retval = copy_io(clone_flags, p); 1264 retval = copy_io(clone_flags, p);
1330 if (retval) 1265 if (retval)
1331 goto bad_fork_cleanup_namespaces; 1266 goto bad_fork_cleanup_namespaces;
1332 retval = copy_thread(clone_flags, stack_start, stack_size, p); 1267 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1333 if (retval) 1268 if (retval)
1334 goto bad_fork_cleanup_io; 1269 goto bad_fork_cleanup_io;
1335 1270
@@ -1361,7 +1296,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1361 INIT_LIST_HEAD(&p->pi_state_list); 1296 INIT_LIST_HEAD(&p->pi_state_list);
1362 p->pi_state_cache = NULL; 1297 p->pi_state_cache = NULL;
1363#endif 1298#endif
1364 uprobe_copy_process(p);
1365 /* 1299 /*
1366 * sigaltstack should be cleared when sharing the same VM 1300 * sigaltstack should be cleared when sharing the same VM
1367 */ 1301 */
@@ -1380,27 +1314,22 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1380 clear_all_latency_tracing(p); 1314 clear_all_latency_tracing(p);
1381 1315
1382 /* ok, now we should be set up.. */ 1316 /* ok, now we should be set up.. */
1383 if (clone_flags & CLONE_THREAD) 1317 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
1384 p->exit_signal = -1;
1385 else if (clone_flags & CLONE_PARENT)
1386 p->exit_signal = current->group_leader->exit_signal;
1387 else
1388 p->exit_signal = (clone_flags & CSIGNAL);
1389
1390 p->pdeath_signal = 0; 1318 p->pdeath_signal = 0;
1391 p->exit_state = 0; 1319 p->exit_state = 0;
1392 1320
1393 p->nr_dirtied = 0;
1394 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1395 p->dirty_paused_when = 0;
1396
1397 /* 1321 /*
1398 * Ok, make it visible to the rest of the system. 1322 * Ok, make it visible to the rest of the system.
1399 * We dont wake it up yet. 1323 * We dont wake it up yet.
1400 */ 1324 */
1401 p->group_leader = p; 1325 p->group_leader = p;
1402 INIT_LIST_HEAD(&p->thread_group); 1326 INIT_LIST_HEAD(&p->thread_group);
1403 p->task_works = NULL; 1327
1328 /* Now that the task is set up, run cgroup callbacks if
1329 * necessary. We need to run them before the task is visible
1330 * on the tasklist. */
1331 cgroup_fork_callbacks(p);
1332 cgroup_callbacks_done = 1;
1404 1333
1405 /* Need tasklist lock for parent etc handling! */ 1334 /* Need tasklist lock for parent etc handling! */
1406 write_lock_irq(&tasklist_lock); 1335 write_lock_irq(&tasklist_lock);
@@ -1444,10 +1373,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1444 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1373 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1445 1374
1446 if (thread_group_leader(p)) { 1375 if (thread_group_leader(p)) {
1447 if (is_child_reaper(pid)) { 1376 if (is_child_reaper(pid))
1448 ns_of_pid(pid)->child_reaper = p; 1377 p->nsproxy->pid_ns->child_reaper = p;
1449 p->signal->flags |= SIGNAL_UNKILLABLE;
1450 }
1451 1378
1452 p->signal->leader_pid = pid; 1379 p->signal->leader_pid = pid;
1453 p->signal->tty = tty_kref_get(current->signal->tty); 1380 p->signal->tty = tty_kref_get(current->signal->tty);
@@ -1467,11 +1394,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1467 proc_fork_connector(p); 1394 proc_fork_connector(p);
1468 cgroup_post_fork(p); 1395 cgroup_post_fork(p);
1469 if (clone_flags & CLONE_THREAD) 1396 if (clone_flags & CLONE_THREAD)
1470 threadgroup_change_end(current); 1397 threadgroup_fork_read_unlock(current);
1471 perf_event_fork(p); 1398 perf_event_fork(p);
1472
1473 trace_task_newtask(p, clone_flags);
1474
1475 return p; 1399 return p;
1476 1400
1477bad_fork_free_pid: 1401bad_fork_free_pid:
@@ -1483,8 +1407,13 @@ bad_fork_cleanup_io:
1483bad_fork_cleanup_namespaces: 1407bad_fork_cleanup_namespaces:
1484 exit_task_namespaces(p); 1408 exit_task_namespaces(p);
1485bad_fork_cleanup_mm: 1409bad_fork_cleanup_mm:
1486 if (p->mm) 1410 if (p->mm) {
1411 task_lock(p);
1412 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1413 atomic_dec(&p->mm->oom_disable_count);
1414 task_unlock(p);
1487 mmput(p->mm); 1415 mmput(p->mm);
1416 }
1488bad_fork_cleanup_signal: 1417bad_fork_cleanup_signal:
1489 if (!(clone_flags & CLONE_THREAD)) 1418 if (!(clone_flags & CLONE_THREAD))
1490 free_signal_struct(p->signal); 1419 free_signal_struct(p->signal);
@@ -1505,8 +1434,8 @@ bad_fork_cleanup_policy:
1505bad_fork_cleanup_cgroup: 1434bad_fork_cleanup_cgroup:
1506#endif 1435#endif
1507 if (clone_flags & CLONE_THREAD) 1436 if (clone_flags & CLONE_THREAD)
1508 threadgroup_change_end(current); 1437 threadgroup_fork_read_unlock(current);
1509 cgroup_exit(p, 0); 1438 cgroup_exit(p, cgroup_callbacks_done);
1510 delayacct_tsk_free(p); 1439 delayacct_tsk_free(p);
1511 module_put(task_thread_info(p)->exec_domain->module); 1440 module_put(task_thread_info(p)->exec_domain->module);
1512bad_fork_cleanup_count: 1441bad_fork_cleanup_count:
@@ -1518,6 +1447,12 @@ fork_out:
1518 return ERR_PTR(retval); 1447 return ERR_PTR(retval);
1519} 1448}
1520 1449
1450noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1451{
1452 memset(regs, 0, sizeof(struct pt_regs));
1453 return regs;
1454}
1455
1521static inline void init_idle_pids(struct pid_link *links) 1456static inline void init_idle_pids(struct pid_link *links)
1522{ 1457{
1523 enum pid_type type; 1458 enum pid_type type;
@@ -1531,7 +1466,10 @@ static inline void init_idle_pids(struct pid_link *links)
1531struct task_struct * __cpuinit fork_idle(int cpu) 1466struct task_struct * __cpuinit fork_idle(int cpu)
1532{ 1467{
1533 struct task_struct *task; 1468 struct task_struct *task;
1534 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); 1469 struct pt_regs regs;
1470
1471 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1472 &init_struct_pid, 0);
1535 if (!IS_ERR(task)) { 1473 if (!IS_ERR(task)) {
1536 init_idle_pids(task->pids); 1474 init_idle_pids(task->pids);
1537 init_idle(task, cpu); 1475 init_idle(task, cpu);
@@ -1548,6 +1486,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1548 */ 1486 */
1549long do_fork(unsigned long clone_flags, 1487long do_fork(unsigned long clone_flags,
1550 unsigned long stack_start, 1488 unsigned long stack_start,
1489 struct pt_regs *regs,
1551 unsigned long stack_size, 1490 unsigned long stack_size,
1552 int __user *parent_tidptr, 1491 int __user *parent_tidptr,
1553 int __user *child_tidptr) 1492 int __user *child_tidptr)
@@ -1560,9 +1499,15 @@ long do_fork(unsigned long clone_flags,
1560 * Do some preliminary argument and permissions checking before we 1499 * Do some preliminary argument and permissions checking before we
1561 * actually start allocating stuff 1500 * actually start allocating stuff
1562 */ 1501 */
1563 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { 1502 if (clone_flags & CLONE_NEWUSER) {
1564 if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) 1503 if (clone_flags & CLONE_THREAD)
1565 return -EINVAL; 1504 return -EINVAL;
1505 /* hopefully this check will go away when userns support is
1506 * complete
1507 */
1508 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1509 !capable(CAP_SETGID))
1510 return -EPERM;
1566 } 1511 }
1567 1512
1568 /* 1513 /*
@@ -1571,7 +1516,7 @@ long do_fork(unsigned long clone_flags,
1571 * requested, no event is reported; otherwise, report if the event 1516 * requested, no event is reported; otherwise, report if the event
1572 * for the type of forking is enabled. 1517 * for the type of forking is enabled.
1573 */ 1518 */
1574 if (!(clone_flags & CLONE_UNTRACED)) { 1519 if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
1575 if (clone_flags & CLONE_VFORK) 1520 if (clone_flags & CLONE_VFORK)
1576 trace = PTRACE_EVENT_VFORK; 1521 trace = PTRACE_EVENT_VFORK;
1577 else if ((clone_flags & CSIGNAL) != SIGCHLD) 1522 else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1583,7 +1528,7 @@ long do_fork(unsigned long clone_flags,
1583 trace = 0; 1528 trace = 0;
1584 } 1529 }
1585 1530
1586 p = copy_process(clone_flags, stack_start, stack_size, 1531 p = copy_process(clone_flags, stack_start, regs, stack_size,
1587 child_tidptr, NULL, trace); 1532 child_tidptr, NULL, trace);
1588 /* 1533 /*
1589 * Do this prior waking up the new thread - the thread pointer 1534 * Do this prior waking up the new thread - the thread pointer
@@ -1602,9 +1547,18 @@ long do_fork(unsigned long clone_flags,
1602 if (clone_flags & CLONE_VFORK) { 1547 if (clone_flags & CLONE_VFORK) {
1603 p->vfork_done = &vfork; 1548 p->vfork_done = &vfork;
1604 init_completion(&vfork); 1549 init_completion(&vfork);
1605 get_task_struct(p);
1606 } 1550 }
1607 1551
1552 audit_finish_fork(p);
1553
1554 /*
1555 * We set PF_STARTING at creation in case tracing wants to
1556 * use this to distinguish a fully live task from one that
1557 * hasn't finished SIGSTOP raising yet. Now we clear it
1558 * and set the child going.
1559 */
1560 p->flags &= ~PF_STARTING;
1561
1608 wake_up_new_task(p); 1562 wake_up_new_task(p);
1609 1563
1610 /* forking complete and child started to run, tell ptracer */ 1564 /* forking complete and child started to run, tell ptracer */
@@ -1612,8 +1566,10 @@ long do_fork(unsigned long clone_flags,
1612 ptrace_event(trace, nr); 1566 ptrace_event(trace, nr);
1613 1567
1614 if (clone_flags & CLONE_VFORK) { 1568 if (clone_flags & CLONE_VFORK) {
1615 if (!wait_for_vfork_done(p, &vfork)) 1569 freezer_do_not_count();
1616 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); 1570 wait_for_completion(&vfork);
1571 freezer_count();
1572 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1617 } 1573 }
1618 } else { 1574 } else {
1619 nr = PTR_ERR(p); 1575 nr = PTR_ERR(p);
@@ -1621,58 +1577,6 @@ long do_fork(unsigned long clone_flags,
1621 return nr; 1577 return nr;
1622} 1578}
1623 1579
1624/*
1625 * Create a kernel thread.
1626 */
1627pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
1628{
1629 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
1630 (unsigned long)arg, NULL, NULL);
1631}
1632
1633#ifdef __ARCH_WANT_SYS_FORK
1634SYSCALL_DEFINE0(fork)
1635{
1636#ifdef CONFIG_MMU
1637 return do_fork(SIGCHLD, 0, 0, NULL, NULL);
1638#else
1639 /* can not support in nommu mode */
1640 return(-EINVAL);
1641#endif
1642}
1643#endif
1644
1645#ifdef __ARCH_WANT_SYS_VFORK
1646SYSCALL_DEFINE0(vfork)
1647{
1648 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
1649 0, NULL, NULL);
1650}
1651#endif
1652
1653#ifdef __ARCH_WANT_SYS_CLONE
1654#ifdef CONFIG_CLONE_BACKWARDS
1655SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1656 int __user *, parent_tidptr,
1657 int, tls_val,
1658 int __user *, child_tidptr)
1659#elif defined(CONFIG_CLONE_BACKWARDS2)
1660SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1661 int __user *, parent_tidptr,
1662 int __user *, child_tidptr,
1663 int, tls_val)
1664#else
1665SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1666 int __user *, parent_tidptr,
1667 int __user *, child_tidptr,
1668 int, tls_val)
1669#endif
1670{
1671 return do_fork(clone_flags, newsp, 0,
1672 parent_tidptr, child_tidptr);
1673}
1674#endif
1675
1676#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1580#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1677#define ARCH_MIN_MMSTRUCT_ALIGN 0 1581#define ARCH_MIN_MMSTRUCT_ALIGN 0
1678#endif 1582#endif
@@ -1722,8 +1626,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
1722{ 1626{
1723 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1627 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1724 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1628 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1725 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| 1629 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1726 CLONE_NEWUSER|CLONE_NEWPID))
1727 return -EINVAL; 1630 return -EINVAL;
1728 /* 1631 /*
1729 * Not implemented, but pretend it works if there is nothing to 1632 * Not implemented, but pretend it works if there is nothing to
@@ -1790,40 +1693,19 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1790{ 1693{
1791 struct fs_struct *fs, *new_fs = NULL; 1694 struct fs_struct *fs, *new_fs = NULL;
1792 struct files_struct *fd, *new_fd = NULL; 1695 struct files_struct *fd, *new_fd = NULL;
1793 struct cred *new_cred = NULL;
1794 struct nsproxy *new_nsproxy = NULL; 1696 struct nsproxy *new_nsproxy = NULL;
1795 int do_sysvsem = 0; 1697 int do_sysvsem = 0;
1796 int err; 1698 int err;
1797 1699
1798 /* 1700 err = check_unshare_flags(unshare_flags);
1799 * If unsharing a user namespace must also unshare the thread. 1701 if (err)
1800 */ 1702 goto bad_unshare_out;
1801 if (unshare_flags & CLONE_NEWUSER) 1703
1802 unshare_flags |= CLONE_THREAD;
1803 /*
1804 * If unsharing a pid namespace must also unshare the thread.
1805 */
1806 if (unshare_flags & CLONE_NEWPID)
1807 unshare_flags |= CLONE_THREAD;
1808 /*
1809 * If unsharing a thread from a thread group, must also unshare vm.
1810 */
1811 if (unshare_flags & CLONE_THREAD)
1812 unshare_flags |= CLONE_VM;
1813 /*
1814 * If unsharing vm, must also unshare signal handlers.
1815 */
1816 if (unshare_flags & CLONE_VM)
1817 unshare_flags |= CLONE_SIGHAND;
1818 /* 1704 /*
1819 * If unsharing namespace, must also unshare filesystem information. 1705 * If unsharing namespace, must also unshare filesystem information.
1820 */ 1706 */
1821 if (unshare_flags & CLONE_NEWNS) 1707 if (unshare_flags & CLONE_NEWNS)
1822 unshare_flags |= CLONE_FS; 1708 unshare_flags |= CLONE_FS;
1823
1824 err = check_unshare_flags(unshare_flags);
1825 if (err)
1826 goto bad_unshare_out;
1827 /* 1709 /*
1828 * CLONE_NEWIPC must also detach from the undolist: after switching 1710 * CLONE_NEWIPC must also detach from the undolist: after switching
1829 * to a new ipc namespace, the semaphore arrays from the old 1711 * to a new ipc namespace, the semaphore arrays from the old
@@ -1837,15 +1719,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1837 err = unshare_fd(unshare_flags, &new_fd); 1719 err = unshare_fd(unshare_flags, &new_fd);
1838 if (err) 1720 if (err)
1839 goto bad_unshare_cleanup_fs; 1721 goto bad_unshare_cleanup_fs;
1840 err = unshare_userns(unshare_flags, &new_cred); 1722 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
1841 if (err) 1723 if (err)
1842 goto bad_unshare_cleanup_fd; 1724 goto bad_unshare_cleanup_fd;
1843 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1844 new_cred, new_fs);
1845 if (err)
1846 goto bad_unshare_cleanup_cred;
1847 1725
1848 if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { 1726 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
1849 if (do_sysvsem) { 1727 if (do_sysvsem) {
1850 /* 1728 /*
1851 * CLONE_SYSVSEM is equivalent to sys_exit(). 1729 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1878,20 +1756,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1878 } 1756 }
1879 1757
1880 task_unlock(current); 1758 task_unlock(current);
1881
1882 if (new_cred) {
1883 /* Install the new user namespace */
1884 commit_creds(new_cred);
1885 new_cred = NULL;
1886 }
1887 } 1759 }
1888 1760
1889 if (new_nsproxy) 1761 if (new_nsproxy)
1890 put_nsproxy(new_nsproxy); 1762 put_nsproxy(new_nsproxy);
1891 1763
1892bad_unshare_cleanup_cred:
1893 if (new_cred)
1894 put_cred(new_cred);
1895bad_unshare_cleanup_fd: 1764bad_unshare_cleanup_fd:
1896 if (new_fd) 1765 if (new_fd)
1897 put_files_struct(new_fd); 1766 put_files_struct(new_fd);
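sys_unshare() above silently widens CLONE_NEWNS to CLONE_FS before doing any work. A hypothetical userspace counterpart (not from the patch; needs CAP_SYS_ADMIN, and the tmpfs mount on /mnt is only a demo) that exercises exactly that path:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>
#include <unistd.h>

int main(void)
{
        /* CLONE_NEWNS implies CLONE_FS inside sys_unshare(), per the hunk above */
        if (unshare(CLONE_NEWNS) < 0) {
                perror("unshare(CLONE_NEWNS)");
                return 1;
        }

        /* Many distros mount / as shared; make our copy private so changes stay local */
        mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);

        /* This mount is now visible only inside this process's mount namespace */
        if (mount("none", "/mnt", "tmpfs", 0, NULL) < 0)
                perror("mount");

        return 0;
}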
diff --git a/kernel/freezer.c b/kernel/freezer.c
index c38893b0efb..7b01de98bb6 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -6,159 +6,161 @@
6 6
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/suspend.h> 8#include <linux/suspend.h>
9#include <linux/export.h> 9#include <linux/module.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/freezer.h> 11#include <linux/freezer.h>
12#include <linux/kthread.h>
13 12
14/* total number of freezing conditions in effect */ 13/*
15atomic_t system_freezing_cnt = ATOMIC_INIT(0); 14 * freezing is complete, mark current process as frozen
16EXPORT_SYMBOL(system_freezing_cnt);
17
18/* indicate whether PM freezing is in effect, protected by pm_mutex */
19bool pm_freezing;
20bool pm_nosig_freezing;
21
22/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock);
24
25/**
26 * freezing_slow_path - slow path for testing whether a task needs to be frozen
27 * @p: task to be tested
28 *
29 * This function is called by freezing() if system_freezing_cnt isn't zero
30 * and tests whether @p needs to enter and stay in frozen state. Can be
31 * called under any context. The freezers are responsible for ensuring the
32 * target tasks see the updated state.
33 */ 15 */
34bool freezing_slow_path(struct task_struct *p) 16static inline void frozen_process(void)
35{ 17{
36 if (p->flags & PF_NOFREEZE) 18 if (!unlikely(current->flags & PF_NOFREEZE)) {
37 return false; 19 current->flags |= PF_FROZEN;
38 20 smp_wmb();
39 if (pm_nosig_freezing || cgroup_freezing(p)) 21 }
40 return true; 22 clear_freeze_flag(current);
41
42 if (pm_freezing && !(p->flags & PF_KTHREAD))
43 return true;
44
45 return false;
46} 23}
47EXPORT_SYMBOL(freezing_slow_path);
48 24
49/* Refrigerator is place where frozen processes are stored :-). */ 25/* Refrigerator is place where frozen processes are stored :-). */
50bool __refrigerator(bool check_kthr_stop) 26void refrigerator(void)
51{ 27{
52 /* Hmm, should we be allowed to suspend when there are realtime 28 /* Hmm, should we be allowed to suspend when there are realtime
53 processes around? */ 29 processes around? */
54 bool was_frozen = false; 30 long save;
55 long save = current->state; 31
56 32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
57 pr_debug("%s entered refrigerator\n", current->comm); 41 pr_debug("%s entered refrigerator\n", current->comm);
58 42
59 for (;;) { 43 spin_lock_irq(&current->sighand->siglock);
60 set_current_state(TASK_UNINTERRUPTIBLE); 44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
61 46
62 spin_lock_irq(&freezer_lock); 47 /* prevent accounting of that task to load */
63 current->flags |= PF_FROZEN; 48 current->flags |= PF_FREEZING;
64 if (!freezing(current) ||
65 (check_kthr_stop && kthread_should_stop()))
66 current->flags &= ~PF_FROZEN;
67 spin_unlock_irq(&freezer_lock);
68 49
69 if (!(current->flags & PF_FROZEN)) 50 for (;;) {
51 set_current_state(TASK_UNINTERRUPTIBLE);
52 if (!frozen(current))
70 break; 53 break;
71 was_frozen = true;
72 schedule(); 54 schedule();
73 } 55 }
74 56
75 pr_debug("%s left refrigerator\n", current->comm); 57 /* Remove the accounting blocker */
76 58 current->flags &= ~PF_FREEZING;
77 /*
78 * Restore saved task state before returning. The mb'd version
79 * needs to be used; otherwise, it might silently break
80 * synchronization which depends on ordered task state change.
81 */
82 set_current_state(save);
83 59
84 return was_frozen; 60 pr_debug("%s left refrigerator\n", current->comm);
61 __set_current_state(save);
85} 62}
86EXPORT_SYMBOL(__refrigerator); 63EXPORT_SYMBOL(refrigerator);
87 64
88static void fake_signal_wake_up(struct task_struct *p) 65static void fake_signal_wake_up(struct task_struct *p)
89{ 66{
90 unsigned long flags; 67 unsigned long flags;
91 68
92 if (lock_task_sighand(p, &flags)) { 69 spin_lock_irqsave(&p->sighand->siglock, flags);
93 signal_wake_up(p, 0); 70 signal_wake_up(p, 0);
94 unlock_task_sighand(p, &flags); 71 spin_unlock_irqrestore(&p->sighand->siglock, flags);
95 }
96} 72}
97 73
98/** 74/**
99 * freeze_task - send a freeze request to given task 75 * freeze_task - send a freeze request to given task
100 * @p: task to send the request to 76 * @p: task to send the request to
77 * @sig_only: if set, the request will only be sent if the task has the
78 * PF_FREEZER_NOSIG flag unset
79 * Return value: 'false', if @sig_only is set and the task has
80 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
101 * 81 *
102 * If @p is freezing, the freeze request is sent either by sending a fake 82 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
103 * signal (if it's not a kernel thread) or waking it up (if it's a kernel 83 * either sending a fake signal to it or waking it up, depending on whether
104 * thread). 84 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
105 * 85 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
106 * RETURNS: 86 * TIF_FREEZE flag will not be set.
107 * %false, if @p is not freezing or already frozen; %true, otherwise
108 */ 87 */
109bool freeze_task(struct task_struct *p) 88bool freeze_task(struct task_struct *p, bool sig_only)
110{ 89{
111 unsigned long flags; 90 /*
112 91 * We first check if the task is freezing and next if it has already
113 spin_lock_irqsave(&freezer_lock, flags); 92 * been frozen to avoid the race with frozen_process() which first marks
114 if (!freezing(p) || frozen(p)) { 93 * the task as frozen and next clears its TIF_FREEZE.
115 spin_unlock_irqrestore(&freezer_lock, flags); 94 */
116 return false; 95 if (!freezing(p)) {
96 smp_rmb();
97 if (frozen(p))
98 return false;
99
100 if (!sig_only || should_send_signal(p))
101 set_freeze_flag(p);
102 else
103 return false;
117 } 104 }
118 105
119 if (!(p->flags & PF_KTHREAD)) 106 if (should_send_signal(p)) {
120 fake_signal_wake_up(p); 107 fake_signal_wake_up(p);
121 else 108 /*
109 * fake_signal_wake_up() goes through p's scheduler
110 * lock and guarantees that TASK_STOPPED/TRACED ->
111 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks().
113 */
114 } else if (sig_only) {
115 return false;
116 } else {
122 wake_up_state(p, TASK_INTERRUPTIBLE); 117 wake_up_state(p, TASK_INTERRUPTIBLE);
118 }
123 119
124 spin_unlock_irqrestore(&freezer_lock, flags);
125 return true; 120 return true;
126} 121}
127 122
128void __thaw_task(struct task_struct *p) 123void cancel_freezing(struct task_struct *p)
129{ 124{
130 unsigned long flags; 125 unsigned long flags;
131 126
132 /* 127 if (freezing(p)) {
133 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to 128 pr_debug(" clean up: %s\n", p->comm);
134 * be visible to @p as waking up implies wmb. Waking up inside 129 clear_freeze_flag(p);
135 * freezer_lock also prevents wakeups from leaking outside 130 spin_lock_irqsave(&p->sighand->siglock, flags);
136 * refrigerator. 131 recalc_sigpending_and_wake(p);
137 */ 132 spin_unlock_irqrestore(&p->sighand->siglock, flags);
138 spin_lock_irqsave(&freezer_lock, flags); 133 }
139 if (frozen(p))
140 wake_up_process(p);
141 spin_unlock_irqrestore(&freezer_lock, flags);
142} 134}
143 135
144/** 136static int __thaw_process(struct task_struct *p)
145 * set_freezable - make %current freezable 137{
138 if (frozen(p)) {
139 p->flags &= ~PF_FROZEN;
140 return 1;
141 }
142 clear_freeze_flag(p);
143 return 0;
144}
145
146/*
147 * Wake up a frozen process
146 * 148 *
147 * Mark %current freezable and enter refrigerator if necessary. 149 * task_lock() is needed to prevent the race with refrigerator() which may
150 * occur if the freezing of tasks fails. Namely, without the lock, if the
151 * freezing of tasks failed, thaw_tasks() might have run before a task in
152 * refrigerator() could call frozen_process(), in which case the task would be
153 * frozen and no one would thaw it.
148 */ 154 */
149bool set_freezable(void) 155int thaw_process(struct task_struct *p)
150{ 156{
151 might_sleep(); 157 task_lock(p);
152 158 if (__thaw_process(p) == 1) {
153 /* 159 task_unlock(p);
154 * Modify flags while holding freezer_lock. This ensures the 160 wake_up_process(p);
155 * freezer notices that we aren't frozen yet or the freezing 161 return 1;
156 * condition is visible to try_to_freeze() below. 162 }
157 */ 163 task_unlock(p);
158 spin_lock_irq(&freezer_lock); 164 return 0;
159 current->flags &= ~PF_NOFREEZE;
160 spin_unlock_irq(&freezer_lock);
161
162 return try_to_freeze();
163} 165}
164EXPORT_SYMBOL(set_freezable); 166EXPORT_SYMBOL(thaw_process);
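The refrigerator above is where a freezable kernel thread parks itself via try_to_freeze(). A hedged sketch of the usual caller pattern (a made-up demo module, not from this patch; worker_fn and freezer-demo are invented names), which works against either side of this diff since both export set_freezable()/try_to_freeze():

#include <linux/err.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
        set_freezable();                        /* drop PF_NOFREEZE for this kthread */

        while (!kthread_should_stop()) {
                try_to_freeze();                /* enters the refrigerator while freezing() */
                schedule_timeout_interruptible(HZ);
        }
        return 0;
}

static int __init freezer_demo_init(void)
{
        worker = kthread_run(worker_fn, NULL, "freezer-demo");
        return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void __exit freezer_demo_exit(void)
{
        kthread_stop(worker);
}

module_init(freezer_demo_init);
module_exit(freezer_demo_exit);
MODULE_LICENSE("GPL");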
diff --git a/kernel/futex.c b/kernel/futex.c
index 19eb089ca00..e6160fa842e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -55,11 +55,10 @@
55#include <linux/pagemap.h> 55#include <linux/pagemap.h>
56#include <linux/syscalls.h> 56#include <linux/syscalls.h>
57#include <linux/signal.h> 57#include <linux/signal.h>
58#include <linux/export.h> 58#include <linux/module.h>
59#include <linux/magic.h> 59#include <linux/magic.h>
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h>
63 62
64#include <asm/futex.h> 63#include <asm/futex.h>
65 64
@@ -716,7 +715,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
716 struct futex_pi_state **ps, 715 struct futex_pi_state **ps,
717 struct task_struct *task, int set_waiters) 716 struct task_struct *task, int set_waiters)
718{ 717{
719 int lock_taken, ret, force_take = 0; 718 int lock_taken, ret, ownerdied = 0;
720 u32 uval, newval, curval, vpid = task_pid_vnr(task); 719 u32 uval, newval, curval, vpid = task_pid_vnr(task);
721 720
722retry: 721retry:
@@ -755,15 +754,17 @@ retry:
755 newval = curval | FUTEX_WAITERS; 754 newval = curval | FUTEX_WAITERS;
756 755
757 /* 756 /*
758 * Should we force take the futex? See below. 757 * There are two cases, where a futex might have no owner (the
758 * owner TID is 0): OWNER_DIED. We take over the futex in this
759 * case. We also do an unconditional take over, when the owner
760 * of the futex died.
761 *
762 * This is safe as we are protected by the hash bucket lock !
759 */ 763 */
760 if (unlikely(force_take)) { 764 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
761 /* 765 /* Keep the OWNER_DIED bit */
762 * Keep the OWNER_DIED and the WAITERS bit and set the
763 * new TID value.
764 */
765 newval = (curval & ~FUTEX_TID_MASK) | vpid; 766 newval = (curval & ~FUTEX_TID_MASK) | vpid;
766 force_take = 0; 767 ownerdied = 0;
767 lock_taken = 1; 768 lock_taken = 1;
768 } 769 }
769 770
@@ -773,7 +774,7 @@ retry:
773 goto retry; 774 goto retry;
774 775
775 /* 776 /*
776 * We took the lock due to forced take over. 777 * We took the lock due to owner died take over.
777 */ 778 */
778 if (unlikely(lock_taken)) 779 if (unlikely(lock_taken))
779 return 1; 780 return 1;
@@ -788,25 +789,20 @@ retry:
788 switch (ret) { 789 switch (ret) {
789 case -ESRCH: 790 case -ESRCH:
790 /* 791 /*
791 * We failed to find an owner for this 792 * No owner found for this futex. Check if the
792 * futex. So we have no pi_state to block 793 * OWNER_DIED bit is set to figure out whether
793 * on. This can happen in two cases: 794 * this is a robust futex or not.
794 *
795 * 1) The owner died
796 * 2) A stale FUTEX_WAITERS bit
797 *
798 * Re-read the futex value.
799 */ 795 */
800 if (get_futex_value_locked(&curval, uaddr)) 796 if (get_futex_value_locked(&curval, uaddr))
801 return -EFAULT; 797 return -EFAULT;
802 798
803 /* 799 /*
804 * If the owner died or we have a stale 800 * We simply start over in case of a robust
805 * WAITERS bit the owner TID in the user space 801 * futex. The code above will take the futex
806 * futex is 0. 802 * and return happy.
807 */ 803 */
808 if (!(curval & FUTEX_TID_MASK)) { 804 if (curval & FUTEX_OWNER_DIED) {
809 force_take = 1; 805 ownerdied = 1;
810 goto retry; 806 goto retry;
811 } 807 }
812 default: 808 default:
@@ -843,9 +839,6 @@ static void wake_futex(struct futex_q *q)
843{ 839{
844 struct task_struct *p = q->task; 840 struct task_struct *p = q->task;
845 841
846 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
847 return;
848
849 /* 842 /*
850 * We set q->lock_ptr = NULL _before_ we wake up the task. If 843 * We set q->lock_ptr = NULL _before_ we wake up the task. If
851 * a non-futex wake up happens on another CPU then the task 844 * a non-futex wake up happens on another CPU then the task
@@ -873,7 +866,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
873{ 866{
874 struct task_struct *new_owner; 867 struct task_struct *new_owner;
875 struct futex_pi_state *pi_state = this->pi_state; 868 struct futex_pi_state *pi_state = this->pi_state;
876 u32 uninitialized_var(curval), newval; 869 u32 curval, newval;
877 870
878 if (!pi_state) 871 if (!pi_state)
879 return -EINVAL; 872 return -EINVAL;
@@ -935,7 +928,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
935 928
936static int unlock_futex_pi(u32 __user *uaddr, u32 uval) 929static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
937{ 930{
938 u32 uninitialized_var(oldval); 931 u32 oldval;
939 932
940 /* 933 /*
941 * There is no waiter, so we unlock the futex. The owner died 934 * There is no waiter, so we unlock the futex. The owner died
@@ -1081,10 +1074,6 @@ retry_private:
1081 1074
1082 plist_for_each_entry_safe(this, next, head, list) { 1075 plist_for_each_entry_safe(this, next, head, list) {
1083 if (match_futex (&this->key, &key1)) { 1076 if (match_futex (&this->key, &key1)) {
1084 if (this->pi_state || this->rt_waiter) {
1085 ret = -EINVAL;
1086 goto out_unlock;
1087 }
1088 wake_futex(this); 1077 wake_futex(this);
1089 if (++ret >= nr_wake) 1078 if (++ret >= nr_wake)
1090 break; 1079 break;
@@ -1097,10 +1086,6 @@ retry_private:
1097 op_ret = 0; 1086 op_ret = 0;
1098 plist_for_each_entry_safe(this, next, head, list) { 1087 plist_for_each_entry_safe(this, next, head, list) {
1099 if (match_futex (&this->key, &key2)) { 1088 if (match_futex (&this->key, &key2)) {
1100 if (this->pi_state || this->rt_waiter) {
1101 ret = -EINVAL;
1102 goto out_unlock;
1103 }
1104 wake_futex(this); 1089 wake_futex(this);
1105 if (++op_ret >= nr_wake2) 1090 if (++op_ret >= nr_wake2)
1106 break; 1091 break;
@@ -1109,7 +1094,6 @@ retry_private:
1109 ret += op_ret; 1094 ret += op_ret;
1110 } 1095 }
1111 1096
1112out_unlock:
1113 double_unlock_hb(hb1, hb2); 1097 double_unlock_hb(hb1, hb2);
1114out_put_keys: 1098out_put_keys:
1115 put_futex_key(&key2); 1099 put_futex_key(&key2);
@@ -1399,13 +1383,9 @@ retry_private:
1399 /* 1383 /*
1400 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always 1384 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1401 * be paired with each other and no other futex ops. 1385 * be paired with each other and no other futex ops.
1402 *
1403 * We should never be requeueing a futex_q with a pi_state,
1404 * which is awaiting a futex_unlock_pi().
1405 */ 1386 */
1406 if ((requeue_pi && !this->rt_waiter) || 1387 if ((requeue_pi && !this->rt_waiter) ||
1407 (!requeue_pi && this->rt_waiter) || 1388 (!requeue_pi && this->rt_waiter)) {
1408 this->pi_state) {
1409 ret = -EINVAL; 1389 ret = -EINVAL;
1410 break; 1390 break;
1411 } 1391 }
@@ -1608,7 +1588,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1608 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1588 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1609 struct futex_pi_state *pi_state = q->pi_state; 1589 struct futex_pi_state *pi_state = q->pi_state;
1610 struct task_struct *oldowner = pi_state->owner; 1590 struct task_struct *oldowner = pi_state->owner;
1611 u32 uval, uninitialized_var(curval), newval; 1591 u32 uval, curval, newval;
1612 int ret; 1592 int ret;
1613 1593
1614 /* Owner died? */ 1594 /* Owner died? */
@@ -1825,7 +1805,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1825 * 1805 *
1826 * Returns: 1806 * Returns:
1827 * 0 - uaddr contains val and hb has been locked 1807 * 0 - uaddr contains val and hb has been locked
1828 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 1808 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
1829 */ 1809 */
1830static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1810static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1831 struct futex_q *q, struct futex_hash_bucket **hb) 1811 struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2250,11 +2230,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2250 * @uaddr2: the pi futex we will take prior to returning to user-space 2230 * @uaddr2: the pi futex we will take prior to returning to user-space
2251 * 2231 *
2252 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2232 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2253 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake 2233 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
2254 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to 2234 * complete the acquisition of the rt_mutex prior to returning to userspace.
2255 * userspace. This ensures the rt_mutex maintains an owner when it has waiters; 2235 * This ensures the rt_mutex maintains an owner when it has waiters; without
2256 * without one, the pi logic would not know which task to boost/deboost, if 2236 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2257 * there was a need to. 2237 * need to.
2258 * 2238 *
2259 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2239 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2260 * via the following: 2240 * via the following:
@@ -2291,9 +2271,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2291 struct futex_q q = futex_q_init; 2271 struct futex_q q = futex_q_init;
2292 int res, ret; 2272 int res, ret;
2293 2273
2294 if (uaddr == uaddr2)
2295 return -EINVAL;
2296
2297 if (!bitset) 2274 if (!bitset)
2298 return -EINVAL; 2275 return -EINVAL;
2299 2276
@@ -2365,7 +2342,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2365 * signal. futex_unlock_pi() will not destroy the lock_ptr nor 2342 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2366 * the pi_state. 2343 * the pi_state.
2367 */ 2344 */
2368 WARN_ON(!q.pi_state); 2345 WARN_ON(!&q.pi_state);
2369 pi_mutex = &q.pi_state->pi_mutex; 2346 pi_mutex = &q.pi_state->pi_mutex;
2370 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); 2347 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2371 debug_rt_mutex_free_waiter(&rt_waiter); 2348 debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2392,7 +2369,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2392 * fault, unlock the rt_mutex and return the fault to userspace. 2369 * fault, unlock the rt_mutex and return the fault to userspace.
2393 */ 2370 */
2394 if (ret == -EFAULT) { 2371 if (ret == -EFAULT) {
2395 if (pi_mutex && rt_mutex_owner(pi_mutex) == current) 2372 if (rt_mutex_owner(pi_mutex) == current)
2396 rt_mutex_unlock(pi_mutex); 2373 rt_mutex_unlock(pi_mutex);
2397 } else if (ret == -EINTR) { 2374 } else if (ret == -EINTR) {
2398 /* 2375 /*
@@ -2466,31 +2443,40 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2466{ 2443{
2467 struct robust_list_head __user *head; 2444 struct robust_list_head __user *head;
2468 unsigned long ret; 2445 unsigned long ret;
2469 struct task_struct *p; 2446 const struct cred *cred = current_cred(), *pcred;
2470 2447
2471 if (!futex_cmpxchg_enabled) 2448 if (!futex_cmpxchg_enabled)
2472 return -ENOSYS; 2449 return -ENOSYS;
2473 2450
2474 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
2475
2476 rcu_read_lock();
2477
2478 ret = -ESRCH;
2479 if (!pid) 2451 if (!pid)
2480 p = current; 2452 head = current->robust_list;
2481 else { 2453 else {
2454 struct task_struct *p;
2455
2456 ret = -ESRCH;
2457 rcu_read_lock();
2482 p = find_task_by_vpid(pid); 2458 p = find_task_by_vpid(pid);
2483 if (!p) 2459 if (!p)
2484 goto err_unlock; 2460 goto err_unlock;
2461 ret = -EPERM;
2462 pcred = __task_cred(p);
2463 /* If victim is in different user_ns, then uids are not
2464 comparable, so we must have CAP_SYS_PTRACE */
2465 if (cred->user->user_ns != pcred->user->user_ns) {
2466 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2467 goto err_unlock;
2468 goto ok;
2469 }
2470 /* If victim is in same user_ns, then uids are comparable */
2471 if (cred->euid != pcred->euid &&
2472 cred->euid != pcred->uid &&
2473 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2474 goto err_unlock;
2475ok:
2476 head = p->robust_list;
2477 rcu_read_unlock();
2485 } 2478 }
2486 2479
2487 ret = -EPERM;
2488 if (!ptrace_may_access(p, PTRACE_MODE_READ))
2489 goto err_unlock;
2490
2491 head = p->robust_list;
2492 rcu_read_unlock();
2493
2494 if (put_user(sizeof(*head), len_ptr)) 2480 if (put_user(sizeof(*head), len_ptr))
2495 return -EFAULT; 2481 return -EFAULT;
2496 return put_user(head, head_ptr); 2482 return put_user(head, head_ptr);
@@ -2507,7 +2493,7 @@ err_unlock:
2507 */ 2493 */
2508int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 2494int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2509{ 2495{
2510 u32 uval, uninitialized_var(nval), mval; 2496 u32 uval, nval, mval;
2511 2497
2512retry: 2498retry:
2513 if (get_user(uval, uaddr)) 2499 if (get_user(uval, uaddr))
@@ -2642,7 +2628,7 @@ void exit_robust_list(struct task_struct *curr)
2642long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2628long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2643 u32 __user *uaddr2, u32 val2, u32 val3) 2629 u32 __user *uaddr2, u32 val2, u32 val3)
2644{ 2630{
2645 int cmd = op & FUTEX_CMD_MASK; 2631 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2646 unsigned int flags = 0; 2632 unsigned int flags = 0;
2647 2633
2648 if (!(op & FUTEX_PRIVATE_FLAG)) 2634 if (!(op & FUTEX_PRIVATE_FLAG))
@@ -2655,44 +2641,49 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2655 } 2641 }
2656 2642
2657 switch (cmd) { 2643 switch (cmd) {
2658 case FUTEX_LOCK_PI:
2659 case FUTEX_UNLOCK_PI:
2660 case FUTEX_TRYLOCK_PI:
2661 case FUTEX_WAIT_REQUEUE_PI:
2662 case FUTEX_CMP_REQUEUE_PI:
2663 if (!futex_cmpxchg_enabled)
2664 return -ENOSYS;
2665 }
2666
2667 switch (cmd) {
2668 case FUTEX_WAIT: 2644 case FUTEX_WAIT:
2669 val3 = FUTEX_BITSET_MATCH_ANY; 2645 val3 = FUTEX_BITSET_MATCH_ANY;
2670 case FUTEX_WAIT_BITSET: 2646 case FUTEX_WAIT_BITSET:
2671 return futex_wait(uaddr, flags, val, timeout, val3); 2647 ret = futex_wait(uaddr, flags, val, timeout, val3);
2648 break;
2672 case FUTEX_WAKE: 2649 case FUTEX_WAKE:
2673 val3 = FUTEX_BITSET_MATCH_ANY; 2650 val3 = FUTEX_BITSET_MATCH_ANY;
2674 case FUTEX_WAKE_BITSET: 2651 case FUTEX_WAKE_BITSET:
2675 return futex_wake(uaddr, flags, val, val3); 2652 ret = futex_wake(uaddr, flags, val, val3);
2653 break;
2676 case FUTEX_REQUEUE: 2654 case FUTEX_REQUEUE:
2677 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); 2655 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2656 break;
2678 case FUTEX_CMP_REQUEUE: 2657 case FUTEX_CMP_REQUEUE:
2679 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); 2658 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2659 break;
2680 case FUTEX_WAKE_OP: 2660 case FUTEX_WAKE_OP:
2681 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 2661 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2662 break;
2682 case FUTEX_LOCK_PI: 2663 case FUTEX_LOCK_PI:
2683 return futex_lock_pi(uaddr, flags, val, timeout, 0); 2664 if (futex_cmpxchg_enabled)
2665 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2666 break;
2684 case FUTEX_UNLOCK_PI: 2667 case FUTEX_UNLOCK_PI:
2685 return futex_unlock_pi(uaddr, flags); 2668 if (futex_cmpxchg_enabled)
2669 ret = futex_unlock_pi(uaddr, flags);
2670 break;
2686 case FUTEX_TRYLOCK_PI: 2671 case FUTEX_TRYLOCK_PI:
2687 return futex_lock_pi(uaddr, flags, 0, timeout, 1); 2672 if (futex_cmpxchg_enabled)
2673 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2674 break;
2688 case FUTEX_WAIT_REQUEUE_PI: 2675 case FUTEX_WAIT_REQUEUE_PI:
2689 val3 = FUTEX_BITSET_MATCH_ANY; 2676 val3 = FUTEX_BITSET_MATCH_ANY;
2690 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 2677 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2691 uaddr2); 2678 uaddr2);
2679 break;
2692 case FUTEX_CMP_REQUEUE_PI: 2680 case FUTEX_CMP_REQUEUE_PI:
2693 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); 2681 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2682 break;
2683 default:
2684 ret = -ENOSYS;
2694 } 2685 }
2695 return -ENOSYS; 2686 return ret;
2696} 2687}
2697 2688
2698 2689
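For reference, the futex_wait()/futex_wake() paths dispatched by do_futex() above are reached from userspace through the raw futex syscall. A minimal hypothetical wait/wake handshake (demo names invented, build with gcc -pthread):

#define _GNU_SOURCE
#include <linux/futex.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static int futex_word;                          /* 0 = not ready, 1 = ready */

static long futex(int *uaddr, int op, int val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
        /* FUTEX_WAIT sleeps only while the word still holds the expected value 0 */
        while (__atomic_load_n(&futex_word, __ATOMIC_ACQUIRE) == 0)
                futex(&futex_word, FUTEX_WAIT, 0);
        puts("waiter woken");
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waiter, NULL);
        sleep(1);
        __atomic_store_n(&futex_word, 1, __ATOMIC_RELEASE);
        futex(&futex_word, FUTEX_WAKE, 1);      /* lands in futex_wake() above */
        pthread_join(t, NULL);
        return 0;
}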
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 83e368b005f..5f9e689dc8f 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -10,7 +10,6 @@
10#include <linux/compat.h> 10#include <linux/compat.h>
11#include <linux/nsproxy.h> 11#include <linux/nsproxy.h>
12#include <linux/futex.h> 12#include <linux/futex.h>
13#include <linux/ptrace.h>
14 13
15#include <asm/uaccess.h> 14#include <asm/uaccess.h>
16 15
@@ -137,31 +136,40 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
137{ 136{
138 struct compat_robust_list_head __user *head; 137 struct compat_robust_list_head __user *head;
139 unsigned long ret; 138 unsigned long ret;
140 struct task_struct *p; 139 const struct cred *cred = current_cred(), *pcred;
141 140
142 if (!futex_cmpxchg_enabled) 141 if (!futex_cmpxchg_enabled)
143 return -ENOSYS; 142 return -ENOSYS;
144 143
145 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
146
147 rcu_read_lock();
148
149 ret = -ESRCH;
150 if (!pid) 144 if (!pid)
151 p = current; 145 head = current->compat_robust_list;
152 else { 146 else {
147 struct task_struct *p;
148
149 ret = -ESRCH;
150 rcu_read_lock();
153 p = find_task_by_vpid(pid); 151 p = find_task_by_vpid(pid);
154 if (!p) 152 if (!p)
155 goto err_unlock; 153 goto err_unlock;
154 ret = -EPERM;
155 pcred = __task_cred(p);
156 /* If victim is in different user_ns, then uids are not
157 comparable, so we must have CAP_SYS_PTRACE */
158 if (cred->user->user_ns != pcred->user->user_ns) {
159 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
160 goto err_unlock;
161 goto ok;
162 }
163 /* If victim is in same user_ns, then uids are comparable */
164 if (cred->euid != pcred->euid &&
165 cred->euid != pcred->uid &&
166 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
167 goto err_unlock;
168ok:
169 head = p->compat_robust_list;
170 rcu_read_unlock();
156 } 171 }
157 172
158 ret = -EPERM;
159 if (!ptrace_may_access(p, PTRACE_MODE_READ))
160 goto err_unlock;
161
162 head = p->compat_robust_list;
163 rcu_read_unlock();
164
165 if (put_user(sizeof(*head), len_ptr)) 173 if (put_user(sizeof(*head), len_ptr))
166 return -EFAULT; 174 return -EFAULT;
167 return put_user(ptr_to_compat(head), head_ptr); 175 return put_user(ptr_to_compat(head), head_ptr);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a92028196cc..824b741925b 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE || ARM
39 default n 39 default n
40 ---help--- 40 ---help---
41 This options activates profiling for the entire kernel. 41 This options activates profiling for the entire kernel.
@@ -46,4 +46,10 @@ config GCOV_PROFILE_ALL
46 larger and run slower. Also be sure to exclude files from profiling 46 larger and run slower. Also be sure to exclude files from profiling
47 which are not linked to the kernel image to prevent linker errors. 47 which are not linked to the kernel image to prevent linker errors.
48 48
49config GCOV_CTORS
50 string
51 depends on CONSTRUCTORS
52 default ".init_array" if ARM && AEABI
53 default ".ctors"
54
49endmenu 55endmenu
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index ae5bb426003..d753d1152b7 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -297,16 +297,30 @@ void gcov_iter_start(struct gcov_iterator *iter)
297} 297}
298 298
299/* Mapping of logical record number to actual file content. */ 299/* Mapping of logical record number to actual file content. */
300#define RECORD_FILE_MAGIC 0 300#define RECORD_FILE_MAGIC 0
301#define RECORD_GCOV_VERSION 1 301#define RECORD_GCOV_VERSION 1
302#define RECORD_TIME_STAMP 2 302#define RECORD_TIME_STAMP 2
303#define RECORD_FUNCTION_TAG 3 303#define RECORD_FUNCTION_TAG 3
304#define RECORD_FUNCTON_TAG_LEN 4 304#define RECORD_FUNCTON_TAG_LEN 4
305#define RECORD_FUNCTION_IDENT 5 305#define RECORD_FUNCTION_IDENT 5
306#define RECORD_FUNCTION_CHECK 6 306#define RECORD_FUNCTION_CHECK_LINE 6
307#define RECORD_COUNT_TAG 7 307#define RECORD_FUNCTION_CHECK_CFG 7
308#define RECORD_COUNT_LEN 8 308#define RECORD_FUNCTION_NAME_LEN 8
309#define RECORD_COUNT 9 309#define RECORD_FUNCTION_NAME 9
310#define RECORD_COUNT_TAG 10
311#define RECORD_COUNT_LEN 11
312#define RECORD_COUNT 12
313
314/* Return length of string encoded in GCOV format. */
315static size_t
316sizeof_str(const char *str)
317{
318 size_t len;
319 len = (str) ? strlen(str) : 0;
320 if (len == 0)
321 return 1;
322 return 1 + ((len + 4) >> 2);
323}
310 324
311/** 325/**
312 * gcov_iter_next - advance file iterator to next logical record 326 * gcov_iter_next - advance file iterator to next logical record
@@ -323,6 +337,9 @@ int gcov_iter_next(struct gcov_iterator *iter)
323 case RECORD_FUNCTON_TAG_LEN: 337 case RECORD_FUNCTON_TAG_LEN:
324 case RECORD_FUNCTION_IDENT: 338 case RECORD_FUNCTION_IDENT:
325 case RECORD_COUNT_TAG: 339 case RECORD_COUNT_TAG:
340 case RECORD_FUNCTION_CHECK_LINE:
341 case RECORD_FUNCTION_CHECK_CFG:
342 case RECORD_FUNCTION_NAME_LEN:
326 /* Advance to next record */ 343 /* Advance to next record */
327 iter->record++; 344 iter->record++;
328 break; 345 break;
@@ -332,7 +349,7 @@ int gcov_iter_next(struct gcov_iterator *iter)
332 /* fall through */ 349 /* fall through */
333 case RECORD_COUNT_LEN: 350 case RECORD_COUNT_LEN:
334 if (iter->count < get_func(iter)->n_ctrs[iter->type]) { 351 if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
335 iter->record = 9; 352 iter->record = 12;
336 break; 353 break;
337 } 354 }
338 /* Advance to next counter type */ 355 /* Advance to next counter type */
@@ -340,9 +357,9 @@ int gcov_iter_next(struct gcov_iterator *iter)
340 iter->count = 0; 357 iter->count = 0;
341 iter->type++; 358 iter->type++;
342 /* fall through */ 359 /* fall through */
343 case RECORD_FUNCTION_CHECK: 360 case RECORD_FUNCTION_NAME:
344 if (iter->type < iter->num_types) { 361 if (iter->type < iter->num_types) {
345 iter->record = 7; 362 iter->record = 10;
346 break; 363 break;
347 } 364 }
348 /* Advance to next function */ 365 /* Advance to next function */
@@ -395,6 +412,34 @@ static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
395 data[1] = (v >> 32); 412 data[1] = (v >> 32);
396 return seq_write(seq, data, sizeof(data)); 413 return seq_write(seq, data, sizeof(data));
397} 414}
415/**
416 * seq_write_gcov_str - write string in gcov format to seq_file
417 * @seq: seq_file handle
418 * @str: string to be stored
419 *
420 * Number format defined by gcc: numbers are recorded in the 32 bit
421 * unsigned binary form of the endianness of the machine generating the
422 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
423 * first.
424 */
425static int seq_write_gcov_str(struct seq_file *seq, const char *str)
426{
427 if (str) {
428 size_t len;
429 int str_off;
430 u32 data;
431 len = strlen(str);
432 for (str_off = 0; str_off < (sizeof_str(str) - 2) ; str_off++) {
433 memcpy(&data, (str + str_off * 4), 4);
434 seq_write(seq, &data, sizeof(data));
435 }
436 data = 0;
437 memcpy(&data, (str + str_off * 4), (len - str_off * 4));
438 return seq_write(seq, &data, sizeof(data));
439 } else {
440 return 0;
441 }
442}
398 443
399/** 444/**
400 * gcov_iter_write - write data for current pos to seq_file 445 * gcov_iter_write - write data for current pos to seq_file
@@ -421,13 +466,24 @@ int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
421 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); 466 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
422 break; 467 break;
423 case RECORD_FUNCTON_TAG_LEN: 468 case RECORD_FUNCTON_TAG_LEN:
424 rc = seq_write_gcov_u32(seq, 2); 469 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION_LENGTH +
470 (sizeof_str(get_func(iter)->name)));
425 break; 471 break;
426 case RECORD_FUNCTION_IDENT: 472 case RECORD_FUNCTION_IDENT:
427 rc = seq_write_gcov_u32(seq, get_func(iter)->ident); 473 rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
428 break; 474 break;
429 case RECORD_FUNCTION_CHECK: 475 case RECORD_FUNCTION_CHECK_LINE:
430 rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); 476 rc = seq_write_gcov_u32(seq, get_func(iter)->lineno_checksum);
477 break;
478 case RECORD_FUNCTION_CHECK_CFG:
479 rc = seq_write_gcov_u32(seq, get_func(iter)->cfg_checksum);
480 break;
481 case RECORD_FUNCTION_NAME_LEN:
482 rc = seq_write_gcov_u32(seq,
483 (sizeof_str(get_func(iter)->name) - 1));
484 break;
485 case RECORD_FUNCTION_NAME:
486 rc = seq_write_gcov_str(seq, get_func(iter)->name);
431 break; 487 break;
432 case RECORD_COUNT_TAG: 488 case RECORD_COUNT_TAG:
433 rc = seq_write_gcov_u32(seq, 489 rc = seq_write_gcov_u32(seq,
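sizeof_str() added above counts 32-bit words: one length word plus the string padded with at least one NUL up to a 4-byte boundary, which is what seq_write_gcov_str() then emits. A standalone check of that arithmetic (hypothetical, outside the kernel):

#include <stdio.h>
#include <string.h>

static size_t sizeof_str(const char *str)
{
        size_t len = str ? strlen(str) : 0;

        if (len == 0)
                return 1;                       /* just the length word */
        return 1 + ((len + 4) >> 2);            /* length word + NUL-padded payload words */
}

int main(void)
{
        /* ""     -> 1 word  (length only)
         * "abc"  -> 2 words (length + "abc\0")
         * "abcd" -> 3 words (length + "abcd" + a full pad word of NULs) */
        printf("%zu %zu %zu\n", sizeof_str(""), sizeof_str("abc"), sizeof_str("abcd"));
        return 0;
}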
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 060073ebf7a..040c6980df0 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -21,9 +21,10 @@
21 * gcc and need to be kept as close to the original definition as possible to 21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible. 22 * remain compatible.
23 */ 23 */
24#define GCOV_COUNTERS 5 24#define GCOV_COUNTERS 10
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) 25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) 26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_FUNCTION_LENGTH 3
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) 28#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
28#define GCOV_TAG_FOR_COUNTER(count) \ 29#define GCOV_TAG_FOR_COUNTER(count) \
29 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17)) 30 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
@@ -34,10 +35,38 @@ typedef long gcov_type;
34typedef long long gcov_type; 35typedef long long gcov_type;
35#endif 36#endif
36 37
38/*
39 * Source module info. The data structure is used in both runtime and
40 * profile-use phase.
41 */
42struct gcov_module_info {
43 unsigned int ident;
44/*
45 * This is overloaded to mean two things:
46 * (1) means FDO/LIPO in instrumented binary.
47 * (2) means IS_PRIMARY in persistent file or memory copy used in profile-use.
48 */
49 unsigned int is_primary;
50 unsigned int is_exported;
51 unsigned int lang;
52 char *da_filename;
53 char *source_filename;
54 unsigned int num_quote_paths;
55 unsigned int num_bracket_paths;
56 unsigned int num_cpp_defines;
57 unsigned int num_cpp_includes;
58 unsigned int num_cl_args;
59 char *string_array[1];
60};
61
62
37/** 63/**
38 * struct gcov_fn_info - profiling meta data per function 64 * struct gcov_fn_info - profiling meta data per function
39 * @ident: object file-unique function identifier 65 * @ident: object file-unique function identifier
40 * @checksum: function checksum 66 * @lineno_checksum: function lineno checksum
67 * @cfg_checksum: function cfg checksum
68 * @dc_offset: direct call offset
69 * @name: function name
41 * @n_ctrs: number of values per counter type belonging to this function 70 * @n_ctrs: number of values per counter type belonging to this function
42 * 71 *
43 * This data is generated by gcc during compilation and doesn't change 72 * This data is generated by gcc during compilation and doesn't change
@@ -45,7 +74,10 @@ typedef long long gcov_type;
45 */ 74 */
46struct gcov_fn_info { 75struct gcov_fn_info {
47 unsigned int ident; 76 unsigned int ident;
48 unsigned int checksum; 77 unsigned int lineno_checksum;
78 unsigned int cfg_checksum;
79 unsigned int dc_offset;
80 const char *name;
49 unsigned int n_ctrs[0]; 81 unsigned int n_ctrs[0];
50}; 82};
51 83
@@ -67,9 +99,11 @@ struct gcov_ctr_info {
67/** 99/**
68 * struct gcov_info - profiling data per object file 100 * struct gcov_info - profiling data per object file
69 * @version: gcov version magic indicating the gcc version used for compilation 101 * @version: gcov version magic indicating the gcc version used for compilation
102 * @modinfo: additional module information
70 * @next: list head for a singly-linked list 103 * @next: list head for a singly-linked list
71 * @stamp: time stamp 104 * @stamp: time stamp
72 * @filename: name of the associated gcov data file 105 * @filename: name of the associated gcov data file
106 * @eof_pos: end position of profile data
73 * @n_functions: number of instrumented functions 107 * @n_functions: number of instrumented functions
74 * @functions: function data 108 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active 109 * @ctr_mask: mask specifying which counter types are active
@@ -80,9 +114,11 @@ struct gcov_ctr_info {
80 */ 114 */
81struct gcov_info { 115struct gcov_info {
82 unsigned int version; 116 unsigned int version;
117 struct gcov_module_info *mod_info;
83 struct gcov_info *next; 118 struct gcov_info *next;
84 unsigned int stamp; 119 unsigned int stamp;
85 const char *filename; 120 const char *filename;
121 unsigned int eof_pos;
86 unsigned int n_functions; 122 unsigned int n_functions;
87 const struct gcov_fn_info *functions; 123 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask; 124 unsigned int ctr_mask;
diff --git a/kernel/groups.c b/kernel/groups.c
index 6b2588dd04f..1cc476d52dd 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -2,7 +2,7 @@
2 * Supplementary group IDs 2 * Supplementary group IDs
3 */ 3 */
4#include <linux/cred.h> 4#include <linux/cred.h>
5#include <linux/export.h> 5#include <linux/module.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
@@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize)
31 group_info->blocks[0] = group_info->small_block; 31 group_info->blocks[0] = group_info->small_block;
32 else { 32 else {
33 for (i = 0; i < nblocks; i++) { 33 for (i = 0; i < nblocks; i++) {
34 kgid_t *b; 34 gid_t *b;
35 b = (void *)__get_free_page(GFP_USER); 35 b = (void *)__get_free_page(GFP_USER);
36 if (!b) 36 if (!b)
37 goto out_undo_partial_alloc; 37 goto out_undo_partial_alloc;
@@ -66,15 +66,18 @@ EXPORT_SYMBOL(groups_free);
66static int groups_to_user(gid_t __user *grouplist, 66static int groups_to_user(gid_t __user *grouplist,
67 const struct group_info *group_info) 67 const struct group_info *group_info)
68{ 68{
69 struct user_namespace *user_ns = current_user_ns();
70 int i; 69 int i;
71 unsigned int count = group_info->ngroups; 70 unsigned int count = group_info->ngroups;
72 71
73 for (i = 0; i < count; i++) { 72 for (i = 0; i < group_info->nblocks; i++) {
74 gid_t gid; 73 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
75 gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); 74 unsigned int len = cp_count * sizeof(*grouplist);
76 if (put_user(gid, grouplist+i)) 75
76 if (copy_to_user(grouplist, group_info->blocks[i], len))
77 return -EFAULT; 77 return -EFAULT;
78
79 grouplist += NGROUPS_PER_BLOCK;
80 count -= cp_count;
78 } 81 }
79 return 0; 82 return 0;
80} 83}
@@ -83,21 +86,18 @@ static int groups_to_user(gid_t __user *grouplist,
83static int groups_from_user(struct group_info *group_info, 86static int groups_from_user(struct group_info *group_info,
84 gid_t __user *grouplist) 87 gid_t __user *grouplist)
85{ 88{
86 struct user_namespace *user_ns = current_user_ns();
87 int i; 89 int i;
88 unsigned int count = group_info->ngroups; 90 unsigned int count = group_info->ngroups;
89 91
90 for (i = 0; i < count; i++) { 92 for (i = 0; i < group_info->nblocks; i++) {
91 gid_t gid; 93 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
92 kgid_t kgid; 94 unsigned int len = cp_count * sizeof(*grouplist);
93 if (get_user(gid, grouplist+i))
94 return -EFAULT;
95 95
96 kgid = make_kgid(user_ns, gid); 96 if (copy_from_user(group_info->blocks[i], grouplist, len))
97 if (!gid_valid(kgid)) 97 return -EFAULT;
98 return -EINVAL;
99 98
100 GROUP_AT(group_info, i) = kgid; 99 grouplist += NGROUPS_PER_BLOCK;
100 count -= cp_count;
101 } 101 }
102 return 0; 102 return 0;
103} 103}
@@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info)
117 for (base = 0; base < max; base++) { 117 for (base = 0; base < max; base++) {
118 int left = base; 118 int left = base;
119 int right = left + stride; 119 int right = left + stride;
120 kgid_t tmp = GROUP_AT(group_info, right); 120 gid_t tmp = GROUP_AT(group_info, right);
121 121
122 while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { 122 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
123 GROUP_AT(group_info, right) = 123 GROUP_AT(group_info, right) =
124 GROUP_AT(group_info, left); 124 GROUP_AT(group_info, left);
125 right = left; 125 right = left;
@@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info)
132} 132}
133 133
134/* a simple bsearch */ 134/* a simple bsearch */
135int groups_search(const struct group_info *group_info, kgid_t grp) 135int groups_search(const struct group_info *group_info, gid_t grp)
136{ 136{
137 unsigned int left, right; 137 unsigned int left, right;
138 138
@@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
143 right = group_info->ngroups; 143 right = group_info->ngroups;
144 while (left < right) { 144 while (left < right) {
145 unsigned int mid = (left+right)/2; 145 unsigned int mid = (left+right)/2;
146 if (gid_gt(grp, GROUP_AT(group_info, mid))) 146 if (grp > GROUP_AT(group_info, mid))
147 left = mid + 1; 147 left = mid + 1;
148 else if (gid_lt(grp, GROUP_AT(group_info, mid))) 148 else if (grp < GROUP_AT(group_info, mid))
149 right = mid; 149 right = mid;
150 else 150 else
151 return 1; 151 return 1;
@@ -256,24 +256,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
256/* 256/*
257 * Check whether we're fsgid/egid or in the supplemental group.. 257 * Check whether we're fsgid/egid or in the supplemental group..
258 */ 258 */
259int in_group_p(kgid_t grp) 259int in_group_p(gid_t grp)
260{ 260{
261 const struct cred *cred = current_cred(); 261 const struct cred *cred = current_cred();
262 int retval = 1; 262 int retval = 1;
263 263
264 if (!gid_eq(grp, cred->fsgid)) 264 if (grp != cred->fsgid)
265 retval = groups_search(cred->group_info, grp); 265 retval = groups_search(cred->group_info, grp);
266 return retval; 266 return retval;
267} 267}
268 268
269EXPORT_SYMBOL(in_group_p); 269EXPORT_SYMBOL(in_group_p);
270 270
271int in_egroup_p(kgid_t grp) 271int in_egroup_p(gid_t grp)
272{ 272{
273 const struct cred *cred = current_cred(); 273 const struct cred *cred = current_cred();
274 int retval = 1; 274 int retval = 1;
275 275
276 if (!gid_eq(grp, cred->egid)) 276 if (grp != cred->egid)
277 retval = groups_search(cred->group_info, grp); 277 retval = groups_search(cred->group_info, grp);
278 return retval; 278 return retval;
279} 279}
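groups_to_user()/groups_from_user() above copy the supplementary list block by block, NGROUPS_PER_BLOCK gids at a time, rather than one gid per put_user(). The userspace side that triggers that copy-out is plain getgroups(2); a small hypothetical example:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        int n = getgroups(0, NULL);             /* first ask only for the count */
        if (n < 0) {
                perror("getgroups");
                return 1;
        }

        gid_t *list = calloc(n ? n : 1, sizeof(*list));
        if (!list)
                return 1;

        if (getgroups(n, list) < 0) {           /* kernel fills this via groups_to_user() */
                perror("getgroups");
                free(list);
                return 1;
        }
        for (int i = 0; i < n; i++)
                printf("supplementary gid: %u\n", (unsigned)list[i]);
        free(list);
        return 0;
}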
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6db7a5ed52b..2043c08d36c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -32,7 +32,7 @@
32 */ 32 */
33 33
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/export.h> 35#include <linux/module.h>
36#include <linux/percpu.h> 36#include <linux/percpu.h>
37#include <linux/hrtimer.h> 37#include <linux/hrtimer.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
@@ -657,14 +657,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
657 return 0; 657 return 0;
658} 658}
659 659
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
661{
662 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
663 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
664
665 return ktime_get_update_offsets(offs_real, offs_boot);
666}
667
668/* 660/*
669 * Retrigger next event is called after clock was set 661 * Retrigger next event is called after clock was set
670 * 662 *
@@ -673,12 +665,22 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
673static void retrigger_next_event(void *arg) 665static void retrigger_next_event(void *arg)
674{ 666{
675 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 667 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
668 struct timespec realtime_offset, xtim, wtm, sleep;
676 669
677 if (!hrtimer_hres_active()) 670 if (!hrtimer_hres_active())
678 return; 671 return;
679 672
673 /* Optimized out for !HIGH_RES */
674 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
675 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
676
677 /* Adjust CLOCK_REALTIME offset */
680 raw_spin_lock(&base->lock); 678 raw_spin_lock(&base->lock);
681 hrtimer_update_base(base); 679 base->clock_base[HRTIMER_BASE_REALTIME].offset =
680 timespec_to_ktime(realtime_offset);
681 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
682 timespec_to_ktime(sleep);
683
682 hrtimer_force_reprogram(base, 0); 684 hrtimer_force_reprogram(base, 0);
683 raw_spin_unlock(&base->lock); 685 raw_spin_unlock(&base->lock);
684} 686}
@@ -708,25 +710,13 @@ static int hrtimer_switch_to_hres(void)
708 base->clock_base[i].resolution = KTIME_HIGH_RES; 710 base->clock_base[i].resolution = KTIME_HIGH_RES;
709 711
710 tick_setup_sched_timer(); 712 tick_setup_sched_timer();
713
711 /* "Retrigger" the interrupt to get things going */ 714 /* "Retrigger" the interrupt to get things going */
712 retrigger_next_event(NULL); 715 retrigger_next_event(NULL);
713 local_irq_restore(flags); 716 local_irq_restore(flags);
714 return 1; 717 return 1;
715} 718}
716 719
717/*
718 * Called from timekeeping code to reprogramm the hrtimer interrupt
719 * device. If called from the timer interrupt context we defer it to
720 * softirq context.
721 */
722void clock_was_set_delayed(void)
723{
724 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
725
726 cpu_base->clock_was_set = 1;
727 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
728}
729
730#else 720#else
731 721
732static inline int hrtimer_hres_active(void) { return 0; } 722static inline int hrtimer_hres_active(void) { return 0; }
@@ -1260,10 +1250,11 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1260 cpu_base->nr_events++; 1250 cpu_base->nr_events++;
1261 dev->next_event.tv64 = KTIME_MAX; 1251 dev->next_event.tv64 = KTIME_MAX;
1262 1252
1263 raw_spin_lock(&cpu_base->lock); 1253 entry_time = now = ktime_get();
1264 entry_time = now = hrtimer_update_base(cpu_base);
1265retry: 1254retry:
1266 expires_next.tv64 = KTIME_MAX; 1255 expires_next.tv64 = KTIME_MAX;
1256
1257 raw_spin_lock(&cpu_base->lock);
1267 /* 1258 /*
1268 * We set expires_next to KTIME_MAX here with cpu_base->lock 1259 * We set expires_next to KTIME_MAX here with cpu_base->lock
1269 * held to prevent that a timer is enqueued in our queue via 1260 * held to prevent that a timer is enqueued in our queue via
@@ -1339,12 +1330,8 @@ retry:
1339 * We need to prevent that we loop forever in the hrtimer 1330 * We need to prevent that we loop forever in the hrtimer
1340 * interrupt routine. We give it 3 attempts to avoid 1331 * interrupt routine. We give it 3 attempts to avoid
1341 * overreacting on some spurious event. 1332 * overreacting on some spurious event.
1342 *
1343 * Acquire base lock for updating the offsets and retrieving
1344 * the current time.
1345 */ 1333 */
1346 raw_spin_lock(&cpu_base->lock); 1334 now = ktime_get();
1347 now = hrtimer_update_base(cpu_base);
1348 cpu_base->nr_retries++; 1335 cpu_base->nr_retries++;
1349 if (++retries < 3) 1336 if (++retries < 3)
1350 goto retry; 1337 goto retry;
@@ -1356,7 +1343,6 @@ retry:
1356 */ 1343 */
1357 cpu_base->nr_hangs++; 1344 cpu_base->nr_hangs++;
1358 cpu_base->hang_detected = 1; 1345 cpu_base->hang_detected = 1;
1359 raw_spin_unlock(&cpu_base->lock);
1360 delta = ktime_sub(now, entry_time); 1346 delta = ktime_sub(now, entry_time);
1361 if (delta.tv64 > cpu_base->max_hang_time.tv64) 1347 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1362 cpu_base->max_hang_time = delta; 1348 cpu_base->max_hang_time = delta;
@@ -1409,13 +1395,6 @@ void hrtimer_peek_ahead_timers(void)
1409 1395
1410static void run_hrtimer_softirq(struct softirq_action *h) 1396static void run_hrtimer_softirq(struct softirq_action *h)
1411{ 1397{
1412 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1413
1414 if (cpu_base->clock_was_set) {
1415 cpu_base->clock_was_set = 0;
1416 clock_was_set();
1417 }
1418
1419 hrtimer_peek_ahead_timers(); 1398 hrtimer_peek_ahead_timers();
1420} 1399}
1421 1400
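For context, retrigger_next_event() above refreshes the CLOCK_REALTIME/CLOCK_BOOTTIME offsets that every armed hrtimer is expired against. A hedged sketch of a typical consumer of that machinery, a made-up module (demo_timer, demo_fn are invented names) arming a periodic CLOCK_MONOTONIC hrtimer:

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/module.h>

#define DEMO_PERIOD_NS  (100 * NSEC_PER_MSEC)

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_fn(struct hrtimer *t)
{
        /* push the expiry forward by one period and keep the timer running */
        hrtimer_forward_now(t, ktime_set(0, DEMO_PERIOD_NS));
        return HRTIMER_RESTART;
}

static int __init hrtimer_demo_init(void)
{
        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        demo_timer.function = demo_fn;
        hrtimer_start(&demo_timer, ktime_set(0, DEMO_PERIOD_NS), HRTIMER_MODE_REL);
        return 0;
}

static void __exit hrtimer_demo_exit(void)
{
        hrtimer_cancel(&demo_timer);
}

module_init(hrtimer_demo_init);
module_exit(hrtimer_demo_exit);
MODULE_LICENSE("GPL");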
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 6df614912b9..e972276f12f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -13,7 +13,7 @@
13#include <linux/freezer.h> 13#include <linux/freezer.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/lockdep.h> 15#include <linux/lockdep.h>
16#include <linux/export.h> 16#include <linux/module.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18 18
19/* 19/*
@@ -108,10 +108,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
108 108
109 touch_nmi_watchdog(); 109 touch_nmi_watchdog();
110 110
111 if (sysctl_hung_task_panic) { 111 if (sysctl_hung_task_panic)
112 trigger_all_cpu_backtrace();
113 panic("hung_task: blocked tasks"); 112 panic("hung_task: blocked tasks");
114 }
115} 113}
116 114
117/* 115/*
@@ -121,20 +119,15 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
121 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 119 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
122 * to exit the grace period. For classic RCU, a reschedule is required. 120 * to exit the grace period. For classic RCU, a reschedule is required.
123 */ 121 */
124static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) 122static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
125{ 123{
126 bool can_cont;
127
128 get_task_struct(g); 124 get_task_struct(g);
129 get_task_struct(t); 125 get_task_struct(t);
130 rcu_read_unlock(); 126 rcu_read_unlock();
131 cond_resched(); 127 cond_resched();
132 rcu_read_lock(); 128 rcu_read_lock();
133 can_cont = pid_alive(g) && pid_alive(t);
134 put_task_struct(t); 129 put_task_struct(t);
135 put_task_struct(g); 130 put_task_struct(g);
136
137 return can_cont;
138} 131}
139 132
140/* 133/*
@@ -161,7 +154,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
161 goto unlock; 154 goto unlock;
162 if (!--batch_count) { 155 if (!--batch_count) {
163 batch_count = HUNG_TASK_BATCHING; 156 batch_count = HUNG_TASK_BATCHING;
164 if (!rcu_lock_break(g, t)) 157 rcu_lock_break(g, t);
158 /* Exit if t or g was unhashed during refresh. */
159 if (t->state == TASK_DEAD || g->state == TASK_DEAD)
165 goto unlock; 160 goto unlock;
166 } 161 }
167 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ 162 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d1a758bc972..5a38bf4de64 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -13,7 +13,7 @@ config GENERIC_HARDIRQS
13# Options selectable by the architecture code 13# Options selectable by the architecture code
14 14
15# Make sparse irq Kconfig switch below available 15# Make sparse irq Kconfig switch below available
16config MAY_HAVE_SPARSE_IRQ 16config HAVE_SPARSE_IRQ
17 bool 17 bool
18 18
19# Enable the generic irq autoprobe mechanism 19# Enable the generic irq autoprobe mechanism
@@ -56,22 +56,13 @@ config GENERIC_IRQ_CHIP
56config IRQ_DOMAIN 56config IRQ_DOMAIN
57 bool 57 bool
58 58
59config IRQ_DOMAIN_DEBUG
60 bool "Expose hardware/virtual IRQ mapping via debugfs"
61 depends on IRQ_DOMAIN && DEBUG_FS
62 help
63 This option will show the mapping relationship between hardware irq
64 numbers and Linux irq numbers. The mapping is exposed via debugfs
65 in the file "irq_domain_mapping".
66
67 If you don't know what this means you don't need it.
68
69# Support forced irq threading 59# Support forced irq threading
70config IRQ_FORCED_THREADING 60config IRQ_FORCED_THREADING
71 bool 61 bool
72 62
73config SPARSE_IRQ 63config SPARSE_IRQ
74 bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ 64 bool "Support sparse irq numbering"
65 depends on HAVE_SPARSE_IRQ
75 ---help--- 66 ---help---
76 67
77 Sparse irq numbering is useful for distro kernels that want 68 Sparse irq numbering is useful for distro kernels that want
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 0119b9d467a..342d8f44e40 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)
 			if (desc->irq_data.chip->irq_set_type)
 				desc->irq_data.chip->irq_set_type(&desc->irq_data,
 							 IRQ_TYPE_PROBE);
-			irq_startup(desc, false);
+			irq_startup(desc);
 		}
 		raw_spin_unlock_irq(&desc->lock);
 	}
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void)
 		raw_spin_lock_irq(&desc->lock);
 		if (!desc->action && irq_settings_can_probe(desc)) {
 			desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
-			if (irq_startup(desc, false))
+			if (irq_startup(desc))
 				desc->istate |= IRQS_PENDING;
 		}
 		raw_spin_unlock_irq(&desc->lock);
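
The two hunks above only track the irq_startup() signature change; the driver-facing autoprobe API is untouched. A hedged sketch of how a legacy driver uses it (the device trigger step and the delay are placeholders):

	unsigned long mask;
	int irq;

	mask = probe_irq_on();
	/* ... program the device so it raises its interrupt here ... */
	udelay(100);			/* give the line time to fire */
	irq = probe_irq_off(mask);	/* >0: the probed irq, 0: none, <0: several fired */
	if (irq <= 0)
		pr_warn("IRQ autoprobe failed\n");
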
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3aca9f29d30..dc5114b4c16 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -16,8 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18 18
19#include <trace/events/irq.h>
20
21#include "internals.h" 19#include "internals.h"
22 20
23/** 21/**
@@ -28,7 +26,7 @@
28int irq_set_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
29{ 27{
30 unsigned long flags; 28 unsigned long flags;
31 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); 29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
32 30
33 if (!desc) 31 if (!desc)
34 return -EINVAL; 32 return -EINVAL;
@@ -56,14 +54,15 @@ EXPORT_SYMBOL(irq_set_chip);
56int irq_set_irq_type(unsigned int irq, unsigned int type) 54int irq_set_irq_type(unsigned int irq, unsigned int type)
57{ 55{
58 unsigned long flags; 56 unsigned long flags;
59 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
60 int ret = 0; 58 int ret = 0;
61 59
62 if (!desc) 60 if (!desc)
63 return -EINVAL; 61 return -EINVAL;
64 62
65 type &= IRQ_TYPE_SENSE_MASK; 63 type &= IRQ_TYPE_SENSE_MASK;
66 ret = __irq_set_trigger(desc, irq, type); 64 if (type != IRQ_TYPE_NONE)
65 ret = __irq_set_trigger(desc, irq, type);
67 irq_put_desc_busunlock(desc, flags); 66 irq_put_desc_busunlock(desc, flags);
68 return ret; 67 return ret;
69} 68}
@@ -79,7 +78,7 @@ EXPORT_SYMBOL(irq_set_irq_type);
79int irq_set_handler_data(unsigned int irq, void *data) 78int irq_set_handler_data(unsigned int irq, void *data)
80{ 79{
81 unsigned long flags; 80 unsigned long flags;
82 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); 81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
83 82
84 if (!desc) 83 if (!desc)
85 return -EINVAL; 84 return -EINVAL;
@@ -99,7 +98,7 @@ EXPORT_SYMBOL(irq_set_handler_data);
99int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) 98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
100{ 99{
101 unsigned long flags; 100 unsigned long flags;
102 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
103 102
104 if (!desc) 103 if (!desc)
105 return -EINVAL; 104 return -EINVAL;
@@ -120,7 +119,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
120int irq_set_chip_data(unsigned int irq, void *data) 119int irq_set_chip_data(unsigned int irq, void *data)
121{ 120{
122 unsigned long flags; 121 unsigned long flags;
123 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); 122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
124 123
125 if (!desc) 124 if (!desc)
126 return -EINVAL; 125 return -EINVAL;
@@ -158,22 +157,19 @@ static void irq_state_set_masked(struct irq_desc *desc)
158 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); 157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
159} 158}
160 159
161int irq_startup(struct irq_desc *desc, bool resend) 160int irq_startup(struct irq_desc *desc)
162{ 161{
163 int ret = 0;
164
165 irq_state_clr_disabled(desc); 162 irq_state_clr_disabled(desc);
166 desc->depth = 0; 163 desc->depth = 0;
167 164
168 if (desc->irq_data.chip->irq_startup) { 165 if (desc->irq_data.chip->irq_startup) {
169 ret = desc->irq_data.chip->irq_startup(&desc->irq_data); 166 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
170 irq_state_clr_masked(desc); 167 irq_state_clr_masked(desc);
171 } else { 168 return ret;
172 irq_enable(desc);
173 } 169 }
174 if (resend) 170
175 check_irq_resend(desc, desc->irq_data.irq); 171 irq_enable(desc);
176 return ret; 172 return 0;
177} 173}
178 174
179void irq_shutdown(struct irq_desc *desc) 175void irq_shutdown(struct irq_desc *desc)
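
The hunk above changes how the core invokes a chip's ->irq_startup() hook and whether a pending interrupt is resent afterwards; the hook itself is still supplied by irq controller drivers in the usual way. A minimal assumed example, with the hardware register accesses stubbed out:

	static void my_chip_mask(struct irq_data *d)   { /* mask d->hwirq in hardware */ }
	static void my_chip_unmask(struct irq_data *d) { /* unmask d->hwirq in hardware */ }

	static unsigned int my_chip_startup(struct irq_data *d)
	{
		my_chip_unmask(d);
		return 0;	/* non-zero is treated as "already pending" by autoprobe */
	}

	static struct irq_chip my_chip = {
		.name		= "MYCHIP",
		.irq_startup	= my_chip_startup,
		.irq_mask	= my_chip_mask,
		.irq_unmask	= my_chip_unmask,
	};
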
@@ -208,24 +204,6 @@ void irq_disable(struct irq_desc *desc)
208 } 204 }
209} 205}
210 206
211void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
212{
213 if (desc->irq_data.chip->irq_enable)
214 desc->irq_data.chip->irq_enable(&desc->irq_data);
215 else
216 desc->irq_data.chip->irq_unmask(&desc->irq_data);
217 cpumask_set_cpu(cpu, desc->percpu_enabled);
218}
219
220void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
221{
222 if (desc->irq_data.chip->irq_disable)
223 desc->irq_data.chip->irq_disable(&desc->irq_data);
224 else
225 desc->irq_data.chip->irq_mask(&desc->irq_data);
226 cpumask_clear_cpu(cpu, desc->percpu_enabled);
227}
228
229static inline void mask_ack_irq(struct irq_desc *desc) 207static inline void mask_ack_irq(struct irq_desc *desc)
230{ 208{
231 if (desc->irq_data.chip->irq_mask_ack) 209 if (desc->irq_data.chip->irq_mask_ack)
@@ -272,14 +250,11 @@ void handle_nested_irq(unsigned int irq)
272 250
273 raw_spin_lock_irq(&desc->lock); 251 raw_spin_lock_irq(&desc->lock);
274 252
275 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
276 kstat_incr_irqs_this_cpu(irq, desc); 253 kstat_incr_irqs_this_cpu(irq, desc);
277 254
278 action = desc->action; 255 action = desc->action;
279 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { 256 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
280 desc->istate |= IRQS_PENDING;
281 goto out_unlock; 257 goto out_unlock;
282 }
283 258
284 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); 259 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
285 raw_spin_unlock_irq(&desc->lock); 260 raw_spin_unlock_irq(&desc->lock);
@@ -327,10 +302,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
327 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 302 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
328 kstat_incr_irqs_this_cpu(irq, desc); 303 kstat_incr_irqs_this_cpu(irq, desc);
329 304
330 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { 305 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
331 desc->istate |= IRQS_PENDING;
332 goto out_unlock; 306 goto out_unlock;
333 }
334 307
335 handle_irq_event(desc); 308 handle_irq_event(desc);
336 309
@@ -339,24 +312,6 @@ out_unlock:
339} 312}
340EXPORT_SYMBOL_GPL(handle_simple_irq); 313EXPORT_SYMBOL_GPL(handle_simple_irq);
341 314
342/*
343 * Called unconditionally from handle_level_irq() and only for oneshot
344 * interrupts from handle_fasteoi_irq()
345 */
346static void cond_unmask_irq(struct irq_desc *desc)
347{
348 /*
349 * We need to unmask in the following cases:
350 * - Standard level irq (IRQF_ONESHOT is not set)
351 * - Oneshot irq which did not wake the thread (caused by a
352 * spurious interrupt or a primary handler handling it
353 * completely).
354 */
355 if (!irqd_irq_disabled(&desc->irq_data) &&
356 irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot)
357 unmask_irq(desc);
358}
359
360/** 315/**
361 * handle_level_irq - Level type irq handler 316 * handle_level_irq - Level type irq handler
362 * @irq: the interrupt number 317 * @irq: the interrupt number
@@ -384,15 +339,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
384 * If its disabled or no action available 339 * If its disabled or no action available
385 * keep it masked and get out of here 340 * keep it masked and get out of here
386 */ 341 */
387 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { 342 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
388 desc->istate |= IRQS_PENDING;
389 goto out_unlock; 343 goto out_unlock;
390 }
391 344
392 handle_irq_event(desc); 345 handle_irq_event(desc);
393 346
394 cond_unmask_irq(desc); 347 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
395 348 unmask_irq(desc);
396out_unlock: 349out_unlock:
397 raw_spin_unlock(&desc->lock); 350 raw_spin_unlock(&desc->lock);
398} 351}
@@ -446,9 +399,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
446 preflow_handler(desc); 399 preflow_handler(desc);
447 handle_irq_event(desc); 400 handle_irq_event(desc);
448 401
449 if (desc->istate & IRQS_ONESHOT)
450 cond_unmask_irq(desc);
451
452out_eoi: 402out_eoi:
453 desc->irq_data.chip->irq_eoi(&desc->irq_data); 403 desc->irq_data.chip->irq_eoi(&desc->irq_data);
454out_unlock: 404out_unlock:
@@ -525,7 +475,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
525out_unlock: 475out_unlock:
526 raw_spin_unlock(&desc->lock); 476 raw_spin_unlock(&desc->lock);
527} 477}
528EXPORT_SYMBOL(handle_edge_irq);
529 478
530#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER 479#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
531/** 480/**
@@ -595,44 +544,12 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
595 chip->irq_eoi(&desc->irq_data); 544 chip->irq_eoi(&desc->irq_data);
596} 545}
597 546
598/**
599 * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
600 * @irq: the interrupt number
601 * @desc: the interrupt description structure for this irq
602 *
603 * Per CPU interrupts on SMP machines without locking requirements. Same as
604 * handle_percpu_irq() above but with the following extras:
605 *
606 * action->percpu_dev_id is a pointer to percpu variables which
607 * contain the real device id for the cpu on which this handler is
608 * called
609 */
610void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
611{
612 struct irq_chip *chip = irq_desc_get_chip(desc);
613 struct irqaction *action = desc->action;
614 void *dev_id = __this_cpu_ptr(action->percpu_dev_id);
615 irqreturn_t res;
616
617 kstat_incr_irqs_this_cpu(irq, desc);
618
619 if (chip->irq_ack)
620 chip->irq_ack(&desc->irq_data);
621
622 trace_irq_handler_entry(irq, action);
623 res = action->handler(irq, dev_id);
624 trace_irq_handler_exit(irq, action, res);
625
626 if (chip->irq_eoi)
627 chip->irq_eoi(&desc->irq_data);
628}
629
630void 547void
631__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 548__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
632 const char *name) 549 const char *name)
633{ 550{
634 unsigned long flags; 551 unsigned long flags;
635 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); 552 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
636 553
637 if (!desc) 554 if (!desc)
638 return; 555 return;
@@ -658,7 +575,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
658 irq_settings_set_noprobe(desc); 575 irq_settings_set_noprobe(desc);
659 irq_settings_set_norequest(desc); 576 irq_settings_set_norequest(desc);
660 irq_settings_set_nothread(desc); 577 irq_settings_set_nothread(desc);
661 irq_startup(desc, true); 578 irq_startup(desc);
662 } 579 }
663out: 580out:
664 irq_put_desc_busunlock(desc, flags); 581 irq_put_desc_busunlock(desc, flags);
@@ -672,12 +589,11 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
672 irq_set_chip(irq, chip); 589 irq_set_chip(irq, chip);
673 __irq_set_handler(irq, handle, 0, name); 590 __irq_set_handler(irq, handle, 0, name);
674} 591}
675EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
676 592
677void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 593void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
678{ 594{
679 unsigned long flags; 595 unsigned long flags;
680 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); 596 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
681 597
682 if (!desc) 598 if (!desc)
683 return; 599 return;
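
Most of the chip.c hunks above add or drop EXPORT_SYMBOL()s and the descriptor-check argument; the way a platform driver installs a chip and flow handler is the same in both trees. A short sketch, with placeholder chip and cookie names:

	irq_set_chip_and_handler_name(irq, &my_chip, handle_level_irq, "level");
	irq_set_chip_data(irq, my_chip_regs);		/* read back via irq_data_get_irq_chip_data() */
	irq_set_handler_data(irq, my_handler_cookie);	/* read back via irq_get_handler_data() */
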
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index e75e29e4434..97a8bfadc88 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -4,10 +4,10 @@
4 4
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6 6
7#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) 7#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
8#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) 8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9/* FIXME */ 9/* FIXME */
10#define ___PD(f) do { } while (0) 10#define PD(f) do { } while (0)
11 11
12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
13{ 13{
@@ -23,23 +23,23 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
23 print_symbol("%s\n", (unsigned long)desc->action->handler); 23 print_symbol("%s\n", (unsigned long)desc->action->handler);
24 } 24 }
25 25
26 ___P(IRQ_LEVEL); 26 P(IRQ_LEVEL);
27 ___P(IRQ_PER_CPU); 27 P(IRQ_PER_CPU);
28 ___P(IRQ_NOPROBE); 28 P(IRQ_NOPROBE);
29 ___P(IRQ_NOREQUEST); 29 P(IRQ_NOREQUEST);
30 ___P(IRQ_NOTHREAD); 30 P(IRQ_NOTHREAD);
31 ___P(IRQ_NOAUTOEN); 31 P(IRQ_NOAUTOEN);
32 32
33 ___PS(IRQS_AUTODETECT); 33 PS(IRQS_AUTODETECT);
34 ___PS(IRQS_REPLAY); 34 PS(IRQS_REPLAY);
35 ___PS(IRQS_WAITING); 35 PS(IRQS_WAITING);
36 ___PS(IRQS_PENDING); 36 PS(IRQS_PENDING);
37 37
38 ___PD(IRQS_INPROGRESS); 38 PD(IRQS_INPROGRESS);
39 ___PD(IRQS_DISABLED); 39 PD(IRQS_DISABLED);
40 ___PD(IRQS_MASKED); 40 PD(IRQS_MASKED);
41} 41}
42 42
43#undef ___P 43#undef P
44#undef ___PS 44#undef PS
45#undef ___PD 45#undef PD
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 988dc58e884..b5fcd96c710 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -6,7 +6,6 @@
6 */ 6 */
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/irq.h> 8#include <linux/irq.h>
9#include <linux/export.h>
10 9
11#include "internals.h" 10#include "internals.h"
12 11
@@ -58,4 +57,3 @@ struct irq_chip dummy_irq_chip = {
58 .irq_mask = noop, 57 .irq_mask = noop,
59 .irq_unmask = noop, 58 .irq_unmask = noop,
60}; 59};
61EXPORT_SYMBOL_GPL(dummy_irq_chip);
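
The dummychip.c change drops the <linux/export.h> include together with the EXPORT_SYMBOL_GPL() of dummy_irq_chip; the two belong together, since the macro is what makes a built-in symbol usable from modules. A generic sketch with a placeholder chip:

	#include <linux/export.h>
	#include <linux/irq.h>

	static void my_noop(struct irq_data *d) { }

	struct irq_chip my_shared_chip = {
		.name		= "MYCHIP-NOOP",
		.irq_mask	= my_noop,
		.irq_unmask	= my_noop,
	};
	EXPORT_SYMBOL_GPL(my_shared_chip);	/* visible to GPL-compatible modules */
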
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c89295a8f66..e38544dddb1 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -6,7 +6,6 @@
6#include <linux/io.h> 6#include <linux/io.h>
7#include <linux/irq.h> 7#include <linux/irq.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/export.h>
10#include <linux/interrupt.h> 9#include <linux/interrupt.h>
11#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
12#include <linux/syscore_ops.h> 11#include <linux/syscore_ops.h>
@@ -212,7 +211,6 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
212 } 211 }
213 return gc; 212 return gc;
214} 213}
215EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
216 214
217/* 215/*
218 * Separate lockdep class for interrupt chip which can nest irq_desc 216 * Separate lockdep class for interrupt chip which can nest irq_desc
@@ -260,7 +258,6 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
260 } 258 }
261 gc->irq_cnt = i - gc->irq_base; 259 gc->irq_cnt = i - gc->irq_base;
262} 260}
263EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
264 261
265/** 262/**
266 * irq_setup_alt_chip - Switch to alternative chip 263 * irq_setup_alt_chip - Switch to alternative chip
@@ -284,7 +281,6 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
284 } 281 }
285 return -EINVAL; 282 return -EINVAL;
286} 283}
287EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
288 284
289/** 285/**
290 * irq_remove_generic_chip - Remove a chip 286 * irq_remove_generic_chip - Remove a chip
@@ -315,7 +311,6 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
315 irq_modify_status(i, clr, set); 311 irq_modify_status(i, clr, set);
316 } 312 }
317} 313}
318EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
319 314
320#ifdef CONFIG_PM 315#ifdef CONFIG_PM
321static int irq_gc_suspend(void) 316static int irq_gc_suspend(void)
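
The generic-chip.c hunks only strip EXPORT_SYMBOL_GPL()s; the helper API that platform interrupt controllers build on is unchanged. A sketch of typical use, where the register offsets, the 32-line mask and the MYINTC names are assumptions:

	struct irq_chip_generic *gc;
	struct irq_chip_type *ct;

	gc = irq_alloc_generic_chip("MYINTC", 1, irq_base, reg_base,
				    handle_level_irq);
	if (!gc)
		return -ENOMEM;
	ct = gc->chip_types;
	ct->chip.irq_mask   = irq_gc_mask_set_bit;
	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
	ct->chip.irq_ack    = irq_gc_ack_set_bit;
	ct->regs.mask       = MYINTC_MASK_OFFSET;
	ct->regs.ack        = MYINTC_ACK_OFFSET;
	irq_setup_generic_chip(gc, IRQ_MSK(32), IRQ_GC_INIT_MASK_CACHE,
			       IRQ_NOREQUEST, 0);
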
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 131ca176b49..470d08c82bb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,18 +54,14 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55{ 55{
56 /* 56 /*
57 * In case the thread crashed and was killed we just pretend that 57 * Wake up the handler thread for this action. In case the
58 * we handled the interrupt. The hardirq handler has disabled the 58 * thread crashed and was killed we just pretend that we
59 * device interrupt, so no irq storm is lurking. 59 * handled the interrupt. The hardirq handler has disabled the
60 */ 60 * device interrupt, so no irq storm is lurking. If the
61 if (action->thread->flags & PF_EXITING)
62 return;
63
64 /*
65 * Wake up the handler thread for this action. If the
66 * RUNTHREAD bit is already set, nothing to do. 61 * RUNTHREAD bit is already set, nothing to do.
67 */ 62 */
68 if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) 63 if (test_bit(IRQTF_DIED, &action->thread_flags) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
69 return; 65 return;
70 66
71 /* 67 /*
@@ -114,18 +110,6 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
114 * threads_oneshot untouched and runs the thread another time. 110 * threads_oneshot untouched and runs the thread another time.
115 */ 111 */
116 desc->threads_oneshot |= action->thread_mask; 112 desc->threads_oneshot |= action->thread_mask;
117
118 /*
119 * We increment the threads_active counter in case we wake up
120 * the irq thread. The irq thread decrements the counter when
121 * it returns from the handler or in the exit path and wakes
122 * up waiters which are stuck in synchronize_irq() when the
123 * active count becomes zero. synchronize_irq() is serialized
124 * against this code (hard irq handler) via IRQS_INPROGRESS
125 * like the finalize_oneshot() code. See comment above.
126 */
127 atomic_inc(&desc->threads_active);
128
129 wake_up_process(action->thread); 113 wake_up_process(action->thread);
130} 114}
131 115
@@ -133,7 +117,7 @@ irqreturn_t
133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) 117handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
134{ 118{
135 irqreturn_t retval = IRQ_NONE; 119 irqreturn_t retval = IRQ_NONE;
136 unsigned int flags = 0, irq = desc->irq_data.irq; 120 unsigned int random = 0, irq = desc->irq_data.irq;
137 121
138 do { 122 do {
139 irqreturn_t res; 123 irqreturn_t res;
@@ -161,7 +145,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
161 145
162 /* Fall through to add to randomness */ 146 /* Fall through to add to randomness */
163 case IRQ_HANDLED: 147 case IRQ_HANDLED:
164 flags |= action->flags; 148 random |= action->flags;
165 break; 149 break;
166 150
167 default: 151 default:
@@ -172,7 +156,8 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
172 action = action->next; 156 action = action->next;
173 } while (action); 157 } while (action);
174 158
175 add_interrupt_randomness(irq, flags); 159 if (random & IRQF_SAMPLE_RANDOM)
160 add_interrupt_randomness(irq);
176 161
177 if (!noirqdebug) 162 if (!noirqdebug)
178 note_interrupt(irq, desc, retval); 163 note_interrupt(irq, desc, retval);
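
This handle.c hunk restores the older entropy scheme, where add_interrupt_randomness() runs only for actions registered with IRQF_SAMPLE_RANDOM; the newer code samples every interrupt and passes the accumulated action flags instead. Under the older scheme a driver opted in at request time, roughly as below (handler and cookie names are placeholders):

	/* Old-style opt-in; IRQF_SAMPLE_RANDOM was later removed from mainline. */
	ret = request_irq(irq, my_isr, IRQF_SHARED | IRQF_SAMPLE_RANDOM,
			  "mydev", dev);
	if (ret)
		pr_err("mydev: cannot request IRQ %d\n", irq);
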
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 001fa5bab49..6546431447d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,17 +15,19 @@
15 15
16#define istate core_internal_state__do_not_mess_with_it 16#define istate core_internal_state__do_not_mess_with_it
17 17
18extern bool noirqdebug; 18extern int noirqdebug;
19 19
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run 22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
23 * IRQTF_DIED - handler thread died
23 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed 24 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
24 * IRQTF_AFFINITY - irq thread is requested to adjust affinity 25 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
25 * IRQTF_FORCED_THREAD - irq action is force threaded 26 * IRQTF_FORCED_THREAD - irq action is force threaded
26 */ 27 */
27enum { 28enum {
28 IRQTF_RUNTHREAD, 29 IRQTF_RUNTHREAD,
30 IRQTF_DIED,
29 IRQTF_WARNED, 31 IRQTF_WARNED,
30 IRQTF_AFFINITY, 32 IRQTF_AFFINITY,
31 IRQTF_FORCED_THREAD, 33 IRQTF_FORCED_THREAD,
@@ -65,12 +67,10 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
65extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
66extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
67 69
68extern int irq_startup(struct irq_desc *desc, bool resend); 70extern int irq_startup(struct irq_desc *desc);
69extern void irq_shutdown(struct irq_desc *desc); 71extern void irq_shutdown(struct irq_desc *desc);
70extern void irq_enable(struct irq_desc *desc); 72extern void irq_enable(struct irq_desc *desc);
71extern void irq_disable(struct irq_desc *desc); 73extern void irq_disable(struct irq_desc *desc);
72extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
73extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
74extern void mask_irq(struct irq_desc *desc); 74extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc); 75extern void unmask_irq(struct irq_desc *desc);
76 76
@@ -101,9 +101,6 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
101 101
102extern void irq_set_thread_affinity(struct irq_desc *desc); 102extern void irq_set_thread_affinity(struct irq_desc *desc);
103 103
104extern int irq_do_set_affinity(struct irq_data *data,
105 const struct cpumask *dest, bool force);
106
107/* Inline functions for support of irq chips on slow busses */ 104/* Inline functions for support of irq chips on slow busses */
108static inline void chip_bus_lock(struct irq_desc *desc) 105static inline void chip_bus_lock(struct irq_desc *desc)
109{ 106{
@@ -117,21 +114,14 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
117 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 114 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
118} 115}
119 116
120#define _IRQ_DESC_CHECK (1 << 0)
121#define _IRQ_DESC_PERCPU (1 << 1)
122
123#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
124#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
125
126struct irq_desc * 117struct irq_desc *
127__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, 118__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
128 unsigned int check);
129void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); 119void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
130 120
131static inline struct irq_desc * 121static inline struct irq_desc *
132irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check) 122irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
133{ 123{
134 return __irq_get_desc_lock(irq, flags, true, check); 124 return __irq_get_desc_lock(irq, flags, true);
135} 125}
136 126
137static inline void 127static inline void
@@ -141,9 +131,9 @@ irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
141} 131}
142 132
143static inline struct irq_desc * 133static inline struct irq_desc *
144irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check) 134irq_get_desc_lock(unsigned int irq, unsigned long *flags)
145{ 135{
146 return __irq_get_desc_lock(irq, flags, false, check); 136 return __irq_get_desc_lock(irq, flags, false);
147} 137}
148 138
149static inline void 139static inline void
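
The internals.h hunk removes the per-CPU-devid check argument from the descriptor locking helpers; core-code callers keep the same lock, modify, unlock shape either way. A sketch against the newer three-argument form shown in the left column:

	unsigned long flags;
	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);

	if (!desc)
		return -EINVAL;
	/* ... adjust descriptor state under desc->lock ... */
	irq_put_desc_unlock(desc, flags);
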
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 192a302d6cf..039b889ea05 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -9,7 +9,7 @@
9 */ 9 */
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/export.h> 12#include <linux/module.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h> 15#include <linux/radix-tree.h>
@@ -112,7 +112,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
112{ 112{
113 return radix_tree_lookup(&irq_desc_tree, irq); 113 return radix_tree_lookup(&irq_desc_tree, irq);
114} 114}
115EXPORT_SYMBOL(irq_to_desc);
116 115
117static void delete_irq_desc(unsigned int irq) 116static void delete_irq_desc(unsigned int irq)
118{ 117{
@@ -425,22 +424,11 @@ unsigned int irq_get_next_irq(unsigned int offset)
425} 424}
426 425
427struct irq_desc * 426struct irq_desc *
428__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, 427__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
429 unsigned int check)
430{ 428{
431 struct irq_desc *desc = irq_to_desc(irq); 429 struct irq_desc *desc = irq_to_desc(irq);
432 430
433 if (desc) { 431 if (desc) {
434 if (check & _IRQ_DESC_CHECK) {
435 if ((check & _IRQ_DESC_PERCPU) &&
436 !irq_settings_is_per_cpu_devid(desc))
437 return NULL;
438
439 if (!(check & _IRQ_DESC_PERCPU) &&
440 irq_settings_is_per_cpu_devid(desc))
441 return NULL;
442 }
443
444 if (bus) 432 if (bus)
445 chip_bus_lock(desc); 433 chip_bus_lock(desc);
446 raw_spin_lock_irqsave(&desc->lock, *flags); 434 raw_spin_lock_irqsave(&desc->lock, *flags);
@@ -455,25 +443,6 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
455 chip_bus_sync_unlock(desc); 443 chip_bus_sync_unlock(desc);
456} 444}
457 445
458int irq_set_percpu_devid(unsigned int irq)
459{
460 struct irq_desc *desc = irq_to_desc(irq);
461
462 if (!desc)
463 return -EINVAL;
464
465 if (desc->percpu_enabled)
466 return -EINVAL;
467
468 desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
469
470 if (!desc->percpu_enabled)
471 return -ENOMEM;
472
473 irq_set_percpu_devid_flags(irq);
474 return 0;
475}
476
477/** 446/**
478 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 447 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
479 * @irq: irq number to initialize 448 * @irq: irq number to initialize
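
irqdesc.c loses irq_set_percpu_devid() in this revert. On trees that have it, a per-CPU interrupt (an ARM local timer, for instance) is set up roughly as sketched below; the handler and the per-CPU data names are placeholders:

	static DEFINE_PER_CPU(struct my_dev_data, my_dev_data);

	static int my_setup_local_irq(unsigned int irq)
	{
		int err;

		irq_set_percpu_devid(irq);
		err = request_percpu_irq(irq, my_percpu_isr, "my-local-timer",
					 &my_dev_data);
		if (err)
			return err;

		/* each CPU that should receive the interrupt then calls: */
		enable_percpu_irq(irq, IRQ_TYPE_NONE);
		return 0;
	}
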
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 96f3a1d9c37..b57a3776de4 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,925 +1,184 @@
1#define pr_fmt(fmt) "irq: " fmt
2
3#include <linux/debugfs.h>
4#include <linux/hardirq.h>
5#include <linux/interrupt.h>
6#include <linux/irq.h> 1#include <linux/irq.h>
7#include <linux/irqdesc.h>
8#include <linux/irqdomain.h> 2#include <linux/irqdomain.h>
9#include <linux/module.h> 3#include <linux/module.h>
10#include <linux/mutex.h> 4#include <linux/mutex.h>
11#include <linux/of.h> 5#include <linux/of.h>
12#include <linux/of_address.h> 6#include <linux/of_address.h>
13#include <linux/topology.h>
14#include <linux/seq_file.h>
15#include <linux/slab.h> 7#include <linux/slab.h>
16#include <linux/smp.h>
17#include <linux/fs.h>
18
19#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
20 * ie. legacy 8259, gets irqs 1..15 */
21#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
22#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
23#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
24 8
25static LIST_HEAD(irq_domain_list); 9static LIST_HEAD(irq_domain_list);
26static DEFINE_MUTEX(irq_domain_mutex); 10static DEFINE_MUTEX(irq_domain_mutex);
27 11
28static DEFINE_MUTEX(revmap_trees_mutex);
29static struct irq_domain *irq_default_domain;
30
31/** 12/**
32 * irq_domain_alloc() - Allocate a new irq_domain data structure 13 * irq_domain_add() - Register an irq_domain
33 * @of_node: optional device-tree node of the interrupt controller 14 * @domain: ptr to initialized irq_domain structure
34 * @revmap_type: type of reverse mapping to use
35 * @ops: map/unmap domain callbacks
36 * @host_data: Controller private data pointer
37 * 15 *
38 * Allocates and initialize and irq_domain structure. Caller is expected to 16 * Registers an irq_domain structure. The irq_domain must at a minimum be
39 * register allocated irq_domain with irq_domain_register(). Returns pointer 17 * initialized with an ops structure pointer, and either a ->to_irq hook or
40 * to IRQ domain, or NULL on failure. 18 * a valid irq_base value. Everything else is optional.
41 */ 19 */
42static struct irq_domain *irq_domain_alloc(struct device_node *of_node, 20void irq_domain_add(struct irq_domain *domain)
43 unsigned int revmap_type,
44 const struct irq_domain_ops *ops,
45 void *host_data)
46{
47 struct irq_domain *domain;
48
49 domain = kzalloc_node(sizeof(*domain), GFP_KERNEL,
50 of_node_to_nid(of_node));
51 if (WARN_ON(!domain))
52 return NULL;
53
54 /* Fill structure */
55 domain->revmap_type = revmap_type;
56 domain->ops = ops;
57 domain->host_data = host_data;
58 domain->of_node = of_node_get(of_node);
59
60 return domain;
61}
62
63static void irq_domain_free(struct irq_domain *domain)
64{ 21{
65 of_node_put(domain->of_node); 22 struct irq_data *d;
66 kfree(domain); 23 int hwirq;
67}
68
69static void irq_domain_add(struct irq_domain *domain)
70{
71 mutex_lock(&irq_domain_mutex);
72 list_add(&domain->link, &irq_domain_list);
73 mutex_unlock(&irq_domain_mutex);
74 pr_debug("Allocated domain of type %d @0x%p\n",
75 domain->revmap_type, domain);
76}
77
78/**
79 * irq_domain_remove() - Remove an irq domain.
80 * @domain: domain to remove
81 *
82 * This routine is used to remove an irq domain. The caller must ensure
83 * that all mappings within the domain have been disposed of prior to
84 * use, depending on the revmap type.
85 */
86void irq_domain_remove(struct irq_domain *domain)
87{
88 mutex_lock(&irq_domain_mutex);
89
90 switch (domain->revmap_type) {
91 case IRQ_DOMAIN_MAP_LEGACY:
92 /*
93 * Legacy domains don't manage their own irq_desc
94 * allocations, we expect the caller to handle irq_desc
95 * freeing on their own.
96 */
97 break;
98 case IRQ_DOMAIN_MAP_TREE:
99 /*
100 * radix_tree_delete() takes care of destroying the root
101 * node when all entries are removed. Shout if there are
102 * any mappings left.
103 */
104 WARN_ON(domain->revmap_data.tree.height);
105 break;
106 case IRQ_DOMAIN_MAP_LINEAR:
107 kfree(domain->revmap_data.linear.revmap);
108 domain->revmap_data.linear.size = 0;
109 break;
110 case IRQ_DOMAIN_MAP_NOMAP:
111 break;
112 }
113
114 list_del(&domain->link);
115 24
116 /* 25 /*
117 * If the going away domain is the default one, reset it. 26 * This assumes that the irq_domain owner has already allocated
27 * the irq_descs. This block will be removed when support for dynamic
28 * allocation of irq_descs is added to irq_domain.
118 */ 29 */
119 if (unlikely(irq_default_domain == domain)) 30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
120 irq_set_default_host(NULL); 31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
121 32 if (!d) {
122 mutex_unlock(&irq_domain_mutex); 33 WARN(1, "error: assigning domain to non existant irq_desc");
123 34 return;
124 pr_debug("Removed domain of type %d @0x%p\n", 35 }
125 domain->revmap_type, domain); 36 if (d->domain) {
126 37 /* things are broken; just report, don't clean up */
127 irq_domain_free(domain); 38 WARN(1, "error: irq_desc already assigned to a domain");
128} 39 return;
129EXPORT_SYMBOL_GPL(irq_domain_remove);
130
131static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
132 irq_hw_number_t hwirq)
133{
134 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
135 int size = domain->revmap_data.legacy.size;
136
137 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
138 return 0;
139 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
140}
141
142/**
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain.
144 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain
147 * @ops: map/unmap domain callbacks
148 * @host_data: Controller private data pointer
149 *
150 * Allocates a legacy irq_domain if irq_base is positive or a linear
151 * domain otherwise. For the legacy domain, IRQ descriptors will also
152 * be allocated.
153 *
154 * This is intended to implement the expected behaviour for most
155 * interrupt controllers which is that a linear mapping should
156 * normally be used unless the system requires a legacy mapping in
157 * order to support supplying interrupt numbers during non-DT
158 * registration of devices.
159 */
160struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
161 unsigned int size,
162 unsigned int first_irq,
163 const struct irq_domain_ops *ops,
164 void *host_data)
165{
166 if (first_irq > 0) {
167 int irq_base;
168
169 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
170 /*
171 * Set the descriptor allocator to search for a
172 * 1-to-1 mapping, such as irq_alloc_desc_at().
173 * Use of_node_to_nid() which is defined to
174 * numa_node_id() on platforms that have no custom
175 * implementation.
176 */
177 irq_base = irq_alloc_descs(first_irq, first_irq, size,
178 of_node_to_nid(of_node));
179 if (irq_base < 0) {
180 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
181 first_irq);
182 irq_base = first_irq;
183 }
184 } else
185 irq_base = first_irq;
186
187 return irq_domain_add_legacy(of_node, size, irq_base, 0,
188 ops, host_data);
189 }
190
191 /* A linear domain is the default */
192 return irq_domain_add_linear(of_node, size, ops, host_data);
193}
194
195/**
196 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
197 * @of_node: pointer to interrupt controller's device tree node.
198 * @size: total number of irqs in legacy mapping
199 * @first_irq: first number of irq block assigned to the domain
200 * @first_hwirq: first hwirq number to use for the translation. Should normally
201 * be '0', but a positive integer can be used if the effective
202 * hwirqs numbering does not begin at zero.
203 * @ops: map/unmap domain callbacks
204 * @host_data: Controller private data pointer
205 *
206 * Note: the map() callback will be called before this function returns
207 * for all legacy interrupts except 0 (which is always the invalid irq for
208 * a legacy controller).
209 */
210struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
211 unsigned int size,
212 unsigned int first_irq,
213 irq_hw_number_t first_hwirq,
214 const struct irq_domain_ops *ops,
215 void *host_data)
216{
217 struct irq_domain *domain;
218 unsigned int i;
219
220 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data);
221 if (!domain)
222 return NULL;
223
224 domain->revmap_data.legacy.first_irq = first_irq;
225 domain->revmap_data.legacy.first_hwirq = first_hwirq;
226 domain->revmap_data.legacy.size = size;
227
228 mutex_lock(&irq_domain_mutex);
229 /* Verify that all the irqs are available */
230 for (i = 0; i < size; i++) {
231 int irq = first_irq + i;
232 struct irq_data *irq_data = irq_get_irq_data(irq);
233
234 if (WARN_ON(!irq_data || irq_data->domain)) {
235 mutex_unlock(&irq_domain_mutex);
236 irq_domain_free(domain);
237 return NULL;
238 } 40 }
41 d->domain = domain;
42 d->hwirq = hwirq;
239 } 43 }
240 44
241 /* Claim all of the irqs before registering a legacy domain */ 45 mutex_lock(&irq_domain_mutex);
242 for (i = 0; i < size; i++) { 46 list_add(&domain->list, &irq_domain_list);
243 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
244 irq_data->hwirq = first_hwirq + i;
245 irq_data->domain = domain;
246 }
247 mutex_unlock(&irq_domain_mutex); 47 mutex_unlock(&irq_domain_mutex);
248
249 for (i = 0; i < size; i++) {
250 int irq = first_irq + i;
251 int hwirq = first_hwirq + i;
252
253 /* IRQ0 gets ignored */
254 if (!irq)
255 continue;
256
257 /* Legacy flags are left to default at this point,
258 * one can then use irq_create_mapping() to
259 * explicitly change them
260 */
261 if (ops->map)
262 ops->map(domain, irq, hwirq);
263
264 /* Clear norequest flags */
265 irq_clear_status_flags(irq, IRQ_NOREQUEST);
266 }
267
268 irq_domain_add(domain);
269 return domain;
270}
271EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
272
273/**
274 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
275 * @of_node: pointer to interrupt controller's device tree node.
276 * @size: Number of interrupts in the domain.
277 * @ops: map/unmap domain callbacks
278 * @host_data: Controller private data pointer
279 */
280struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
281 unsigned int size,
282 const struct irq_domain_ops *ops,
283 void *host_data)
284{
285 struct irq_domain *domain;
286 unsigned int *revmap;
287
288 revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
289 of_node_to_nid(of_node));
290 if (WARN_ON(!revmap))
291 return NULL;
292
293 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
294 if (!domain) {
295 kfree(revmap);
296 return NULL;
297 }
298 domain->revmap_data.linear.size = size;
299 domain->revmap_data.linear.revmap = revmap;
300 irq_domain_add(domain);
301 return domain;
302} 48}
303EXPORT_SYMBOL_GPL(irq_domain_add_linear);
304
305struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
306 unsigned int max_irq,
307 const struct irq_domain_ops *ops,
308 void *host_data)
309{
310 struct irq_domain *domain = irq_domain_alloc(of_node,
311 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
312 if (domain) {
313 domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0;
314 irq_domain_add(domain);
315 }
316 return domain;
317}
318EXPORT_SYMBOL_GPL(irq_domain_add_nomap);
319 49
320/** 50/**
321 * irq_domain_add_tree() 51 * irq_domain_del() - Unregister an irq_domain
322 * @of_node: pointer to interrupt controller's device tree node. 52 * @domain: ptr to registered irq_domain.
323 * @ops: map/unmap domain callbacks
324 *
325 * Note: The radix tree will be allocated later during boot automatically
326 * (the reverse mapping will use the slow path until that happens).
327 */ 53 */
328struct irq_domain *irq_domain_add_tree(struct device_node *of_node, 54void irq_domain_del(struct irq_domain *domain)
329 const struct irq_domain_ops *ops,
330 void *host_data)
331{ 55{
332 struct irq_domain *domain = irq_domain_alloc(of_node, 56 struct irq_data *d;
333 IRQ_DOMAIN_MAP_TREE, ops, host_data); 57 int hwirq;
334 if (domain) {
335 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
336 irq_domain_add(domain);
337 }
338 return domain;
339}
340EXPORT_SYMBOL_GPL(irq_domain_add_tree);
341 58
342/**
343 * irq_find_host() - Locates a domain for a given device node
344 * @node: device-tree node of the interrupt controller
345 */
346struct irq_domain *irq_find_host(struct device_node *node)
347{
348 struct irq_domain *h, *found = NULL;
349 int rc;
350
351 /* We might want to match the legacy controller last since
352 * it might potentially be set to match all interrupts in
353 * the absence of a device node. This isn't a problem so far
354 * yet though...
355 */
356 mutex_lock(&irq_domain_mutex); 59 mutex_lock(&irq_domain_mutex);
357 list_for_each_entry(h, &irq_domain_list, link) { 60 list_del(&domain->list);
358 if (h->ops->match)
359 rc = h->ops->match(h, node);
360 else
361 rc = (h->of_node != NULL) && (h->of_node == node);
362
363 if (rc) {
364 found = h;
365 break;
366 }
367 }
368 mutex_unlock(&irq_domain_mutex); 61 mutex_unlock(&irq_domain_mutex);
369 return found;
370}
371EXPORT_SYMBOL_GPL(irq_find_host);
372
373/**
374 * irq_set_default_host() - Set a "default" irq domain
375 * @domain: default domain pointer
376 *
377 * For convenience, it's possible to set a "default" domain that will be used
378 * whenever NULL is passed to irq_create_mapping(). It makes life easier for
379 * platforms that want to manipulate a few hard coded interrupt numbers that
380 * aren't properly represented in the device-tree.
381 */
382void irq_set_default_host(struct irq_domain *domain)
383{
384 pr_debug("Default domain set to @0x%p\n", domain);
385
386 irq_default_domain = domain;
387}
388EXPORT_SYMBOL_GPL(irq_set_default_host);
389
390static void irq_domain_disassociate_many(struct irq_domain *domain,
391 unsigned int irq_base, int count)
392{
393 /*
394 * disassociate in reverse order;
395 * not strictly necessary, but nice for unwinding
396 */
397 while (count--) {
398 int irq = irq_base + count;
399 struct irq_data *irq_data = irq_get_irq_data(irq);
400 irq_hw_number_t hwirq = irq_data->hwirq;
401
402 if (WARN_ON(!irq_data || irq_data->domain != domain))
403 continue;
404
405 irq_set_status_flags(irq, IRQ_NOREQUEST);
406
407 /* remove chip and handler */
408 irq_set_chip_and_handler(irq, NULL, NULL);
409
410 /* Make sure it's completed */
411 synchronize_irq(irq);
412
413 /* Tell the PIC about it */
414 if (domain->ops->unmap)
415 domain->ops->unmap(domain, irq);
416 smp_mb();
417
418 irq_data->domain = NULL;
419 irq_data->hwirq = 0;
420
421 /* Clear reverse map */
422 switch(domain->revmap_type) {
423 case IRQ_DOMAIN_MAP_LINEAR:
424 if (hwirq < domain->revmap_data.linear.size)
425 domain->revmap_data.linear.revmap[hwirq] = 0;
426 break;
427 case IRQ_DOMAIN_MAP_TREE:
428 mutex_lock(&revmap_trees_mutex);
429 radix_tree_delete(&domain->revmap_data.tree, hwirq);
430 mutex_unlock(&revmap_trees_mutex);
431 break;
432 }
433 }
434}
435
436int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
437 irq_hw_number_t hwirq_base, int count)
438{
439 unsigned int virq = irq_base;
440 irq_hw_number_t hwirq = hwirq_base;
441 int i, ret;
442
443 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
444 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
445
446 for (i = 0; i < count; i++) {
447 struct irq_data *irq_data = irq_get_irq_data(virq + i);
448
449 if (WARN(!irq_data, "error: irq_desc not allocated; "
450 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
451 return -EINVAL;
452 if (WARN(irq_data->domain, "error: irq_desc already associated; "
453 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
454 return -EINVAL;
455 };
456
457 for (i = 0; i < count; i++, virq++, hwirq++) {
458 struct irq_data *irq_data = irq_get_irq_data(virq);
459
460 irq_data->hwirq = hwirq;
461 irq_data->domain = domain;
462 if (domain->ops->map) {
463 ret = domain->ops->map(domain, virq, hwirq);
464 if (ret != 0) {
465 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
466 virq, hwirq, ret);
467 WARN_ON(1);
468 irq_data->domain = NULL;
469 irq_data->hwirq = 0;
470 goto err_unmap;
471 }
472 }
473
474 switch (domain->revmap_type) {
475 case IRQ_DOMAIN_MAP_LINEAR:
476 if (hwirq < domain->revmap_data.linear.size)
477 domain->revmap_data.linear.revmap[hwirq] = virq;
478 break;
479 case IRQ_DOMAIN_MAP_TREE:
480 mutex_lock(&revmap_trees_mutex);
481 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
482 mutex_unlock(&revmap_trees_mutex);
483 break;
484 }
485
486 irq_clear_status_flags(virq, IRQ_NOREQUEST);
487 }
488
489 return 0;
490
491 err_unmap:
492 irq_domain_disassociate_many(domain, irq_base, i);
493 return -EINVAL;
494}
495EXPORT_SYMBOL_GPL(irq_domain_associate_many);
496
497/**
498 * irq_create_direct_mapping() - Allocate an irq for direct mapping
499 * @domain: domain to allocate the irq for or NULL for default domain
500 *
501 * This routine is used for irq controllers which can choose the hardware
502 * interrupt numbers they generate. In such a case it's simplest to use
503 * the linux irq as the hardware interrupt number.
504 */
505unsigned int irq_create_direct_mapping(struct irq_domain *domain)
506{
507 unsigned int virq;
508
509 if (domain == NULL)
510 domain = irq_default_domain;
511
512 if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
513 return 0;
514
515 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
516 if (!virq) {
517 pr_debug("create_direct virq allocation failed\n");
518 return 0;
519 }
520 if (virq >= domain->revmap_data.nomap.max_irq) {
521 pr_err("ERROR: no free irqs available below %i maximum\n",
522 domain->revmap_data.nomap.max_irq);
523 irq_free_desc(virq);
524 return 0;
525 }
526 pr_debug("create_direct obtained virq %d\n", virq);
527 62
528 if (irq_domain_associate(domain, virq, virq)) { 63 /* Clear the irq_domain assignments */
529 irq_free_desc(virq); 64 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
530 return 0; 65 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
66 d->domain = NULL;
531 } 67 }
532
533 return virq;
534}
535EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
536
537/**
538 * irq_create_mapping() - Map a hardware interrupt into linux irq space
539 * @domain: domain owning this hardware interrupt or NULL for default domain
540 * @hwirq: hardware irq number in that domain space
541 *
542 * Only one mapping per hardware interrupt is permitted. Returns a linux
543 * irq number.
544 * If the sense/trigger is to be specified, set_irq_type() should be called
545 * on the number returned from that call.
546 */
547unsigned int irq_create_mapping(struct irq_domain *domain,
548 irq_hw_number_t hwirq)
549{
550 unsigned int hint;
551 int virq;
552
553 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
554
555 /* Look for default domain if nececssary */
556 if (domain == NULL)
557 domain = irq_default_domain;
558 if (domain == NULL) {
559 pr_warning("irq_create_mapping called for"
560 " NULL domain, hwirq=%lx\n", hwirq);
561 WARN_ON(1);
562 return 0;
563 }
564 pr_debug("-> using domain @%p\n", domain);
565
566 /* Check if mapping already exists */
567 virq = irq_find_mapping(domain, hwirq);
568 if (virq) {
569 pr_debug("-> existing mapping on virq %d\n", virq);
570 return virq;
571 }
572
573 /* Get a virtual interrupt number */
574 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
575 return irq_domain_legacy_revmap(domain, hwirq);
576
577 /* Allocate a virtual interrupt number */
578 hint = hwirq % nr_irqs;
579 if (hint == 0)
580 hint++;
581 virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
582 if (virq <= 0)
583 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
584 if (virq <= 0) {
585 pr_debug("-> virq allocation failed\n");
586 return 0;
587 }
588
589 if (irq_domain_associate(domain, virq, hwirq)) {
590 irq_free_desc(virq);
591 return 0;
592 }
593
594 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
595 hwirq, of_node_full_name(domain->of_node), virq);
596
597 return virq;
598} 68}
599EXPORT_SYMBOL_GPL(irq_create_mapping);
600 69
70#if defined(CONFIG_OF_IRQ)
601/** 71/**
602 * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs 72 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec
603 * @domain: domain owning the interrupt range
604 * @irq_base: beginning of linux IRQ range
605 * @hwirq_base: beginning of hardware IRQ range
606 * @count: Number of interrupts to map
607 * 73 *
608 * This routine is used for allocating and mapping a range of hardware 74 * Used by the device tree interrupt mapping code to translate a device tree
609 * irqs to linux irqs where the linux irq numbers are at pre-defined 75 * interrupt specifier to a valid linux irq number. Returns either a valid
610 * locations. For use by controllers that already have static mappings 76 * linux IRQ number or 0.
611 * to insert in to the domain.
612 * 77 *
613 * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time 78 * When the caller no longer need the irq number returned by this function it
614 * domain insertion. 79 * should arrange to call irq_dispose_mapping().
615 *
616 * 0 is returned upon success, while any failure to establish a static
617 * mapping is treated as an error.
618 */ 80 */
619int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
620 irq_hw_number_t hwirq_base, int count)
621{
622 int ret;
623
624 ret = irq_alloc_descs(irq_base, irq_base, count,
625 of_node_to_nid(domain->of_node));
626 if (unlikely(ret < 0))
627 return ret;
628
629 ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count);
630 if (unlikely(ret < 0)) {
631 irq_free_descs(irq_base, count);
632 return ret;
633 }
634
635 return 0;
636}
637EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
638
639unsigned int irq_create_of_mapping(struct device_node *controller, 81unsigned int irq_create_of_mapping(struct device_node *controller,
640 const u32 *intspec, unsigned int intsize) 82 const u32 *intspec, unsigned int intsize)
641{ 83{
642 struct irq_domain *domain; 84 struct irq_domain *domain;
643 irq_hw_number_t hwirq; 85 unsigned long hwirq;
644 unsigned int type = IRQ_TYPE_NONE; 86 unsigned int irq, type;
645 unsigned int virq; 87 int rc = -EINVAL;
646 88
647 domain = controller ? irq_find_host(controller) : irq_default_domain; 89 /* Find a domain which can translate the irq spec */
648 if (!domain) { 90 mutex_lock(&irq_domain_mutex);
649#ifdef CONFIG_MIPS 91 list_for_each_entry(domain, &irq_domain_list, list) {
650 /* 92 if (!domain->ops->dt_translate)
651 * Workaround to avoid breaking interrupt controller drivers 93 continue;
652 * that don't yet register an irq_domain. This is temporary 94 rc = domain->ops->dt_translate(domain, controller,
653 * code. ~~~gcl, Feb 24, 2012 95 intspec, intsize, &hwirq, &type);
654 * 96 if (rc == 0)
655 * Scheduled for removal in Linux v3.6. That should be enough 97 break;
656 * time.
657 */
658 if (intsize > 0)
659 return intspec[0];
660#endif
661 pr_warning("no irq domain found for %s !\n",
662 of_node_full_name(controller));
663 return 0;
664 }
665
666 /* If domain has no translation, then we assume interrupt line */
667 if (domain->ops->xlate == NULL)
668 hwirq = intspec[0];
669 else {
670 if (domain->ops->xlate(domain, controller, intspec, intsize,
671 &hwirq, &type))
672 return 0;
673 } 98 }
99 mutex_unlock(&irq_domain_mutex);
674 100
675 /* Create mapping */ 101 if (rc != 0)
676 virq = irq_create_mapping(domain, hwirq);
677 if (!virq)
678 return virq;
679
680 /* Set type if specified and different than the current one */
681 if (type != IRQ_TYPE_NONE &&
682 type != (irqd_get_trigger_type(irq_get_irq_data(virq))))
683 irq_set_irq_type(virq, type);
684 return virq;
685}
686EXPORT_SYMBOL_GPL(irq_create_of_mapping);
687
688/**
689 * irq_dispose_mapping() - Unmap an interrupt
690 * @virq: linux irq number of the interrupt to unmap
691 */
692void irq_dispose_mapping(unsigned int virq)
693{
694 struct irq_data *irq_data = irq_get_irq_data(virq);
695 struct irq_domain *domain;
696
697 if (!virq || !irq_data)
698 return;
699
700 domain = irq_data->domain;
701 if (WARN_ON(domain == NULL))
702 return;
703
704 /* Never unmap legacy interrupts */
705 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
706 return;
707
708 irq_domain_disassociate_many(domain, virq, 1);
709 irq_free_desc(virq);
710}
711EXPORT_SYMBOL_GPL(irq_dispose_mapping);
712
713/**
714 * irq_find_mapping() - Find a linux irq from an hw irq number.
715 * @domain: domain owning this hardware interrupt
716 * @hwirq: hardware irq number in that domain space
717 */
718unsigned int irq_find_mapping(struct irq_domain *domain,
719 irq_hw_number_t hwirq)
720{
721 struct irq_data *data;
722
723 /* Look for default domain if nececssary */
724 if (domain == NULL)
725 domain = irq_default_domain;
726 if (domain == NULL)
727 return 0; 102 return 0;
728 103
729 switch (domain->revmap_type) { 104 irq = irq_domain_to_irq(domain, hwirq);
730 case IRQ_DOMAIN_MAP_LEGACY: 105 if (type != IRQ_TYPE_NONE)
731 return irq_domain_legacy_revmap(domain, hwirq); 106 irq_set_irq_type(irq, type);
732 case IRQ_DOMAIN_MAP_LINEAR: 107 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n",
733 return irq_linear_revmap(domain, hwirq); 108 controller->full_name, (int)hwirq, irq, type);
734 case IRQ_DOMAIN_MAP_TREE: 109 return irq;
735 rcu_read_lock();
736 data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
737 rcu_read_unlock();
738 if (data)
739 return data->irq;
740 break;
741 case IRQ_DOMAIN_MAP_NOMAP:
742 data = irq_get_irq_data(hwirq);
743 if (data && (data->domain == domain) && (data->hwirq == hwirq))
744 return hwirq;
745 break;
746 }
747
748 return 0;
749} 110}
750EXPORT_SYMBOL_GPL(irq_find_mapping); 111EXPORT_SYMBOL_GPL(irq_create_of_mapping);
751 112
752/** 113/**
753 * irq_linear_revmap() - Find a linux irq from a hw irq number. 114 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping()
754 * @domain: domain owning this hardware interrupt 115 * @irq: linux irq number to be discarded
755 * @hwirq: hardware irq number in that domain space
756 * 116 *
757 * This is a fast path that can be called directly by irq controller code to 117 * Calling this function indicates the caller no longer needs a reference to
758 * save a handful of instructions. 118 * the linux irq number returned by a prior call to irq_create_of_mapping().
759 */ 119 */
760unsigned int irq_linear_revmap(struct irq_domain *domain, 120void irq_dispose_mapping(unsigned int irq)
761 irq_hw_number_t hwirq)
762{ 121{
763 BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); 122 /*
764 123 * nothing yet; will be filled when support for dynamic allocation of
765 /* Check revmap bounds; complain if exceeded */ 124 * irq_descs is added to irq_domain
766 if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) 125 */
767 return 0;
768
769 return domain->revmap_data.linear.revmap[hwirq];
770} 126}
771EXPORT_SYMBOL_GPL(irq_linear_revmap); 127EXPORT_SYMBOL_GPL(irq_dispose_mapping);
772 128
773#ifdef CONFIG_IRQ_DOMAIN_DEBUG 129int irq_domain_simple_dt_translate(struct irq_domain *d,
774static int virq_debug_show(struct seq_file *m, void *private) 130 struct device_node *controller,
131 const u32 *intspec, unsigned int intsize,
132 unsigned long *out_hwirq, unsigned int *out_type)
775{ 133{
776 unsigned long flags; 134 if (d->of_node != controller)
777 struct irq_desc *desc; 135 return -EINVAL;
778 const char *p; 136 if (intsize < 1)
779 static const char none[] = "none"; 137 return -EINVAL;
780 void *data;
781 int i;
782
783 seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq",
784 "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
785 "domain name");
786
787 for (i = 1; i < nr_irqs; i++) {
788 desc = irq_to_desc(i);
789 if (!desc)
790 continue;
791
792 raw_spin_lock_irqsave(&desc->lock, flags);
793
794 if (desc->action && desc->action->handler) {
795 struct irq_chip *chip;
796
797 seq_printf(m, "%5d ", i);
798 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq);
799
800 chip = irq_desc_get_chip(desc);
801 if (chip && chip->name)
802 p = chip->name;
803 else
804 p = none;
805 seq_printf(m, "%-15s ", p);
806
807 data = irq_desc_get_chip_data(desc);
808 seq_printf(m, data ? "0x%p " : " %p ", data);
809
810 if (desc->irq_data.domain)
811 p = of_node_full_name(desc->irq_data.domain->of_node);
812 else
813 p = none;
814 seq_printf(m, "%s\n", p);
815 }
816
817 raw_spin_unlock_irqrestore(&desc->lock, flags);
818 }
819 138
139 *out_hwirq = intspec[0];
140 *out_type = IRQ_TYPE_NONE;
141 if (intsize > 1)
142 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
820 return 0; 143 return 0;
821} 144}
822 145
823static int virq_debug_open(struct inode *inode, struct file *file) 146struct irq_domain_ops irq_domain_simple_ops = {
824{ 147 .dt_translate = irq_domain_simple_dt_translate,
825 return single_open(file, virq_debug_show, inode->i_private);
826}
827
828static const struct file_operations virq_debug_fops = {
829 .open = virq_debug_open,
830 .read = seq_read,
831 .llseek = seq_lseek,
832 .release = single_release,
833}; 148};
834 149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
835static int __init irq_debugfs_init(void)
836{
837 if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
838 NULL, &virq_debug_fops) == NULL)
839 return -ENOMEM;
840
841 return 0;
842}
843__initcall(irq_debugfs_init);
844#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
845 150
846/** 151/**
847 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings 152 * irq_domain_create_simple() - Set up a 'simple' translation range
848 *
849 * Device Tree IRQ specifier translation function which works with one cell
850 * bindings where the cell value maps directly to the hwirq number.
851 */ 153 */
852int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, 154void irq_domain_add_simple(struct device_node *controller, int irq_base)
853 const u32 *intspec, unsigned int intsize,
854 unsigned long *out_hwirq, unsigned int *out_type)
855{ 155{
856 if (WARN_ON(intsize < 1)) 156 struct irq_domain *domain;
857 return -EINVAL;
858 *out_hwirq = intspec[0];
859 *out_type = IRQ_TYPE_NONE;
860 return 0;
861}
862EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
863 157
864/** 158 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
865 * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings 159 if (!domain) {
866 * 160 WARN_ON(1);
867 * Device Tree IRQ specifier translation function which works with two cell 161 return;
868 * bindings where the cell values map directly to the hwirq number 162 }
869 * and linux irq flags.
870 */
871int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
872 const u32 *intspec, unsigned int intsize,
873 irq_hw_number_t *out_hwirq, unsigned int *out_type)
874{
875 if (WARN_ON(intsize < 2))
876 return -EINVAL;
877 *out_hwirq = intspec[0];
878 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
879 return 0;
880}
881EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
882 163
883/** 164 domain->irq_base = irq_base;
884 * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings 165 domain->of_node = of_node_get(controller);
885 * 166 domain->ops = &irq_domain_simple_ops;
886 * Device Tree IRQ specifier translation function which works with either one 167 irq_domain_add(domain);
887 * or two cell bindings where the cell values map directly to the hwirq number
888 * and linux irq flags.
889 *
890 * Note: don't use this function unless your interrupt controller explicitly
891 * supports both one and two cell bindings. For the majority of controllers
892 * the _onecell() or _twocell() variants above should be used.
893 */
894int irq_domain_xlate_onetwocell(struct irq_domain *d,
895 struct device_node *ctrlr,
896 const u32 *intspec, unsigned int intsize,
897 unsigned long *out_hwirq, unsigned int *out_type)
898{
899 if (WARN_ON(intsize < 1))
900 return -EINVAL;
901 *out_hwirq = intspec[0];
902 *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
903 return 0;
904} 168}
905EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); 169EXPORT_SYMBOL_GPL(irq_domain_add_simple);
906
907const struct irq_domain_ops irq_domain_simple_ops = {
908 .xlate = irq_domain_xlate_onetwocell,
909};
910EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
911 170
912#ifdef CONFIG_OF_IRQ
913void irq_domain_generate_simple(const struct of_device_id *match, 171void irq_domain_generate_simple(const struct of_device_id *match,
914 u64 phys_base, unsigned int irq_start) 172 u64 phys_base, unsigned int irq_start)
915{ 173{
916 struct device_node *node; 174 struct device_node *node;
917 pr_debug("looking for phys_base=%llx, irq_start=%i\n", 175 pr_info("looking for phys_base=%llx, irq_start=%i\n",
918 (unsigned long long) phys_base, (int) irq_start); 176 (unsigned long long) phys_base, (int) irq_start);
919 node = of_find_matching_node_by_address(NULL, match, phys_base); 177 node = of_find_matching_node_by_address(NULL, match, phys_base);
920 if (node) 178 if (node)
921 irq_domain_add_legacy(node, 32, irq_start, 0, 179 irq_domain_add_simple(node, irq_start);
922 &irq_domain_simple_ops, NULL); 180 else
181 pr_info("no node found\n");
923} 182}
924EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 183EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
925#endif 184#endif /* CONFIG_OF_IRQ */
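
Both sides of the irqdomain.c hunk above implement the same device-tree translation convention: cell 0 of the interrupt specifier is the hardware IRQ number, and an optional cell 1 carries the trigger type masked with IRQ_TYPE_SENSE_MASK. A minimal userspace sketch of that convention (constants mirrored from the kernel headers; this is an illustration, not the kernel implementation):

    #include <stdio.h>

    /* mirrored from include/linux/irq.h; illustration only */
    #define IRQ_TYPE_NONE        0x0
    #define IRQ_TYPE_SENSE_MASK  0xf

    /* one- or two-cell translation, as in irq_domain_xlate_onetwocell() /
     * irq_domain_simple_dt_translate() in the hunk above */
    static int xlate_onetwocell(const unsigned int *intspec, unsigned int intsize,
                                unsigned long *out_hwirq, unsigned int *out_type)
    {
            if (intsize < 1)
                    return -1;              /* the kernel returns -EINVAL */
            *out_hwirq = intspec[0];
            *out_type = (intsize > 1) ? (intspec[1] & IRQ_TYPE_SENSE_MASK)
                                      : IRQ_TYPE_NONE;
            return 0;
    }

    int main(void)
    {
            unsigned int spec[] = { 29, 0x4 };      /* hwirq 29, level-high sense */
            unsigned long hwirq;
            unsigned int type;

            if (!xlate_onetwocell(spec, 2, &hwirq, &type))
                    printf("hwirq=%lu type=0x%x\n", hwirq, type);
            return 0;
    }
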
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e49a288fa47..d6c4adc2804 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -7,8 +7,6 @@
7 * This file contains driver APIs to the irq subsystem. 7 * This file contains driver APIs to the irq subsystem.
8 */ 8 */
9 9
10#define pr_fmt(fmt) "genirq: " fmt
11
12#include <linux/irq.h> 10#include <linux/irq.h>
13#include <linux/kthread.h> 11#include <linux/kthread.h>
14#include <linux/module.h> 12#include <linux/module.h>
@@ -16,7 +14,6 @@
16#include <linux/interrupt.h> 14#include <linux/interrupt.h>
17#include <linux/slab.h> 15#include <linux/slab.h>
18#include <linux/sched.h> 16#include <linux/sched.h>
19#include <linux/task_work.h>
20 17
21#include "internals.h" 18#include "internals.h"
22 19
@@ -142,25 +139,6 @@ static inline void
142irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } 139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
143#endif 140#endif
144 141
145int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
146 bool force)
147{
148 struct irq_desc *desc = irq_data_to_desc(data);
149 struct irq_chip *chip = irq_data_get_irq_chip(data);
150 int ret;
151
152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
160
161 return ret;
162}
163
164int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) 142int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
165{ 143{
166 struct irq_chip *chip = irq_data_get_irq_chip(data); 144 struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -171,7 +149,14 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
171 return -EINVAL; 149 return -EINVAL;
172 150
173 if (irq_can_move_pcntxt(data)) { 151 if (irq_can_move_pcntxt(data)) {
174 ret = irq_do_set_affinity(data, mask, false); 152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
175 } else { 160 } else {
176 irqd_set_move_pending(data); 161 irqd_set_move_pending(data);
177 irq_copy_pending(desc, mask); 162 irq_copy_pending(desc, mask);
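
The affinity hunks above (and the setup_affinity() and irq_move_masked_irq() hunks further down) rely on the same deliberate switch fall-through: IRQ_SET_MASK_OK copies the new mask into irq_data and then falls through to the IRQ_SET_MASK_OK_NOCOPY case, which only propagates the mask to the handler thread. A standalone sketch of that pattern (plain C with hypothetical stand-in types, not kernel code):

    #include <stdio.h>

    enum { IRQ_SET_MASK_OK, IRQ_SET_MASK_OK_NOCOPY, IRQ_SET_MASK_ERR };

    struct fake_irq_data { unsigned long affinity; };

    static void set_thread_affinity(struct fake_irq_data *d)
    {
            printf("thread affinity follows 0x%lx\n", d->affinity);
    }

    static int do_set_affinity(struct fake_irq_data *d, unsigned long mask,
                               int chip_ret)
    {
            int ret = chip_ret;     /* what chip->irq_set_affinity() reported */

            switch (ret) {
            case IRQ_SET_MASK_OK:
                    d->affinity = mask;     /* core keeps the copy */
                    /* fall through */
            case IRQ_SET_MASK_OK_NOCOPY:    /* chip already stored it */
                    set_thread_affinity(d);
                    ret = 0;
            }
            return ret;
    }

    int main(void)
    {
            struct fake_irq_data d = { 0x1 };

            do_set_affinity(&d, 0xf, IRQ_SET_MASK_OK);
            do_set_affinity(&d, 0x3, IRQ_SET_MASK_OK_NOCOPY);
            return 0;
    }

The fall-through keeps the thread-affinity update in one place regardless of whether the chip or the core owns the cached mask, which is why the newer kernel factored it into irq_do_set_affinity().
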
@@ -210,7 +195,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
210int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
211{ 196{
212 unsigned long flags; 197 unsigned long flags;
213 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
214 199
215 if (!desc) 200 if (!desc)
216 return -EINVAL; 201 return -EINVAL;
@@ -295,8 +280,9 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
295static int 280static int
296setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) 281setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
297{ 282{
283 struct irq_chip *chip = irq_desc_get_chip(desc);
298 struct cpumask *set = irq_default_affinity; 284 struct cpumask *set = irq_default_affinity;
299 int node = desc->irq_data.node; 285 int ret;
300 286
301 /* Excludes PER_CPU and NO_BALANCE interrupts */ 287 /* Excludes PER_CPU and NO_BALANCE interrupts */
302 if (!irq_can_set_affinity(irq)) 288 if (!irq_can_set_affinity(irq))
@@ -315,14 +301,13 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
315 } 301 }
316 302
317 cpumask_and(mask, cpu_online_mask, set); 303 cpumask_and(mask, cpu_online_mask, set);
318 if (node != NUMA_NO_NODE) { 304 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
319 const struct cpumask *nodemask = cpumask_of_node(node); 305 switch (ret) {
320 306 case IRQ_SET_MASK_OK:
321 /* make sure at least one of the cpus in nodemask is online */ 307 cpumask_copy(desc->irq_data.affinity, mask);
322 if (cpumask_intersects(mask, nodemask)) 308 case IRQ_SET_MASK_OK_NOCOPY:
323 cpumask_and(mask, mask, nodemask); 309 irq_set_thread_affinity(desc);
324 } 310 }
325 irq_do_set_affinity(&desc->irq_data, mask, false);
326 return 0; 311 return 0;
327} 312}
328#else 313#else
@@ -371,7 +356,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
371static int __disable_irq_nosync(unsigned int irq) 356static int __disable_irq_nosync(unsigned int irq)
372{ 357{
373 unsigned long flags; 358 unsigned long flags;
374 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
375 360
376 if (!desc) 361 if (!desc)
377 return -EINVAL; 362 return -EINVAL;
@@ -463,7 +448,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
463void enable_irq(unsigned int irq) 448void enable_irq(unsigned int irq)
464{ 449{
465 unsigned long flags; 450 unsigned long flags;
466 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
467 452
468 if (!desc) 453 if (!desc)
469 return; 454 return;
@@ -482,9 +467,6 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
482 struct irq_desc *desc = irq_to_desc(irq); 467 struct irq_desc *desc = irq_to_desc(irq);
483 int ret = -ENXIO; 468 int ret = -ENXIO;
484 469
485 if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE)
486 return 0;
487
488 if (desc->irq_data.chip->irq_set_wake) 470 if (desc->irq_data.chip->irq_set_wake)
489 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); 471 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
490 472
@@ -506,7 +488,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
506int irq_set_irq_wake(unsigned int irq, unsigned int on) 488int irq_set_irq_wake(unsigned int irq, unsigned int on)
507{ 489{
508 unsigned long flags; 490 unsigned long flags;
509 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
510 int ret = 0; 492 int ret = 0;
511 493
512 if (!desc) 494 if (!desc)
@@ -547,7 +529,7 @@ EXPORT_SYMBOL(irq_set_irq_wake);
547int can_request_irq(unsigned int irq, unsigned long irqflags) 529int can_request_irq(unsigned int irq, unsigned long irqflags)
548{ 530{
549 unsigned long flags; 531 unsigned long flags;
550 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); 532 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
551 int canrequest = 0; 533 int canrequest = 0;
552 534
553 if (!desc) 535 if (!desc)
@@ -574,7 +556,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
574 * flow-types? 556 * flow-types?
575 */ 557 */
576 pr_debug("No set_type function for IRQ %d (%s)\n", irq, 558 pr_debug("No set_type function for IRQ %d (%s)\n", irq,
577 chip ? (chip->name ? : "unknown") : "unknown"); 559 chip ? (chip->name ? : "unknown") : "unknown");
578 return 0; 560 return 0;
579 } 561 }
580 562
@@ -608,7 +590,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
608 ret = 0; 590 ret = 0;
609 break; 591 break;
610 default: 592 default:
611 pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", 593 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
612 flags, irq, chip->irq_set_type); 594 flags, irq, chip->irq_set_type);
613 } 595 }
614 if (unmask) 596 if (unmask)
@@ -616,22 +598,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
616 return ret; 598 return ret;
617} 599}
618 600
619#ifdef CONFIG_HARDIRQS_SW_RESEND
620int irq_set_parent(int irq, int parent_irq)
621{
622 unsigned long flags;
623 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
624
625 if (!desc)
626 return -EINVAL;
627
628 desc->parent_irq = parent_irq;
629
630 irq_put_desc_unlock(desc, flags);
631 return 0;
632}
633#endif
634
635/* 601/*
636 * Default primary interrupt handler for threaded interrupts. Is 602 * Default primary interrupt handler for threaded interrupts. Is
637 * assigned as primary handler when request_threaded_irq is called 603 * assigned as primary handler when request_threaded_irq is called
@@ -676,7 +642,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
676 * is marked MASKED. 642 * is marked MASKED.
677 */ 643 */
678static void irq_finalize_oneshot(struct irq_desc *desc, 644static void irq_finalize_oneshot(struct irq_desc *desc,
679 struct irqaction *action) 645 struct irqaction *action, bool force)
680{ 646{
681 if (!(desc->istate & IRQS_ONESHOT)) 647 if (!(desc->istate & IRQS_ONESHOT))
682 return; 648 return;
@@ -710,7 +676,7 @@ again:
710 * we would clear the threads_oneshot bit of this thread which 676 * we would clear the threads_oneshot bit of this thread which
711 * was just set. 677 * was just set.
712 */ 678 */
713 if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) 679 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
714 goto out_unlock; 680 goto out_unlock;
715 681
716 desc->threads_oneshot &= ~action->thread_mask; 682 desc->threads_oneshot &= ~action->thread_mask;
@@ -732,7 +698,6 @@ static void
732irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 698irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
733{ 699{
734 cpumask_var_t mask; 700 cpumask_var_t mask;
735 bool valid = true;
736 701
737 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) 702 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
738 return; 703 return;
@@ -747,18 +712,10 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
747 } 712 }
748 713
749 raw_spin_lock_irq(&desc->lock); 714 raw_spin_lock_irq(&desc->lock);
750 /* 715 cpumask_copy(mask, desc->irq_data.affinity);
751 * This code is triggered unconditionally. Check the affinity
752 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
753 */
754 if (desc->irq_data.affinity)
755 cpumask_copy(mask, desc->irq_data.affinity);
756 else
757 valid = false;
758 raw_spin_unlock_irq(&desc->lock); 716 raw_spin_unlock_irq(&desc->lock);
759 717
760 if (valid) 718 set_cpus_allowed_ptr(current, mask);
761 set_cpus_allowed_ptr(current, mask);
762 free_cpumask_var(mask); 719 free_cpumask_var(mask);
763} 720}
764#else 721#else
@@ -779,7 +736,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
779 736
780 local_bh_disable(); 737 local_bh_disable();
781 ret = action->thread_fn(action->irq, action->dev_id); 738 ret = action->thread_fn(action->irq, action->dev_id);
782 irq_finalize_oneshot(desc, action); 739 irq_finalize_oneshot(desc, action, false);
783 local_bh_enable(); 740 local_bh_enable();
784 return ret; 741 return ret;
785} 742}
@@ -795,50 +752,15 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
795 irqreturn_t ret; 752 irqreturn_t ret;
796 753
797 ret = action->thread_fn(action->irq, action->dev_id); 754 ret = action->thread_fn(action->irq, action->dev_id);
798 irq_finalize_oneshot(desc, action); 755 irq_finalize_oneshot(desc, action, false);
799 return ret; 756 return ret;
800} 757}
801 758
802static void wake_threads_waitq(struct irq_desc *desc)
803{
804 if (atomic_dec_and_test(&desc->threads_active) &&
805 waitqueue_active(&desc->wait_for_threads))
806 wake_up(&desc->wait_for_threads);
807}
808
809static void irq_thread_dtor(struct callback_head *unused)
810{
811 struct task_struct *tsk = current;
812 struct irq_desc *desc;
813 struct irqaction *action;
814
815 if (WARN_ON_ONCE(!(current->flags & PF_EXITING)))
816 return;
817
818 action = kthread_data(tsk);
819
820 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
821 tsk->comm, tsk->pid, action->irq);
822
823
824 desc = irq_to_desc(action->irq);
825 /*
826 * If IRQTF_RUNTHREAD is set, we need to decrement
827 * desc->threads_active and wake possible waiters.
828 */
829 if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
830 wake_threads_waitq(desc);
831
832 /* Prevent a stale desc->threads_oneshot */
833 irq_finalize_oneshot(desc, action);
834}
835
836/* 759/*
837 * Interrupt handler thread 760 * Interrupt handler thread
838 */ 761 */
839static int irq_thread(void *data) 762static int irq_thread(void *data)
840{ 763{
841 struct callback_head on_exit_work;
842 static const struct sched_param param = { 764 static const struct sched_param param = {
843 .sched_priority = MAX_USER_RT_PRIO/2, 765 .sched_priority = MAX_USER_RT_PRIO/2,
844 }; 766 };
@@ -846,45 +768,90 @@ static int irq_thread(void *data)
846 struct irq_desc *desc = irq_to_desc(action->irq); 768 struct irq_desc *desc = irq_to_desc(action->irq);
847 irqreturn_t (*handler_fn)(struct irq_desc *desc, 769 irqreturn_t (*handler_fn)(struct irq_desc *desc,
848 struct irqaction *action); 770 struct irqaction *action);
771 int wake;
849 772
850 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, 773 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
851 &action->thread_flags)) 774 &action->thread_flags))
852 handler_fn = irq_forced_thread_fn; 775 handler_fn = irq_forced_thread_fn;
853 else 776 else
854 handler_fn = irq_thread_fn; 777 handler_fn = irq_thread_fn;
855 778
856 sched_setscheduler(current, SCHED_FIFO, &param); 779 sched_setscheduler(current, SCHED_FIFO, &param);
857 780 current->irqaction = action;
858 init_task_work(&on_exit_work, irq_thread_dtor);
859 task_work_add(current, &on_exit_work, false);
860
861 irq_thread_check_affinity(desc, action);
862 781
863 while (!irq_wait_for_interrupt(action)) { 782 while (!irq_wait_for_interrupt(action)) {
864 irqreturn_t action_ret;
865 783
866 irq_thread_check_affinity(desc, action); 784 irq_thread_check_affinity(desc, action);
867 785
868 action_ret = handler_fn(desc, action); 786 atomic_inc(&desc->threads_active);
869 if (!noirqdebug)
870 note_interrupt(action->irq, desc, action_ret);
871 787
872 wake_threads_waitq(desc); 788 raw_spin_lock_irq(&desc->lock);
789 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
790 /*
791 * CHECKME: We might need a dedicated
792 * IRQ_THREAD_PENDING flag here, which
793 * retriggers the thread in check_irq_resend()
794 * but AFAICT IRQS_PENDING should be fine as it
795 * retriggers the interrupt itself --- tglx
796 */
797 desc->istate |= IRQS_PENDING;
798 raw_spin_unlock_irq(&desc->lock);
799 } else {
800 irqreturn_t action_ret;
801
802 raw_spin_unlock_irq(&desc->lock);
803 action_ret = handler_fn(desc, action);
804 if (!noirqdebug)
805 note_interrupt(action->irq, desc, action_ret);
806 }
807
808 wake = atomic_dec_and_test(&desc->threads_active);
809
810 if (wake && waitqueue_active(&desc->wait_for_threads))
811 wake_up(&desc->wait_for_threads);
873 } 812 }
874 813
814 /* Prevent a stale desc->threads_oneshot */
815 irq_finalize_oneshot(desc, action, true);
816
875 /* 817 /*
876 * This is the regular exit path. __free_irq() is stopping the 818 * Clear irqaction. Otherwise exit_irq_thread() would make
877 * thread via kthread_stop() after calling 819 * fuzz about an active irq thread going into nirvana.
878 * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the
879 * oneshot mask bit can be set. We cannot verify that as we
880 * cannot touch the oneshot mask at this point anymore as
881 * __setup_irq() might have given out currents thread_mask
882 * again.
883 */ 820 */
884 task_work_cancel(current, irq_thread_dtor); 821 current->irqaction = NULL;
885 return 0; 822 return 0;
886} 823}
887 824
825/*
826 * Called from do_exit()
827 */
828void exit_irq_thread(void)
829{
830 struct task_struct *tsk = current;
831 struct irq_desc *desc;
832
833 if (!tsk->irqaction)
834 return;
835
836 printk(KERN_ERR
837 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
838 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
839
840 desc = irq_to_desc(tsk->irqaction->irq);
841
842 /*
843 * Prevent a stale desc->threads_oneshot. Must be called
844 * before setting the IRQTF_DIED flag.
845 */
846 irq_finalize_oneshot(desc, tsk->irqaction, true);
847
848 /*
849 * Set the THREAD DIED flag to prevent further wakeups of the
850 * soon to be gone threaded handler.
851 */
852 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
853}
854
888static void irq_setup_forced_threading(struct irqaction *new) 855static void irq_setup_forced_threading(struct irqaction *new)
889{ 856{
890 if (!force_irqthreads) 857 if (!force_irqthreads)
@@ -909,6 +876,7 @@ static int
909__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) 876__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
910{ 877{
911 struct irqaction *old, **old_ptr; 878 struct irqaction *old, **old_ptr;
879 const char *old_name = NULL;
912 unsigned long flags, thread_mask = 0; 880 unsigned long flags, thread_mask = 0;
913 int ret, nested, shared = 0; 881 int ret, nested, shared = 0;
914 cpumask_var_t mask; 882 cpumask_var_t mask;
@@ -920,6 +888,22 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
920 return -ENOSYS; 888 return -ENOSYS;
921 if (!try_module_get(desc->owner)) 889 if (!try_module_get(desc->owner))
922 return -ENODEV; 890 return -ENODEV;
891 /*
892 * Some drivers like serial.c use request_irq() heavily,
893 * so we have to be careful not to interfere with a
894 * running system.
895 */
896 if (new->flags & IRQF_SAMPLE_RANDOM) {
897 /*
898 * This function might sleep, we want to call it first,
899 * outside of the atomic block.
900 * Yes, this might clear the entropy pool if the wrong
901 * driver is attempted to be loaded, without actually
902 * installing a new handler, but is this really a problem,
903 * only the sysadmin is able to do this.
904 */
905 rand_initialize_irq(irq);
906 }
923 907
924 /* 908 /*
925 * Check whether the interrupt nests into another interrupt 909 * Check whether the interrupt nests into another interrupt
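
IRQF_SAMPLE_RANDOM (right-hand side only; it was later removed upstream) is simply a request_irq() flag: a driver that treats its interrupt timing as an entropy source passes it along with the usual flags, and __setup_irq() calls rand_initialize_irq() before taking the descriptor lock because that call may sleep. A hedged sketch of such a request, with an invented device name, for the older tree shown on the right:

    #include <linux/interrupt.h>

    /* "foo" is an invented example device, not part of this patch */
    struct foo_dev { int irq; };

    static irqreturn_t foo_isr(int irq, void *dev_id)
    {
            /* acknowledge the device here */
            return IRQ_HANDLED;
    }

    static int foo_request(struct foo_dev *foo)
    {
            return request_irq(foo->irq, foo_isr,
                               IRQF_SHARED | IRQF_SAMPLE_RANDOM,
                               "foo", foo);
    }
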
@@ -963,16 +947,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
963 */ 947 */
964 get_task_struct(t); 948 get_task_struct(t);
965 new->thread = t; 949 new->thread = t;
966 /*
967 * Tell the thread to set its affinity. This is
968 * important for shared interrupt handlers as we do
969 * not invoke setup_affinity() for the secondary
970 * handlers as everything is already set up. Even for
971 * interrupts marked with IRQF_NO_BALANCE this is
972 * correct as we want the thread to move to the cpu(s)
973 * on which the requesting code placed the interrupt.
974 */
975 set_bit(IRQTF_AFFINITY, &new->thread_flags);
976 } 950 }
977 951
978 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { 952 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -981,18 +955,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
981 } 955 }
982 956
983 /* 957 /*
984 * Drivers are often written to work w/o knowledge about the
985 * underlying irq chip implementation, so a request for a
986 * threaded irq without a primary hard irq context handler
987 * requires the ONESHOT flag to be set. Some irq chips like
988 * MSI based interrupts are per se one shot safe. Check the
989 * chip flags, so we can avoid the unmask dance at the end of
990 * the threaded handler for those.
991 */
992 if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
993 new->flags &= ~IRQF_ONESHOT;
994
995 /*
996 * The following block of code has to be executed atomically 958 * The following block of code has to be executed atomically
997 */ 959 */
998 raw_spin_lock_irqsave(&desc->lock, flags); 960 raw_spin_lock_irqsave(&desc->lock, flags);
@@ -1008,8 +970,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1008 */ 970 */
1009 if (!((old->flags & new->flags) & IRQF_SHARED) || 971 if (!((old->flags & new->flags) & IRQF_SHARED) ||
1010 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || 972 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
1011 ((old->flags ^ new->flags) & IRQF_ONESHOT)) 973 ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
974 old_name = old->name;
1012 goto mismatch; 975 goto mismatch;
976 }
1013 977
1014 /* All handlers must agree on per-cpuness */ 978 /* All handlers must agree on per-cpuness */
1015 if ((old->flags & IRQF_PERCPU) != 979 if ((old->flags & IRQF_PERCPU) !=
@@ -1018,11 +982,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1018 982
1019 /* add new interrupt at end of irq queue */ 983 /* add new interrupt at end of irq queue */
1020 do { 984 do {
1021 /*
1022 * Or all existing action->thread_mask bits,
1023 * so we can find the next zero bit for this
1024 * new action.
1025 */
1026 thread_mask |= old->thread_mask; 985 thread_mask |= old->thread_mask;
1027 old_ptr = &old->next; 986 old_ptr = &old->next;
1028 old = *old_ptr; 987 old = *old_ptr;
@@ -1031,63 +990,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1031 } 990 }
1032 991
1033 /* 992 /*
1034 * Setup the thread mask for this irqaction for ONESHOT. For 993 * Setup the thread mask for this irqaction. Unlikely to have
1035 * !ONESHOT irqs the thread mask is 0 so we can avoid a 994 * 32 resp 64 irqs sharing one line, but who knows.
1036 * conditional in irq_wake_thread().
1037 */ 995 */
1038 if (new->flags & IRQF_ONESHOT) { 996 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
1039 /* 997 ret = -EBUSY;
1040 * Unlikely to have 32 resp 64 irqs sharing one line,
1041 * but who knows.
1042 */
1043 if (thread_mask == ~0UL) {
1044 ret = -EBUSY;
1045 goto out_mask;
1046 }
1047 /*
1048 * The thread_mask for the action is or'ed to
1049 * desc->thread_active to indicate that the
1050 * IRQF_ONESHOT thread handler has been woken, but not
1051 * yet finished. The bit is cleared when a thread
1052 * completes. When all threads of a shared interrupt
1053 * line have completed desc->threads_active becomes
1054 * zero and the interrupt line is unmasked. See
1055 * handle.c:irq_wake_thread() for further information.
1056 *
1057 * If no thread is woken by primary (hard irq context)
1058 * interrupt handlers, then desc->threads_active is
1059 * also checked for zero to unmask the irq line in the
1060 * affected hard irq flow handlers
1061 * (handle_[fasteoi|level]_irq).
1062 *
1063 * The new action gets the first zero bit of
1064 * thread_mask assigned. See the loop above which or's
1065 * all existing action->thread_mask bits.
1066 */
1067 new->thread_mask = 1 << ffz(thread_mask);
1068
1069 } else if (new->handler == irq_default_primary_handler &&
1070 !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
1071 /*
1072 * The interrupt was requested with handler = NULL, so
1073 * we use the default primary handler for it. But it
1074 * does not have the oneshot flag set. In combination
1075 * with level interrupts this is deadly, because the
1076 * default primary handler just wakes the thread, then
1077 * the irq lines is reenabled, but the device still
1078 * has the level irq asserted. Rinse and repeat....
1079 *
1080 * While this works for edge type interrupts, we play
1081 * it safe and reject unconditionally because we can't
1082 * say for sure which type this interrupt really
1083 * has. The type flags are unreliable as the
1084 * underlying chip implementation can override them.
1085 */
1086 pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
1087 irq);
1088 ret = -EINVAL;
1089 goto out_mask; 998 goto out_mask;
1090 } 999 }
1000 new->thread_mask = 1 << ffz(thread_mask);
1091 1001
1092 if (!shared) { 1002 if (!shared) {
1093 init_waitqueue_head(&desc->wait_for_threads); 1003 init_waitqueue_head(&desc->wait_for_threads);
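
Both sides hand a ONESHOT action on a shared line the first free bit of the combined thread_mask: the loop earlier in the hunk ORs together the masks of the already-installed actions, ~0UL means all 32 (or 64) slots are taken and the request fails with -EBUSY, and otherwise ffz() picks the lowest zero bit for the new action. A small userspace rendering of that allocation, with ffz() open-coded for the sketch:

    #include <stdio.h>

    /* find first zero bit, as the kernel's ffz() does for a not-all-ones word */
    static unsigned int ffz_sketch(unsigned long word)
    {
            unsigned int bit = 0;

            while (word & 1UL) {
                    word >>= 1;
                    bit++;
            }
            return bit;
    }

    int main(void)
    {
            unsigned long thread_mask = 0;          /* OR of existing actions */
            unsigned long existing[] = { 1UL << 0, 1UL << 1, 1UL << 3 };
            unsigned int i;

            for (i = 0; i < 3; i++)
                    thread_mask |= existing[i];

            if (thread_mask == ~0UL) {
                    puts("line full: -EBUSY");
                    return 1;
            }
            /* new->thread_mask = 1 << ffz(thread_mask) in __setup_irq() */
            printf("new thread_mask = 0x%lx\n", 1UL << ffz_sketch(thread_mask));
            return 0;
    }
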
@@ -1114,7 +1024,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1114 desc->istate |= IRQS_ONESHOT; 1024 desc->istate |= IRQS_ONESHOT;
1115 1025
1116 if (irq_settings_can_autoenable(desc)) 1026 if (irq_settings_can_autoenable(desc))
1117 irq_startup(desc, true); 1027 irq_startup(desc);
1118 else 1028 else
1119 /* Undo nested disables: */ 1029 /* Undo nested disables: */
1120 desc->depth = 1; 1030 desc->depth = 1;
@@ -1134,7 +1044,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1134 1044
1135 if (nmsk != omsk) 1045 if (nmsk != omsk)
1136 /* hope the handler works with current trigger mode */ 1046 /* hope the handler works with current trigger mode */
1137 pr_warning("irq %d uses trigger mode %u; requested %u\n", 1047 pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
1138 irq, nmsk, omsk); 1048 irq, nmsk, omsk);
1139 } 1049 }
1140 1050
@@ -1171,13 +1081,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1171 return 0; 1081 return 0;
1172 1082
1173mismatch: 1083mismatch:
1174 if (!(new->flags & IRQF_PROBE_SHARED)) {
1175 pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
1176 irq, new->flags, new->name, old->flags, old->name);
1177#ifdef CONFIG_DEBUG_SHIRQ 1084#ifdef CONFIG_DEBUG_SHIRQ
1085 if (!(new->flags & IRQF_PROBE_SHARED)) {
1086 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
1087 if (old_name)
1088 printk(KERN_ERR "current handler: %s\n", old_name);
1178 dump_stack(); 1089 dump_stack();
1179#endif
1180 } 1090 }
1091#endif
1181 ret = -EBUSY; 1092 ret = -EBUSY;
1182 1093
1183out_mask: 1094out_mask:
@@ -1189,7 +1100,8 @@ out_thread:
1189 struct task_struct *t = new->thread; 1100 struct task_struct *t = new->thread;
1190 1101
1191 new->thread = NULL; 1102 new->thread = NULL;
1192 kthread_stop(t); 1103 if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
1104 kthread_stop(t);
1193 put_task_struct(t); 1105 put_task_struct(t);
1194 } 1106 }
1195out_mput: 1107out_mput:
@@ -1209,8 +1121,6 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1209 int retval; 1121 int retval;
1210 struct irq_desc *desc = irq_to_desc(irq); 1122 struct irq_desc *desc = irq_to_desc(irq);
1211 1123
1212 if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1213 return -EINVAL;
1214 chip_bus_lock(desc); 1124 chip_bus_lock(desc);
1215 retval = __setup_irq(irq, desc, act); 1125 retval = __setup_irq(irq, desc, act);
1216 chip_bus_sync_unlock(desc); 1126 chip_bus_sync_unlock(desc);
@@ -1219,7 +1129,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1219} 1129}
1220EXPORT_SYMBOL_GPL(setup_irq); 1130EXPORT_SYMBOL_GPL(setup_irq);
1221 1131
1222/* 1132 /*
1223 * Internal function to unregister an irqaction - used to free 1133 * Internal function to unregister an irqaction - used to free
1224 * regular and special interrupts that are part of the architecture. 1134 * regular and special interrupts that are part of the architecture.
1225 */ 1135 */
@@ -1259,6 +1169,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1259 /* Found it - now remove it from the list of entries: */ 1169 /* Found it - now remove it from the list of entries: */
1260 *action_ptr = action->next; 1170 *action_ptr = action->next;
1261 1171
1172 /* Currently used only by UML, might disappear one day: */
1173#ifdef CONFIG_IRQ_RELEASE_METHOD
1174 if (desc->irq_data.chip->release)
1175 desc->irq_data.chip->release(irq, dev_id);
1176#endif
1177
1262 /* If this was the last handler, shut down the IRQ line: */ 1178 /* If this was the last handler, shut down the IRQ line: */
1263 if (!desc->action) 1179 if (!desc->action)
1264 irq_shutdown(desc); 1180 irq_shutdown(desc);
@@ -1293,7 +1209,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1293#endif 1209#endif
1294 1210
1295 if (action->thread) { 1211 if (action->thread) {
1296 kthread_stop(action->thread); 1212 if (!test_bit(IRQTF_DIED, &action->thread_flags))
1213 kthread_stop(action->thread);
1297 put_task_struct(action->thread); 1214 put_task_struct(action->thread);
1298 } 1215 }
1299 1216
@@ -1310,10 +1227,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1310 */ 1227 */
1311void remove_irq(unsigned int irq, struct irqaction *act) 1228void remove_irq(unsigned int irq, struct irqaction *act)
1312{ 1229{
1313 struct irq_desc *desc = irq_to_desc(irq); 1230 __free_irq(irq, act->dev_id);
1314
1315 if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1316 __free_irq(irq, act->dev_id);
1317} 1231}
1318EXPORT_SYMBOL_GPL(remove_irq); 1232EXPORT_SYMBOL_GPL(remove_irq);
1319 1233
@@ -1335,7 +1249,7 @@ void free_irq(unsigned int irq, void *dev_id)
1335{ 1249{
1336 struct irq_desc *desc = irq_to_desc(irq); 1250 struct irq_desc *desc = irq_to_desc(irq);
1337 1251
1338 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) 1252 if (!desc)
1339 return; 1253 return;
1340 1254
1341#ifdef CONFIG_SMP 1255#ifdef CONFIG_SMP
@@ -1370,7 +1284,7 @@ EXPORT_SYMBOL(free_irq);
1370 * and to set up the interrupt handler in the right order. 1284 * and to set up the interrupt handler in the right order.
1371 * 1285 *
1372 * If you want to set up a threaded irq handler for your device 1286 * If you want to set up a threaded irq handler for your device
1373 * then you need to supply @handler and @thread_fn. @handler is 1287 * then you need to supply @handler and @thread_fn. @handler ist
1374 * still called in hard interrupt context and has to check 1288 * still called in hard interrupt context and has to check
1375 * whether the interrupt originates from the device. If yes it 1289 * whether the interrupt originates from the device. If yes it
1376 * needs to disable the interrupt on the device and return 1290 * needs to disable the interrupt on the device and return
@@ -1388,6 +1302,7 @@ EXPORT_SYMBOL(free_irq);
1388 * Flags: 1302 * Flags:
1389 * 1303 *
1390 * IRQF_SHARED Interrupt is shared 1304 * IRQF_SHARED Interrupt is shared
1305 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1391 * IRQF_TRIGGER_* Specify active edge(s) or level 1306 * IRQF_TRIGGER_* Specify active edge(s) or level
1392 * 1307 *
1393 */ 1308 */
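
As the comment block above describes, a split handler registers both a hard-IRQ part and a thread function: the primary handler checks and quiesces the device and returns IRQ_WAKE_THREAD, and the thread function does the sleeping work. A minimal driver-side sketch, assuming a kernel build environment and an invented "foo" device that is not part of this patch:

    #include <linux/interrupt.h>
    #include <linux/io.h>

    /* invented example device; register offset is hypothetical */
    struct foo_dev {
            void __iomem *regs;
    };

    #define FOO_IRQ_MASK    0x04

    static irqreturn_t foo_hardirq(int irq, void *dev_id)
    {
            struct foo_dev *foo = dev_id;

            /* quiesce the (hypothetical) device, then defer to the thread */
            writel(0, foo->regs + FOO_IRQ_MASK);
            return IRQ_WAKE_THREAD;
    }

    static irqreturn_t foo_thread_fn(int irq, void *dev_id)
    {
            struct foo_dev *foo = dev_id;

            /* sleeping work is fine here */
            writel(1, foo->regs + FOO_IRQ_MASK);    /* re-enable when done */
            return IRQ_HANDLED;
    }

    static int foo_request(struct foo_dev *foo, int irq)
    {
            return request_threaded_irq(irq, foo_hardirq, foo_thread_fn,
                                        0, "foo", foo);
    }
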
@@ -1412,8 +1327,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1412 if (!desc) 1327 if (!desc)
1413 return -EINVAL; 1328 return -EINVAL;
1414 1329
1415 if (!irq_settings_can_request(desc) || 1330 if (!irq_settings_can_request(desc))
1416 WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1417 return -EINVAL; 1331 return -EINVAL;
1418 1332
1419 if (!handler) { 1333 if (!handler) {
@@ -1498,194 +1412,3 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1498 return !ret ? IRQC_IS_HARDIRQ : ret; 1412 return !ret ? IRQC_IS_HARDIRQ : ret;
1499} 1413}
1500EXPORT_SYMBOL_GPL(request_any_context_irq); 1414EXPORT_SYMBOL_GPL(request_any_context_irq);
1501
1502void enable_percpu_irq(unsigned int irq, unsigned int type)
1503{
1504 unsigned int cpu = smp_processor_id();
1505 unsigned long flags;
1506 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1507
1508 if (!desc)
1509 return;
1510
1511 type &= IRQ_TYPE_SENSE_MASK;
1512 if (type != IRQ_TYPE_NONE) {
1513 int ret;
1514
1515 ret = __irq_set_trigger(desc, irq, type);
1516
1517 if (ret) {
1518 WARN(1, "failed to set type for IRQ%d\n", irq);
1519 goto out;
1520 }
1521 }
1522
1523 irq_percpu_enable(desc, cpu);
1524out:
1525 irq_put_desc_unlock(desc, flags);
1526}
1527
1528void disable_percpu_irq(unsigned int irq)
1529{
1530 unsigned int cpu = smp_processor_id();
1531 unsigned long flags;
1532 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1533
1534 if (!desc)
1535 return;
1536
1537 irq_percpu_disable(desc, cpu);
1538 irq_put_desc_unlock(desc, flags);
1539}
1540
1541/*
1542 * Internal function to unregister a percpu irqaction.
1543 */
1544static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1545{
1546 struct irq_desc *desc = irq_to_desc(irq);
1547 struct irqaction *action;
1548 unsigned long flags;
1549
1550 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
1551
1552 if (!desc)
1553 return NULL;
1554
1555 raw_spin_lock_irqsave(&desc->lock, flags);
1556
1557 action = desc->action;
1558 if (!action || action->percpu_dev_id != dev_id) {
1559 WARN(1, "Trying to free already-free IRQ %d\n", irq);
1560 goto bad;
1561 }
1562
1563 if (!cpumask_empty(desc->percpu_enabled)) {
1564 WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
1565 irq, cpumask_first(desc->percpu_enabled));
1566 goto bad;
1567 }
1568
1569 /* Found it - now remove it from the list of entries: */
1570 desc->action = NULL;
1571
1572 raw_spin_unlock_irqrestore(&desc->lock, flags);
1573
1574 unregister_handler_proc(irq, action);
1575
1576 module_put(desc->owner);
1577 return action;
1578
1579bad:
1580 raw_spin_unlock_irqrestore(&desc->lock, flags);
1581 return NULL;
1582}
1583
1584/**
1585 * remove_percpu_irq - free a per-cpu interrupt
1586 * @irq: Interrupt line to free
1587 * @act: irqaction for the interrupt
1588 *
1589 * Used to remove interrupts statically setup by the early boot process.
1590 */
1591void remove_percpu_irq(unsigned int irq, struct irqaction *act)
1592{
1593 struct irq_desc *desc = irq_to_desc(irq);
1594
1595 if (desc && irq_settings_is_per_cpu_devid(desc))
1596 __free_percpu_irq(irq, act->percpu_dev_id);
1597}
1598
1599/**
1600 * free_percpu_irq - free an interrupt allocated with request_percpu_irq
1601 * @irq: Interrupt line to free
1602 * @dev_id: Device identity to free
1603 *
1604 * Remove a percpu interrupt handler. The handler is removed, but
1605 * the interrupt line is not disabled. This must be done on each
1606 * CPU before calling this function. The function does not return
1607 * until any executing interrupts for this IRQ have completed.
1608 *
1609 * This function must not be called from interrupt context.
1610 */
1611void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1612{
1613 struct irq_desc *desc = irq_to_desc(irq);
1614
1615 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1616 return;
1617
1618 chip_bus_lock(desc);
1619 kfree(__free_percpu_irq(irq, dev_id));
1620 chip_bus_sync_unlock(desc);
1621}
1622
1623/**
1624 * setup_percpu_irq - setup a per-cpu interrupt
1625 * @irq: Interrupt line to setup
1626 * @act: irqaction for the interrupt
1627 *
1628 * Used to statically setup per-cpu interrupts in the early boot process.
1629 */
1630int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1631{
1632 struct irq_desc *desc = irq_to_desc(irq);
1633 int retval;
1634
1635 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1636 return -EINVAL;
1637 chip_bus_lock(desc);
1638 retval = __setup_irq(irq, desc, act);
1639 chip_bus_sync_unlock(desc);
1640
1641 return retval;
1642}
1643
1644/**
1645 * request_percpu_irq - allocate a percpu interrupt line
1646 * @irq: Interrupt line to allocate
1647 * @handler: Function to be called when the IRQ occurs.
1648 * @devname: An ascii name for the claiming device
1649 * @dev_id: A percpu cookie passed back to the handler function
1650 *
1651 * This call allocates interrupt resources, but doesn't
1652 * automatically enable the interrupt. It has to be done on each
1653 * CPU using enable_percpu_irq().
1654 *
1655 * Dev_id must be globally unique. It is a per-cpu variable, and
1656 * the handler gets called with the interrupted CPU's instance of
1657 * that variable.
1658 */
1659int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1660 const char *devname, void __percpu *dev_id)
1661{
1662 struct irqaction *action;
1663 struct irq_desc *desc;
1664 int retval;
1665
1666 if (!dev_id)
1667 return -EINVAL;
1668
1669 desc = irq_to_desc(irq);
1670 if (!desc || !irq_settings_can_request(desc) ||
1671 !irq_settings_is_per_cpu_devid(desc))
1672 return -EINVAL;
1673
1674 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1675 if (!action)
1676 return -ENOMEM;
1677
1678 action->handler = handler;
1679 action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
1680 action->name = devname;
1681 action->percpu_dev_id = dev_id;
1682
1683 chip_bus_lock(desc);
1684 retval = __setup_irq(irq, desc, action);
1685 chip_bus_sync_unlock(desc);
1686
1687 if (retval)
1688 kfree(action);
1689
1690 return retval;
1691}
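
The left-hand column above is the per-CPU IRQ API that this patch drops (request_percpu_irq(), enable_percpu_irq(), free_percpu_irq() and friends). For reference, callers on the newer side use it roughly as sketched below, usually from arch or irqchip code, with a percpu cookie as dev_id; everything here other than the API calls themselves is invented for the example:

    #include <linux/interrupt.h>
    #include <linux/irq.h>
    #include <linux/percpu.h>

    /* invented per-cpu cookie */
    static DEFINE_PER_CPU(int, foo_percpu_cookie);

    static irqreturn_t foo_percpu_isr(int irq, void *dev_id)
    {
            int *cookie = dev_id;   /* this CPU's instance of the variable */

            (*cookie)++;
            return IRQ_HANDLED;
    }

    static int foo_percpu_setup(unsigned int irq)
    {
            int ret;

            ret = request_percpu_irq(irq, foo_percpu_isr, "foo-percpu",
                                     &foo_percpu_cookie);
            if (ret)
                    return ret;

            /* must be enabled on each CPU separately, e.g. from a CPU-up hook */
            enable_percpu_irq(irq, IRQ_TYPE_NONE);
            return 0;
    }
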
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index ca3f4aaff70..47420908fba 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -42,8 +42,13 @@ void irq_move_masked_irq(struct irq_data *idata)
42 * For correct operation this depends on the caller 42 * For correct operation this depends on the caller
43 * masking the irqs. 43 * masking the irqs.
44 */ 44 */
45 if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
46 irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); 46 < nr_cpu_ids))
47 if (!chip->irq_set_affinity(&desc->irq_data,
48 desc->pending_mask, false)) {
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
50 irq_set_thread_affinity(desc);
51 }
47 52
48 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
49} 54}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cb228bf2176..fe4b09cf829 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -103,14 +103,14 @@ int check_wakeup_irqs(void)
103 int irq; 103 int irq;
104 104
105 for_each_irq_desc(irq, desc) { 105 for_each_irq_desc(irq, desc) {
106 /*
107 * Only interrupts which are marked as wakeup source
108 * and have not been disabled before the suspend check
109 * can abort suspend.
110 */
111 if (irqd_is_wakeup_set(&desc->irq_data)) { 106 if (irqd_is_wakeup_set(&desc->irq_data)) {
112 if (desc->depth == 1 && desc->istate & IRQS_PENDING) 107 if (desc->istate & IRQS_PENDING) {
108 pr_info("Wakeup IRQ %d %s pending, suspend aborted\n",
109 irq,
110 desc->action && desc->action->name ?
111 desc->action->name : "");
113 return -EBUSY; 112 return -EBUSY;
113 }
114 continue; 114 continue;
115 } 115 }
116 /* 116 /*
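
check_wakeup_irqs() only honours descriptors whose wakeup flag is set, which a driver arms through irq_set_irq_wake() (see the manage.c hunk above); the newer side additionally requires that the line was still enabled before the suspend check (depth == 1). A typical suspend/resume pairing, sketched with an invented device:

    #include <linux/interrupt.h>
    #include <linux/types.h>

    /* invented device structure for the example */
    struct foo_dev { int irq; bool wakeup_enabled; };

    static int foo_suspend(struct foo_dev *foo)
    {
            if (foo->wakeup_enabled)
                    enable_irq_wake(foo->irq);   /* wraps irq_set_irq_wake(irq, 1) */
            return 0;
    }

    static int foo_resume(struct foo_dev *foo)
    {
            if (foo->wakeup_enabled)
                    disable_irq_wake(foo->irq);  /* irq_set_irq_wake(irq, 0) */
            return 0;
    }
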
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 9065107f083..ef60772d2fe 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -55,33 +55,23 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 /*
59 * We do not resend level type interrupts. Level type
60 * interrupts are resent by hardware when they are still
61 * active. Clear the pending bit so suspend/resume does not
62 * get confused.
63 */
64 if (irq_settings_is_level(desc)) {
65 desc->istate &= ~IRQS_PENDING;
66 return;
67 }
68 if (desc->istate & IRQS_REPLAY)
69 return;
70 if (desc->istate & IRQS_PENDING) { 58 if (desc->istate & IRQS_PENDING) {
71 desc->istate &= ~IRQS_PENDING; 59 desc->istate &= ~IRQS_PENDING;
60 /*
61 * We do not resend level type interrupts. Level type
62 * interrupts are resent by hardware when they are still
63 * active.
64 */
65 if (irq_settings_is_level(desc))
66 return;
67 if (desc->istate & IRQS_REPLAY)
68 return;
69
72 desc->istate |= IRQS_REPLAY; 70 desc->istate |= IRQS_REPLAY;
73 71
74 if (!desc->irq_data.chip->irq_retrigger || 72 if (!desc->irq_data.chip->irq_retrigger ||
75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 73 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
76#ifdef CONFIG_HARDIRQS_SW_RESEND 74#ifdef CONFIG_HARDIRQS_SW_RESEND
77 /*
78 * If the interrupt has a parent irq and runs
79 * in the thread context of the parent irq,
80 * retrigger the parent.
81 */
82 if (desc->parent_irq &&
83 irq_settings_is_nested_thread(desc))
84 irq = desc->parent_irq;
85 /* Set it pending and activate the softirq: */ 75 /* Set it pending and activate the softirq: */
86 set_bit(irq, irqs_resend); 76 set_bit(irq, irqs_resend);
87 tasklet_schedule(&resend_tasklet); 77 tasklet_schedule(&resend_tasklet);
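
When the chip has no working irq_retrigger() callback, both versions above fall back to a software resend: the irq number is set in the irqs_resend bitmap and the resend tasklet later replays every marked descriptor. A rough userspace model of that bookkeeping (no tasklets, just the bitmap walk; the replay itself is only hinted at in a comment):

    #include <stdio.h>
    #include <limits.h>

    #define NR_IRQS       64
    #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

    static unsigned long irqs_resend[NR_IRQS / BITS_PER_LONG];

    static void mark_resend(unsigned int irq)
    {
            irqs_resend[irq / BITS_PER_LONG] |= 1UL << (irq % BITS_PER_LONG);
    }

    /* stands in for the tasklet body: replay and clear every marked irq */
    static void resend_irqs(void)
    {
            unsigned int irq;

            for (irq = 0; irq < NR_IRQS; irq++) {
                    unsigned long *w = &irqs_resend[irq / BITS_PER_LONG];
                    unsigned long bit = 1UL << (irq % BITS_PER_LONG);

                    if (*w & bit) {
                            *w &= ~bit;
                            printf("replaying irq %u\n", irq);  /* desc->handle_irq() */
                    }
            }
    }

    int main(void)
    {
            mark_resend(3);
            mark_resend(42);
            resend_irqs();
            return 0;
    }
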
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 1162f1030f1..f1667833d44 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -13,7 +13,6 @@ enum {
13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, 13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, 15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 16 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
18}; 17};
19 18
@@ -25,7 +24,6 @@ enum {
25#define IRQ_NOTHREAD GOT_YOU_MORON 24#define IRQ_NOTHREAD GOT_YOU_MORON
26#define IRQ_NOAUTOEN GOT_YOU_MORON 25#define IRQ_NOAUTOEN GOT_YOU_MORON
27#define IRQ_NESTED_THREAD GOT_YOU_MORON 26#define IRQ_NESTED_THREAD GOT_YOU_MORON
28#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
29#undef IRQF_MODIFY_MASK 27#undef IRQF_MODIFY_MASK
30#define IRQF_MODIFY_MASK GOT_YOU_MORON 28#define IRQF_MODIFY_MASK GOT_YOU_MORON
31 29
@@ -41,11 +39,6 @@ static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
41 return desc->status_use_accessors & _IRQ_PER_CPU; 39 return desc->status_use_accessors & _IRQ_PER_CPU;
42} 40}
43 41
44static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
45{
46 return desc->status_use_accessors & _IRQ_PER_CPU_DEVID;
47}
48
49static inline void irq_settings_set_per_cpu(struct irq_desc *desc) 42static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
50{ 43{
51 desc->status_use_accessors |= _IRQ_PER_CPU; 44 desc->status_use_accessors |= _IRQ_PER_CPU;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd6003c4..dc813a948be 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -325,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
325 desc->irqs_unhandled = 0; 325 desc->irqs_unhandled = 0;
326} 326}
327 327
328bool noirqdebug __read_mostly; 328int noirqdebug __read_mostly;
329 329
330int noirqdebug_setup(char *str) 330int noirqdebug_setup(char *str)
331{ 331{
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 1588e3b2871..c58fa7da8ae 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -5,14 +5,10 @@
5 * context. The enqueueing is NMI-safe. 5 * context. The enqueueing is NMI-safe.
6 */ 6 */
7 7
8#include <linux/bug.h>
9#include <linux/kernel.h> 8#include <linux/kernel.h>
10#include <linux/export.h> 9#include <linux/module.h>
11#include <linux/irq_work.h> 10#include <linux/irq_work.h>
12#include <linux/percpu.h>
13#include <linux/hardirq.h> 11#include <linux/hardirq.h>
14#include <linux/irqflags.h>
15#include <asm/processor.h>
16 12
17/* 13/*
18 * An entry can be in one of four states: 14 * An entry can be in one of four states:
@@ -21,34 +17,54 @@
21 * claimed NULL, 3 -> {pending} : claimed to be enqueued 17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
22 * pending next, 3 -> {busy} : queued, pending callback 18 * pending next, 3 -> {busy} : queued, pending callback
23 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
24 */ 23 */
25 24
26#define IRQ_WORK_PENDING 1UL 25#define IRQ_WORK_PENDING 1UL
27#define IRQ_WORK_BUSY 2UL 26#define IRQ_WORK_BUSY 2UL
28#define IRQ_WORK_FLAGS 3UL 27#define IRQ_WORK_FLAGS 3UL
29 28
30static DEFINE_PER_CPU(struct llist_head, irq_work_list); 29static inline bool irq_work_is_set(struct irq_work *entry, int flags)
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
31 49
32/* 50/*
33 * Claim the entry so that no one else will poke at it. 51 * Claim the entry so that no one else will poke at it.
34 */ 52 */
35static bool irq_work_claim(struct irq_work *work) 53static bool irq_work_claim(struct irq_work *entry)
36{ 54{
37 unsigned long flags, nflags; 55 struct irq_work *next, *nflags;
38 56
39 for (;;) { 57 do {
40 flags = work->flags; 58 next = entry->next;
41 if (flags & IRQ_WORK_PENDING) 59 if ((unsigned long)next & IRQ_WORK_PENDING)
42 return false; 60 return false;
43 nflags = flags | IRQ_WORK_FLAGS; 61 nflags = next_flags(next, IRQ_WORK_FLAGS);
44 if (cmpxchg(&work->flags, flags, nflags) == flags) 62 } while (cmpxchg(&entry->next, next, nflags) != next);
45 break;
46 cpu_relax();
47 }
48 63
49 return true; 64 return true;
50} 65}
51 66
67
52void __weak arch_irq_work_raise(void) 68void __weak arch_irq_work_raise(void)
53{ 69{
54 /* 70 /*
@@ -59,15 +75,20 @@ void __weak arch_irq_work_raise(void)
59/* 75/*
60 * Queue the entry and raise the IPI if needed. 76 * Queue the entry and raise the IPI if needed.
61 */ 77 */
62static void __irq_work_queue(struct irq_work *work) 78static void __irq_work_queue(struct irq_work *entry)
63{ 79{
64 bool empty; 80 struct irq_work *next;
65 81
66 preempt_disable(); 82 preempt_disable();
67 83
68 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 84 do {
85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89
69 /* The list was empty, raise self-interrupt to start processing. */ 90 /* The list was empty, raise self-interrupt to start processing. */
70 if (empty) 91 if (!irq_work_next(entry))
71 arch_irq_work_raise(); 92 arch_irq_work_raise();
72 93
73 preempt_enable(); 94 preempt_enable();
@@ -79,16 +100,16 @@ static void __irq_work_queue(struct irq_work *work)
79 * 100 *
80 * Can be re-enqueued while the callback is still in progress. 101 * Can be re-enqueued while the callback is still in progress.
81 */ 102 */
82bool irq_work_queue(struct irq_work *work) 103bool irq_work_queue(struct irq_work *entry)
83{ 104{
84 if (!irq_work_claim(work)) { 105 if (!irq_work_claim(entry)) {
85 /* 106 /*
86 * Already enqueued, can't do! 107 * Already enqueued, can't do!
87 */ 108 */
88 return false; 109 return false;
89 } 110 }
90 111
91 __irq_work_queue(work); 112 __irq_work_queue(entry);
92 return true; 113 return true;
93} 114}
94EXPORT_SYMBOL_GPL(irq_work_queue); 115EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -99,34 +120,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
99 */ 120 */
100void irq_work_run(void) 121void irq_work_run(void)
101{ 122{
102 struct irq_work *work; 123 struct irq_work *list;
103 struct llist_head *this_list;
104 struct llist_node *llnode;
105 124
106 this_list = &__get_cpu_var(irq_work_list); 125 if (this_cpu_read(irq_work_list) == NULL)
107 if (llist_empty(this_list))
108 return; 126 return;
109 127
110 BUG_ON(!in_irq()); 128 BUG_ON(!in_irq());
111 BUG_ON(!irqs_disabled()); 129 BUG_ON(!irqs_disabled());
112 130
113 llnode = llist_del_all(this_list); 131 list = this_cpu_xchg(irq_work_list, NULL);
114 while (llnode != NULL) { 132
115 work = llist_entry(llnode, struct irq_work, llnode); 133 while (list != NULL) {
134 struct irq_work *entry = list;
116 135
117 llnode = llist_next(llnode); 136 list = irq_work_next(list);
118 137
119 /* 138 /*
120 * Clear the PENDING bit, after this point the @work 139 * Clear the PENDING bit, after this point the @entry
121 * can be re-used. 140 * can be re-used.
122 */ 141 */
123 work->flags = IRQ_WORK_BUSY; 142 entry->next = next_flags(NULL, IRQ_WORK_BUSY);
124 work->func(work); 143 entry->func(entry);
125 /* 144 /*
126 * Clear the BUSY bit and return to the free state if 145 * Clear the BUSY bit and return to the free state if
127 * no-one else claimed it meanwhile. 146 * no-one else claimed it meanwhile.
128 */ 147 */
129 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); 148 (void)cmpxchg(&entry->next,
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
130 } 151 }
131} 152}
132EXPORT_SYMBOL_GPL(irq_work_run); 153EXPORT_SYMBOL_GPL(irq_work_run);
@@ -135,11 +156,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
135 * Synchronize against the irq_work @entry, ensures the entry is not 156 * Synchronize against the irq_work @entry, ensures the entry is not
136 * currently in use. 157 * currently in use.
137 */ 158 */
138void irq_work_sync(struct irq_work *work) 159void irq_work_sync(struct irq_work *entry)
139{ 160{
140 WARN_ON_ONCE(irqs_disabled()); 161 WARN_ON_ONCE(irqs_disabled());
141 162
142 while (work->flags & IRQ_WORK_BUSY) 163 while (irq_work_is_set(entry, IRQ_WORK_BUSY))
143 cpu_relax(); 164 cpu_relax();
144} 165}
145EXPORT_SYMBOL_GPL(irq_work_sync); 166EXPORT_SYMBOL_GPL(irq_work_sync);
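
The older irq_work code on the right keeps the PENDING and BUSY state in the two low bits of the ->next pointer itself, which works because struct irq_work is at least word-aligned: next_flags() ORs the bits in and irq_work_next() masks them back out, so a single cmpxchg on ->next updates the link and the state together. A standalone sketch of that encoding (plain C, uintptr_t instead of kernel types):

    #include <stdio.h>
    #include <stdint.h>

    #define IRQ_WORK_PENDING 1UL
    #define IRQ_WORK_BUSY    2UL
    #define IRQ_WORK_FLAGS   3UL

    struct work { struct work *next; };

    /* pack flag bits into the low bits of an (aligned) pointer */
    static struct work *pack(struct work *next, unsigned long flags)
    {
            return (struct work *)((uintptr_t)next | flags);
    }

    /* strip the flag bits to recover the real pointer */
    static struct work *unpack(struct work *entry)
    {
            return (struct work *)((uintptr_t)entry & ~IRQ_WORK_FLAGS);
    }

    static unsigned long flags_of(struct work *entry)
    {
            return (uintptr_t)entry & IRQ_WORK_FLAGS;
    }

    int main(void)
    {
            struct work a, b;

            a.next = pack(&b, IRQ_WORK_PENDING | IRQ_WORK_BUSY);
            printf("real next ok: %d, flags: 0x%lx\n",
                   unpack(a.next) == &b, flags_of(a.next));
            return 0;
    }
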
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 8d262b46757..d802883153d 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
52 52
53 cval = it->expires; 53 cval = it->expires;
54 cinterval = it->incr; 54 cinterval = it->incr;
55 if (cval) { 55 if (!cputime_eq(cval, cputime_zero)) {
56 struct task_cputime cputime; 56 struct task_cputime cputime;
57 cputime_t t; 57 cputime_t t;
58 58
59 thread_group_cputimer(tsk, &cputime); 59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF) 60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime.utime + cputime.stime; 61 t = cputime_add(cputime.utime, cputime.stime);
62 else 62 else
63 /* CPUCLOCK_VIRT */ 63 /* CPUCLOCK_VIRT */
64 t = cputime.utime; 64 t = cputime.utime;
65 65
66 if (cval < t) 66 if (cputime_le(cval, t))
67 /* about to fire */ 67 /* about to fire */
68 cval = cputime_one_jiffy; 68 cval = cputime_one_jiffy;
69 else 69 else
70 cval = cval - t; 70 cval = cputime_sub(cval, t);
71 } 71 }
72 72
73 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,9 +161,10 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
161 161
162 cval = it->expires; 162 cval = it->expires;
163 cinterval = it->incr; 163 cinterval = it->incr;
164 if (cval || nval) { 164 if (!cputime_eq(cval, cputime_zero) ||
165 if (nval > 0) 165 !cputime_eq(nval, cputime_zero)) {
166 nval += cputime_one_jiffy; 166 if (cputime_gt(nval, cputime_zero))
167 nval = cputime_add(nval, cputime_one_jiffy);
167 set_process_cpu_timer(tsk, clock_id, &nval, &cval); 168 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
168 } 169 }
169 it->expires = nval; 170 it->expires = nval;
@@ -284,12 +285,8 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
284 if (value) { 285 if (value) {
285 if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) 286 if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
286 return -EFAULT; 287 return -EFAULT;
287 } else { 288 } else
288 memset(&set_buffer, 0, sizeof(set_buffer)); 289 memset((char *) &set_buffer, 0, sizeof(set_buffer));
289 printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
290 " Misfeature support will be removed\n",
291 current->comm);
292 }
293 290
294 error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); 291 error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
295 if (error || !ovalue) 292 if (error || !ovalue)
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 60f48fa0fd0..e6f1f24ad57 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sort.h> 13#include <linux/sort.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/static_key.h> 15#include <linux/jump_label.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
@@ -29,6 +29,11 @@ void jump_label_unlock(void)
29 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
30} 30}
31 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
32static int jump_label_cmp(const void *a, const void *b) 37static int jump_label_cmp(const void *a, const void *b)
33{ 38{
34 const struct jump_entry *jea = a; 39 const struct jump_entry *jea = a;
@@ -53,73 +58,29 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
53 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
54} 59}
55 60
56static void jump_label_update(struct static_key *key, int enable); 61static void jump_label_update(struct jump_label_key *key, int enable);
57 62
58void static_key_slow_inc(struct static_key *key) 63void jump_label_inc(struct jump_label_key *key)
59{ 64{
60 if (atomic_inc_not_zero(&key->enabled)) 65 if (atomic_inc_not_zero(&key->enabled))
61 return; 66 return;
62 67
63 jump_label_lock(); 68 jump_label_lock();
64 if (atomic_read(&key->enabled) == 0) { 69 if (atomic_read(&key->enabled) == 0)
65 if (!jump_label_get_branch_default(key)) 70 jump_label_update(key, JUMP_LABEL_ENABLE);
66 jump_label_update(key, JUMP_LABEL_ENABLE);
67 else
68 jump_label_update(key, JUMP_LABEL_DISABLE);
69 }
70 atomic_inc(&key->enabled); 71 atomic_inc(&key->enabled);
71 jump_label_unlock(); 72 jump_label_unlock();
72} 73}
73EXPORT_SYMBOL_GPL(static_key_slow_inc);
74 74
75static void __static_key_slow_dec(struct static_key *key, 75void jump_label_dec(struct jump_label_key *key)
76 unsigned long rate_limit, struct delayed_work *work)
77{ 76{
78 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { 77 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
79 WARN(atomic_read(&key->enabled) < 0,
80 "jump label: negative count!\n");
81 return; 78 return;
82 }
83 79
84 if (rate_limit) { 80 jump_label_update(key, JUMP_LABEL_DISABLE);
85 atomic_inc(&key->enabled);
86 schedule_delayed_work(work, rate_limit);
87 } else {
88 if (!jump_label_get_branch_default(key))
89 jump_label_update(key, JUMP_LABEL_DISABLE);
90 else
91 jump_label_update(key, JUMP_LABEL_ENABLE);
92 }
93 jump_label_unlock(); 81 jump_label_unlock();
94} 82}
95 83
96static void jump_label_update_timeout(struct work_struct *work)
97{
98 struct static_key_deferred *key =
99 container_of(work, struct static_key_deferred, work.work);
100 __static_key_slow_dec(&key->key, 0, NULL);
101}
102
103void static_key_slow_dec(struct static_key *key)
104{
105 __static_key_slow_dec(key, 0, NULL);
106}
107EXPORT_SYMBOL_GPL(static_key_slow_dec);
108
109void static_key_slow_dec_deferred(struct static_key_deferred *key)
110{
111 __static_key_slow_dec(&key->key, key->timeout, &key->work);
112}
113EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
114
115void jump_label_rate_limit(struct static_key_deferred *key,
116 unsigned long rl)
117{
118 key->timeout = rl;
119 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
120}
121EXPORT_SYMBOL_GPL(jump_label_rate_limit);
122
123static int addr_conflict(struct jump_entry *entry, void *start, void *end) 84static int addr_conflict(struct jump_entry *entry, void *start, void *end)
124{ 85{
125 if (entry->code <= (unsigned long)end && 86 if (entry->code <= (unsigned long)end &&
@@ -144,19 +105,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
144 return 0; 105 return 0;
145} 106}
146 107
147/* 108static void __jump_label_update(struct jump_label_key *key,
148 * Update code which is definitely not currently executing.
149 * Architectures which need heavyweight synchronization to modify
150 * running code can override this to make the non-live update case
151 * cheaper.
152 */
153void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
154 enum jump_label_type type)
155{
156 arch_jump_label_transform(entry, type);
157}
158
159static void __jump_label_update(struct static_key *key,
160 struct jump_entry *entry, 109 struct jump_entry *entry,
161 struct jump_entry *stop, int enable) 110 struct jump_entry *stop, int enable)
162{ 111{
@@ -173,51 +122,45 @@ static void __jump_label_update(struct static_key *key,
173 } 122 }
174} 123}
175 124
176static enum jump_label_type jump_label_type(struct static_key *key) 125/*
126 * Not all archs need this.
127 */
128void __weak arch_jump_label_text_poke_early(jump_label_t addr)
177{ 129{
178 bool true_branch = jump_label_get_branch_default(key);
179 bool state = static_key_enabled(key);
180
181 if ((!true_branch && state) || (true_branch && !state))
182 return JUMP_LABEL_ENABLE;
183
184 return JUMP_LABEL_DISABLE;
185} 130}
186 131
187void __init jump_label_init(void) 132static __init int jump_label_init(void)
188{ 133{
189 struct jump_entry *iter_start = __start___jump_table; 134 struct jump_entry *iter_start = __start___jump_table;
190 struct jump_entry *iter_stop = __stop___jump_table; 135 struct jump_entry *iter_stop = __stop___jump_table;
191 struct static_key *key = NULL; 136 struct jump_label_key *key = NULL;
192 struct jump_entry *iter; 137 struct jump_entry *iter;
193 138
194 jump_label_lock(); 139 jump_label_lock();
195 jump_label_sort_entries(iter_start, iter_stop); 140 jump_label_sort_entries(iter_start, iter_stop);
196 141
197 for (iter = iter_start; iter < iter_stop; iter++) { 142 for (iter = iter_start; iter < iter_stop; iter++) {
198 struct static_key *iterk; 143 arch_jump_label_text_poke_early(iter->code);
199 144 if (iter->key == (jump_label_t)(unsigned long)key)
200 iterk = (struct static_key *)(unsigned long)iter->key;
201 arch_jump_label_transform_static(iter, jump_label_type(iterk));
202 if (iterk == key)
203 continue; 145 continue;
204 146
205 key = iterk; 147 key = (struct jump_label_key *)(unsigned long)iter->key;
206 /* 148 atomic_set(&key->enabled, 0);
207 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 149 key->entries = iter;
208 */
209 *((unsigned long *)&key->entries) += (unsigned long)iter;
210#ifdef CONFIG_MODULES 150#ifdef CONFIG_MODULES
211 key->next = NULL; 151 key->next = NULL;
212#endif 152#endif
213 } 153 }
214 jump_label_unlock(); 154 jump_label_unlock();
155
156 return 0;
215} 157}
158early_initcall(jump_label_init);
216 159
217#ifdef CONFIG_MODULES 160#ifdef CONFIG_MODULES
218 161
219struct static_key_mod { 162struct jump_label_mod {
220 struct static_key_mod *next; 163 struct jump_label_mod *next;
221 struct jump_entry *entries; 164 struct jump_entry *entries;
222 struct module *mod; 165 struct module *mod;
223}; 166};
@@ -237,9 +180,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
237 start, end); 180 start, end);
238} 181}
239 182
240static void __jump_label_mod_update(struct static_key *key, int enable) 183static void __jump_label_mod_update(struct jump_label_key *key, int enable)
241{ 184{
242 struct static_key_mod *mod = key->next; 185 struct jump_label_mod *mod = key->next;
243 186
244 while (mod) { 187 while (mod) {
245 struct module *m = mod->mod; 188 struct module *m = mod->mod;
@@ -269,9 +212,8 @@ void jump_label_apply_nops(struct module *mod)
269 if (iter_start == iter_stop) 212 if (iter_start == iter_stop)
270 return; 213 return;
271 214
272 for (iter = iter_start; iter < iter_stop; iter++) { 215 for (iter = iter_start; iter < iter_stop; iter++)
273 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); 216 arch_jump_label_text_poke_early(iter->code);
274 }
275} 217}
276 218
277static int jump_label_add_module(struct module *mod) 219static int jump_label_add_module(struct module *mod)
@@ -279,8 +221,8 @@ static int jump_label_add_module(struct module *mod)
279 struct jump_entry *iter_start = mod->jump_entries; 221 struct jump_entry *iter_start = mod->jump_entries;
280 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 222 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
281 struct jump_entry *iter; 223 struct jump_entry *iter;
282 struct static_key *key = NULL; 224 struct jump_label_key *key = NULL;
283 struct static_key_mod *jlm; 225 struct jump_label_mod *jlm;
284 226
285 /* if the module doesn't have jump label entries, just return */ 227 /* if the module doesn't have jump label entries, just return */
286 if (iter_start == iter_stop) 228 if (iter_start == iter_stop)
@@ -289,31 +231,30 @@ static int jump_label_add_module(struct module *mod)
289 jump_label_sort_entries(iter_start, iter_stop); 231 jump_label_sort_entries(iter_start, iter_stop);
290 232
291 for (iter = iter_start; iter < iter_stop; iter++) { 233 for (iter = iter_start; iter < iter_stop; iter++) {
292 struct static_key *iterk; 234 if (iter->key == (jump_label_t)(unsigned long)key)
293
294 iterk = (struct static_key *)(unsigned long)iter->key;
295 if (iterk == key)
296 continue; 235 continue;
297 236
298 key = iterk; 237 key = (struct jump_label_key *)(unsigned long)iter->key;
238
299 if (__module_address(iter->key) == mod) { 239 if (__module_address(iter->key) == mod) {
300 /* 240 atomic_set(&key->enabled, 0);
301 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 241 key->entries = iter;
302 */
303 *((unsigned long *)&key->entries) += (unsigned long)iter;
304 key->next = NULL; 242 key->next = NULL;
305 continue; 243 continue;
306 } 244 }
307 jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); 245
246 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
308 if (!jlm) 247 if (!jlm)
309 return -ENOMEM; 248 return -ENOMEM;
249
310 jlm->mod = mod; 250 jlm->mod = mod;
311 jlm->entries = iter; 251 jlm->entries = iter;
312 jlm->next = key->next; 252 jlm->next = key->next;
313 key->next = jlm; 253 key->next = jlm;
314 254
315 if (jump_label_type(key) == JUMP_LABEL_ENABLE) 255 if (jump_label_enabled(key))
316 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); 256 __jump_label_update(key, iter, iter_stop,
257 JUMP_LABEL_ENABLE);
317 } 258 }
318 259
319 return 0; 260 return 0;
@@ -324,14 +265,14 @@ static void jump_label_del_module(struct module *mod)
324 struct jump_entry *iter_start = mod->jump_entries; 265 struct jump_entry *iter_start = mod->jump_entries;
325 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 266 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
326 struct jump_entry *iter; 267 struct jump_entry *iter;
327 struct static_key *key = NULL; 268 struct jump_label_key *key = NULL;
328 struct static_key_mod *jlm, **prev; 269 struct jump_label_mod *jlm, **prev;
329 270
330 for (iter = iter_start; iter < iter_stop; iter++) { 271 for (iter = iter_start; iter < iter_stop; iter++) {
331 if (iter->key == (jump_label_t)(unsigned long)key) 272 if (iter->key == (jump_label_t)(unsigned long)key)
332 continue; 273 continue;
333 274
334 key = (struct static_key *)(unsigned long)iter->key; 275 key = (struct jump_label_key *)(unsigned long)iter->key;
335 276
336 if (__module_address(iter->key) == mod) 277 if (__module_address(iter->key) == mod)
337 continue; 278 continue;
@@ -433,13 +374,12 @@ int jump_label_text_reserved(void *start, void *end)
433 return ret; 374 return ret;
434} 375}
435 376
436static void jump_label_update(struct static_key *key, int enable) 377static void jump_label_update(struct jump_label_key *key, int enable)
437{ 378{
438 struct jump_entry *stop = __stop___jump_table; 379 struct jump_entry *entry = key->entries, *stop = __stop___jump_table;
439 struct jump_entry *entry = jump_label_get_entries(key);
440 380
441#ifdef CONFIG_MODULES 381#ifdef CONFIG_MODULES
442 struct module *mod = __module_address((unsigned long)key); 382 struct module *mod = __module_address((jump_label_t)key);
443 383
444 __jump_label_mod_update(key, enable); 384 __jump_label_mod_update(key, enable);
445 385
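After this hunk the tree consumes the older jump_label_key interface rather than static_key. A hedged kernel-side sketch of how that interface is typically used (editor's illustration; the key name and messages are made up, while jump_label_inc(), jump_label_dec() and static_branch() are the interface of that era):

    #include <linux/kernel.h>
    #include <linux/jump_label.h>

    /* A zero-initialized key starts out disabled: on HAVE_JUMP_LABEL
     * architectures static_branch() compiles down to a NOP until the key
     * is incremented. */
    static struct jump_label_key tracing_enabled;

    static void fast_path(void)
    {
            if (static_branch(&tracing_enabled))
                    pr_info("slow path: tracing hook ran\n");
    }

    static void enable_tracing(void)
    {
            jump_label_inc(&tracing_enabled);       /* patch NOP -> jump */
    }

    static void disable_tracing(void)
    {
            jump_label_dec(&tracing_enabled);       /* restore the NOP */
    }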
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2169feeba52..079f1d39a8b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
343 343
344/* Look up a kernel symbol and return it in a text buffer. */ 344/* Look up a kernel symbol and return it in a text buffer. */
345static int __sprint_symbol(char *buffer, unsigned long address, 345static int __sprint_symbol(char *buffer, unsigned long address,
346 int symbol_offset, int add_offset) 346 int symbol_offset)
347{ 347{
348 char *modname; 348 char *modname;
349 const char *name; 349 const char *name;
@@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address,
358 if (name != buffer) 358 if (name != buffer)
359 strcpy(buffer, name); 359 strcpy(buffer, name);
360 len = strlen(buffer); 360 len = strlen(buffer);
361 buffer += len;
361 offset -= symbol_offset; 362 offset -= symbol_offset;
362 363
363 if (add_offset)
364 len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
365
366 if (modname) 364 if (modname)
367 len += sprintf(buffer + len, " [%s]", modname); 365 len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
366 else
367 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
368 368
369 return len; 369 return len;
370} 370}
@@ -382,26 +382,10 @@ static int __sprint_symbol(char *buffer, unsigned long address,
382 */ 382 */
383int sprint_symbol(char *buffer, unsigned long address) 383int sprint_symbol(char *buffer, unsigned long address)
384{ 384{
385 return __sprint_symbol(buffer, address, 0, 1); 385 return __sprint_symbol(buffer, address, 0);
386} 386}
387EXPORT_SYMBOL_GPL(sprint_symbol);
388 387
389/** 388EXPORT_SYMBOL_GPL(sprint_symbol);
390 * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
391 * @buffer: buffer to be stored
392 * @address: address to lookup
393 *
394 * This function looks up a kernel symbol with @address and stores its name
395 * and module name to @buffer if possible. If no symbol was found, just saves
396 * its @address as is.
397 *
398 * This function returns the number of bytes stored in @buffer.
399 */
400int sprint_symbol_no_offset(char *buffer, unsigned long address)
401{
402 return __sprint_symbol(buffer, address, 0, 0);
403}
404EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
405 389
406/** 390/**
407 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer 391 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
@@ -419,7 +403,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
419 */ 403 */
420int sprint_backtrace(char *buffer, unsigned long address) 404int sprint_backtrace(char *buffer, unsigned long address)
421{ 405{
422 return __sprint_symbol(buffer, address, -1, 1); 406 return __sprint_symbol(buffer, address, -1);
423} 407}
424 408
425/* Look up a kernel symbol and print it to the kernel messages. */ 409/* Look up a kernel symbol and print it to the kernel messages. */
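With sprint_symbol_no_offset() and the add_offset flag gone, sprint_symbol() always emits the "name+offset/size [module]" form. A short sketch of typical in-kernel usage (editor's illustration; report_caller() is a made-up helper):

    #include <linux/kernel.h>
    #include <linux/kallsyms.h>

    static void report_caller(unsigned long addr)
    {
            char buf[KSYM_SYMBOL_LEN];

            sprint_symbol(buf, addr);       /* e.g. "vfs_read+0x10/0x180" */
            pr_info("called from %s\n", buf);

            /* the %pS printk extension gives the same output without a buffer */
            pr_info("called from %pS\n", (void *)addr);
    }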
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
deleted file mode 100644
index e30ac0fe61c..00000000000
--- a/kernel/kcmp.c
+++ /dev/null
@@ -1,197 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/syscalls.h>
3#include <linux/fdtable.h>
4#include <linux/string.h>
5#include <linux/random.h>
6#include <linux/module.h>
7#include <linux/ptrace.h>
8#include <linux/init.h>
9#include <linux/errno.h>
10#include <linux/cache.h>
11#include <linux/bug.h>
12#include <linux/err.h>
13#include <linux/kcmp.h>
14
15#include <asm/unistd.h>
16
17/*
18 * We don't expose the real in-memory order of objects for security reasons.
19 * But still the comparison results should be suitable for sorting. So we
20 * obfuscate kernel pointers values and compare the production instead.
21 *
22 * The obfuscation is done in two steps. First we xor the kernel pointer with
23 * a random value, which puts pointer into a new position in a reordered space.
24 * Secondly we multiply the xor production with a large odd random number to
25 * permute its bits even more (the odd multiplier guarantees that the product
26 * is unique even after the high bits are truncated, since any odd number is
27 * relatively prime to 2^n).
28 *
29 * Note also that the obfuscation itself is invisible to userspace and if needed
30 * it can be changed to an alternate scheme.
31 */
32static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
33
34static long kptr_obfuscate(long v, int type)
35{
36 return (v ^ cookies[type][0]) * cookies[type][1];
37}
38
39/*
40 * 0 - equal, i.e. v1 = v2
41 * 1 - less than, i.e. v1 < v2
42 * 2 - greater than, i.e. v1 > v2
43 * 3 - not equal but ordering unavailable (reserved for future)
44 */
45static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
46{
47 long ret;
48
49 ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
50
51 return (ret < 0) | ((ret > 0) << 1);
52}
53
54/* The caller must have pinned the task */
55static struct file *
56get_file_raw_ptr(struct task_struct *task, unsigned int idx)
57{
58 struct file *file = NULL;
59
60 task_lock(task);
61 rcu_read_lock();
62
63 if (task->files)
64 file = fcheck_files(task->files, idx);
65
66 rcu_read_unlock();
67 task_unlock(task);
68
69 return file;
70}
71
72static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
73{
74 if (likely(m2 != m1))
75 mutex_unlock(m2);
76 mutex_unlock(m1);
77}
78
79static int kcmp_lock(struct mutex *m1, struct mutex *m2)
80{
81 int err;
82
83 if (m2 > m1)
84 swap(m1, m2);
85
86 err = mutex_lock_killable(m1);
87 if (!err && likely(m1 != m2)) {
88 err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
89 if (err)
90 mutex_unlock(m1);
91 }
92
93 return err;
94}
95
96SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
97 unsigned long, idx1, unsigned long, idx2)
98{
99 struct task_struct *task1, *task2;
100 int ret;
101
102 rcu_read_lock();
103
104 /*
105 * Tasks are looked up in caller's PID namespace only.
106 */
107 task1 = find_task_by_vpid(pid1);
108 task2 = find_task_by_vpid(pid2);
109 if (!task1 || !task2)
110 goto err_no_task;
111
112 get_task_struct(task1);
113 get_task_struct(task2);
114
115 rcu_read_unlock();
116
117 /*
118 * One should have enough rights to inspect task details.
119 */
120 ret = kcmp_lock(&task1->signal->cred_guard_mutex,
121 &task2->signal->cred_guard_mutex);
122 if (ret)
123 goto err;
124 if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
125 !ptrace_may_access(task2, PTRACE_MODE_READ)) {
126 ret = -EPERM;
127 goto err_unlock;
128 }
129
130 switch (type) {
131 case KCMP_FILE: {
132 struct file *filp1, *filp2;
133
134 filp1 = get_file_raw_ptr(task1, idx1);
135 filp2 = get_file_raw_ptr(task2, idx2);
136
137 if (filp1 && filp2)
138 ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
139 else
140 ret = -EBADF;
141 break;
142 }
143 case KCMP_VM:
144 ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
145 break;
146 case KCMP_FILES:
147 ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
148 break;
149 case KCMP_FS:
150 ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
151 break;
152 case KCMP_SIGHAND:
153 ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
154 break;
155 case KCMP_IO:
156 ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
157 break;
158 case KCMP_SYSVSEM:
159#ifdef CONFIG_SYSVIPC
160 ret = kcmp_ptr(task1->sysvsem.undo_list,
161 task2->sysvsem.undo_list,
162 KCMP_SYSVSEM);
163#else
164 ret = -EOPNOTSUPP;
165#endif
166 break;
167 default:
168 ret = -EINVAL;
169 break;
170 }
171
172err_unlock:
173 kcmp_unlock(&task1->signal->cred_guard_mutex,
174 &task2->signal->cred_guard_mutex);
175err:
176 put_task_struct(task1);
177 put_task_struct(task2);
178
179 return ret;
180
181err_no_task:
182 rcu_read_unlock();
183 return -ESRCH;
184}
185
186static __init int kcmp_cookies_init(void)
187{
188 int i;
189
190 get_random_bytes(cookies, sizeof(cookies));
191
192 for (i = 0; i < KCMP_TYPES; i++)
193 cookies[i][1] |= (~(~0UL >> 1) | 1);
194
195 return 0;
196}
197arch_initcall(kcmp_cookies_init);
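The deleted file implemented the kcmp(2) syscall. For readers who want to see what the removed interface looked like from userspace, a hedged sketch follows (editor's illustration; it assumes a kernel and libc that still expose SYS_kcmp and <linux/kcmp.h>, which this tree no longer does):

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/kcmp.h>

    /* raw syscall wrapper; glibc historically shipped no kcmp() wrapper */
    static int sys_kcmp(pid_t pid1, pid_t pid2, int type,
                        unsigned long idx1, unsigned long idx2)
    {
            return syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2);
    }

    int main(void)
    {
            pid_t me = getpid();
            int dupfd = dup(1);
            int ret;

            if (dupfd < 0)
                    return 1;
            /* 0: same struct file, 1/2: ordered but different, <0: errno */
            ret = sys_kcmp(me, me, KCMP_FILE, 1, dupfd);
            printf("kcmp(stdout, dup(stdout)) = %d\n", ret);
            return 0;
    }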
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5e4bd7864c5..296fbc84d65 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,6 +21,7 @@
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/elf.h> 22#include <linux/elf.h>
23#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <generated/utsrelease.h>
24#include <linux/utsname.h> 25#include <linux/utsname.h>
25#include <linux/numa.h> 26#include <linux/numa.h>
26#include <linux/suspend.h> 27#include <linux/suspend.h>
@@ -31,11 +32,13 @@
31#include <linux/console.h> 32#include <linux/console.h>
32#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
33#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
34#include <linux/syscore_ops.h> 36#include <linux/syscore_ops.h>
35 37
36#include <asm/page.h> 38#include <asm/page.h>
37#include <asm/uaccess.h> 39#include <asm/uaccess.h>
38#include <asm/io.h> 40#include <asm/io.h>
41#include <asm/system.h>
39#include <asm/sections.h> 42#include <asm/sections.h>
40 43
41/* Per cpu memory for storing cpu states in case of system crash. */ 44/* Per cpu memory for storing cpu states in case of system crash. */
@@ -495,7 +498,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
495 while (hole_end <= crashk_res.end) { 498 while (hole_end <= crashk_res.end) {
496 unsigned long i; 499 unsigned long i;
497 500
498 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) 501 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
499 break; 502 break;
500 if (hole_end > crashk_res.end) 503 if (hole_end > crashk_res.end)
501 break; 504 break;
@@ -996,7 +999,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
996 kimage_free(xchg(&kexec_crash_image, NULL)); 999 kimage_free(xchg(&kexec_crash_image, NULL));
997 result = kimage_crash_alloc(&image, entry, 1000 result = kimage_crash_alloc(&image, entry,
998 nr_segments, segments); 1001 nr_segments, segments);
999 crash_map_reserved_pages();
1000 } 1002 }
1001 if (result) 1003 if (result)
1002 goto out; 1004 goto out;
@@ -1013,8 +1015,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
1013 goto out; 1015 goto out;
1014 } 1016 }
1015 kimage_terminate(image); 1017 kimage_terminate(image);
1016 if (flags & KEXEC_ON_CRASH)
1017 crash_unmap_reserved_pages();
1018 } 1018 }
1019 /* Install the new kernel, and Uninstall the old */ 1019 /* Install the new kernel, and Uninstall the old */
1020 image = xchg(dest_image, image); 1020 image = xchg(dest_image, image);
@@ -1026,18 +1026,6 @@ out:
1026 return result; 1026 return result;
1027} 1027}
1028 1028
1029/*
1030 * Add and remove page tables for crashkernel memory
1031 *
1032 * Provide an empty default implementation here -- architecture
1033 * code may override this
1034 */
1035void __weak crash_map_reserved_pages(void)
1036{}
1037
1038void __weak crash_unmap_reserved_pages(void)
1039{}
1040
1041#ifdef CONFIG_COMPAT 1029#ifdef CONFIG_COMPAT
1042asmlinkage long compat_sys_kexec_load(unsigned long entry, 1030asmlinkage long compat_sys_kexec_load(unsigned long entry,
1043 unsigned long nr_segments, 1031 unsigned long nr_segments,
@@ -1091,6 +1079,8 @@ void crash_kexec(struct pt_regs *regs)
1091 if (kexec_crash_image) { 1079 if (kexec_crash_image) {
1092 struct pt_regs fixed_regs; 1080 struct pt_regs fixed_regs;
1093 1081
1082 kmsg_dump(KMSG_DUMP_KEXEC);
1083
1094 crash_setup_regs(&fixed_regs, regs); 1084 crash_setup_regs(&fixed_regs, regs);
1095 crash_save_vmcoreinfo(); 1085 crash_save_vmcoreinfo();
1096 machine_crash_shutdown(&fixed_regs); 1086 machine_crash_shutdown(&fixed_regs);
@@ -1127,8 +1117,6 @@ int crash_shrink_memory(unsigned long new_size)
1127{ 1117{
1128 int ret = 0; 1118 int ret = 0;
1129 unsigned long start, end; 1119 unsigned long start, end;
1130 unsigned long old_size;
1131 struct resource *ram_res;
1132 1120
1133 mutex_lock(&kexec_mutex); 1121 mutex_lock(&kexec_mutex);
1134 1122
@@ -1138,37 +1126,23 @@ int crash_shrink_memory(unsigned long new_size)
1138 } 1126 }
1139 start = crashk_res.start; 1127 start = crashk_res.start;
1140 end = crashk_res.end; 1128 end = crashk_res.end;
1141 old_size = (end == 0) ? 0 : end - start + 1;
1142 if (new_size >= old_size) {
1143 ret = (new_size == old_size) ? 0 : -EINVAL;
1144 goto unlock;
1145 }
1146 1129
1147 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); 1130 if (new_size >= end - start + 1) {
1148 if (!ram_res) { 1131 ret = -EINVAL;
1149 ret = -ENOMEM; 1132 if (new_size == end - start + 1)
1133 ret = 0;
1150 goto unlock; 1134 goto unlock;
1151 } 1135 }
1152 1136
1153 start = roundup(start, KEXEC_CRASH_MEM_ALIGN); 1137 start = roundup(start, PAGE_SIZE);
1154 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); 1138 end = roundup(start + new_size, PAGE_SIZE);
1155 1139
1156 crash_map_reserved_pages();
1157 crash_free_reserved_phys_range(end, crashk_res.end); 1140 crash_free_reserved_phys_range(end, crashk_res.end);
1158 1141
1159 if ((start == end) && (crashk_res.parent != NULL)) 1142 if ((start == end) && (crashk_res.parent != NULL))
1160 release_resource(&crashk_res); 1143 release_resource(&crashk_res);
1161
1162 ram_res->start = end;
1163 ram_res->end = crashk_res.end;
1164 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1165 ram_res->name = "System RAM";
1166
1167 crashk_res.end = end - 1; 1144 crashk_res.end = end - 1;
1168 1145
1169 insert_resource(&iomem_resource, ram_res);
1170 crash_unmap_reserved_pages();
1171
1172unlock: 1146unlock:
1173 mutex_unlock(&kexec_mutex); 1147 mutex_unlock(&kexec_mutex);
1174 return ret; 1148 return ret;
@@ -1357,10 +1331,6 @@ static int __init parse_crashkernel_simple(char *cmdline,
1357 1331
1358 if (*cur == '@') 1332 if (*cur == '@')
1359 *crash_base = memparse(cur+1, &cur); 1333 *crash_base = memparse(cur+1, &cur);
1360 else if (*cur != ' ' && *cur != '\0') {
1361 pr_warning("crashkernel: unrecognized char\n");
1362 return -EINVAL;
1363 }
1364 1334
1365 return 0; 1335 return 0;
1366} 1336}
@@ -1410,21 +1380,22 @@ int __init parse_crashkernel(char *cmdline,
1410} 1380}
1411 1381
1412 1382
1413static void update_vmcoreinfo_note(void) 1383
1384void crash_save_vmcoreinfo(void)
1414{ 1385{
1415 u32 *buf = vmcoreinfo_note; 1386 u32 *buf;
1416 1387
1417 if (!vmcoreinfo_size) 1388 if (!vmcoreinfo_size)
1418 return; 1389 return;
1390
1391 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1392
1393 buf = (u32 *)vmcoreinfo_note;
1394
1419 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, 1395 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1420 vmcoreinfo_size); 1396 vmcoreinfo_size);
1421 final_note(buf);
1422}
1423 1397
1424void crash_save_vmcoreinfo(void) 1398 final_note(buf);
1425{
1426 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1427 update_vmcoreinfo_note();
1428} 1399}
1429 1400
1430void vmcoreinfo_append_str(const char *fmt, ...) 1401void vmcoreinfo_append_str(const char *fmt, ...)
@@ -1464,9 +1435,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1464 1435
1465 VMCOREINFO_SYMBOL(init_uts_ns); 1436 VMCOREINFO_SYMBOL(init_uts_ns);
1466 VMCOREINFO_SYMBOL(node_online_map); 1437 VMCOREINFO_SYMBOL(node_online_map);
1467#ifdef CONFIG_MMU
1468 VMCOREINFO_SYMBOL(swapper_pg_dir); 1438 VMCOREINFO_SYMBOL(swapper_pg_dir);
1469#endif
1470 VMCOREINFO_SYMBOL(_stext); 1439 VMCOREINFO_SYMBOL(_stext);
1471 VMCOREINFO_SYMBOL(vmlist); 1440 VMCOREINFO_SYMBOL(vmlist);
1472 1441
@@ -1514,7 +1483,6 @@ static int __init crash_save_vmcoreinfo_init(void)
1514 VMCOREINFO_NUMBER(PG_swapcache); 1483 VMCOREINFO_NUMBER(PG_swapcache);
1515 1484
1516 arch_crash_save_vmcoreinfo(); 1485 arch_crash_save_vmcoreinfo();
1517 update_vmcoreinfo_note();
1518 1486
1519 return 0; 1487 return 0;
1520} 1488}
@@ -1538,7 +1506,7 @@ int kernel_kexec(void)
1538 1506
1539#ifdef CONFIG_KEXEC_JUMP 1507#ifdef CONFIG_KEXEC_JUMP
1540 if (kexec_image->preserve_context) { 1508 if (kexec_image->preserve_context) {
1541 lock_system_sleep(); 1509 mutex_lock(&pm_mutex);
1542 pm_prepare_console(); 1510 pm_prepare_console();
1543 error = freeze_processes(); 1511 error = freeze_processes();
1544 if (error) { 1512 if (error) {
@@ -1550,13 +1518,13 @@ int kernel_kexec(void)
1550 if (error) 1518 if (error)
1551 goto Resume_console; 1519 goto Resume_console;
1552 /* At this point, dpm_suspend_start() has been called, 1520 /* At this point, dpm_suspend_start() has been called,
1553 * but *not* dpm_suspend_end(). We *must* call 1521 * but *not* dpm_suspend_noirq(). We *must* call
1554 * dpm_suspend_end() now. Otherwise, drivers for 1522 * dpm_suspend_noirq() now. Otherwise, drivers for
1555 * some devices (e.g. interrupt controllers) become 1523 * some devices (e.g. interrupt controllers) become
1556 * desynchronized with the actual state of the 1524 * desynchronized with the actual state of the
1557 * hardware at resume time, and evil weirdness ensues. 1525 * hardware at resume time, and evil weirdness ensues.
1558 */ 1526 */
1559 error = dpm_suspend_end(PMSG_FREEZE); 1527 error = dpm_suspend_noirq(PMSG_FREEZE);
1560 if (error) 1528 if (error)
1561 goto Resume_devices; 1529 goto Resume_devices;
1562 error = disable_nonboot_cpus(); 1530 error = disable_nonboot_cpus();
@@ -1583,7 +1551,7 @@ int kernel_kexec(void)
1583 local_irq_enable(); 1551 local_irq_enable();
1584 Enable_cpus: 1552 Enable_cpus:
1585 enable_nonboot_cpus(); 1553 enable_nonboot_cpus();
1586 dpm_resume_start(PMSG_RESTORE); 1554 dpm_resume_noirq(PMSG_RESTORE);
1587 Resume_devices: 1555 Resume_devices:
1588 dpm_resume_end(PMSG_RESTORE); 1556 dpm_resume_end(PMSG_RESTORE);
1589 Resume_console: 1557 Resume_console:
@@ -1591,7 +1559,7 @@ int kernel_kexec(void)
1591 thaw_processes(); 1559 thaw_processes();
1592 Restore_console: 1560 Restore_console:
1593 pm_restore_console(); 1561 pm_restore_console();
1594 unlock_system_sleep(); 1562 mutex_unlock(&pm_mutex);
1595 } 1563 }
1596#endif 1564#endif
1597 1565
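The crash_shrink_memory() changes above (rounding to PAGE_SIZE rather than KEXEC_CRASH_MEM_ALIGN, and no longer re-inserting a "System RAM" resource) are normally exercised from userspace through the /sys/kernel/kexec_crash_size attribute provided by kernel/ksysfs.c; the parse_crashkernel_simple() hunk likewise drops the warning about trailing characters after crashkernel=size[@offset]. A hedged sketch of the sysfs path (editor's illustration; reading needs no privileges, writing requires root and permanently shrinks the reservation until reboot):

    #include <stdio.h>

    int main(void)
    {
            unsigned long size = 0;
            FILE *f = fopen("/sys/kernel/kexec_crash_size", "r+");

            if (!f) {
                    perror("kexec_crash_size");
                    return 1;
            }
            if (fscanf(f, "%lu", &size) != 1) {
                    fclose(f);
                    return 1;
            }
            printf("crashkernel reservation: %lu bytes\n", size);

            /* uncomment to actually shrink the reservation to half its size */
            /* rewind(f); fprintf(f, "%lu\n", size / 2); */

            fclose(f);
            return 0;
    }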
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 59dcf5b81d2..01a0700e873 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/export.h> 23#include <linux/module.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/log2.h> 26#include <linux/log2.h>
@@ -402,7 +402,6 @@ unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
402 return max; 402 return max;
403 return len; 403 return len;
404} 404}
405EXPORT_SYMBOL(__kfifo_max_r);
406 405
407#define __KFIFO_PEEK(data, out, mask) \ 406#define __KFIFO_PEEK(data, out, mask) \
408 ((data)[(out) & (mask)]) 407 ((data)[(out) & (mask)])
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0023a87e8de..a4bea97c75b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,8 +36,6 @@
36#include <linux/resource.h> 36#include <linux/resource.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h>
40#include <linux/ptrace.h>
41#include <asm/uaccess.h> 39#include <asm/uaccess.h>
42 40
43#include <trace/events/module.h> 41#include <trace/events/module.h>
@@ -46,20 +44,12 @@ extern int max_threads;
46 44
47static struct workqueue_struct *khelper_wq; 45static struct workqueue_struct *khelper_wq;
48 46
49/*
50 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
51 * locking to protect this global - it is private to the singleton khelper
52 * thread and should only ever be modified by that thread.
53 */
54static const struct task_struct *kmod_thread_locker;
55
56#define CAP_BSET (void *)1 47#define CAP_BSET (void *)1
57#define CAP_PI (void *)2 48#define CAP_PI (void *)2
58 49
59static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; 50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
60static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; 51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
61static DEFINE_SPINLOCK(umh_sysctl_lock); 52static DEFINE_SPINLOCK(umh_sysctl_lock);
62static DECLARE_RWSEM(umhelper_sem);
63 53
64#ifdef CONFIG_MODULES 54#ifdef CONFIG_MODULES
65 55
@@ -68,43 +58,6 @@ static DECLARE_RWSEM(umhelper_sem);
68*/ 58*/
69char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 59char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
70 60
71static void free_modprobe_argv(struct subprocess_info *info)
72{
73 kfree(info->argv[3]); /* check call_modprobe() */
74 kfree(info->argv);
75}
76
77static int call_modprobe(char *module_name, int wait)
78{
79 static char *envp[] = {
80 "HOME=/",
81 "TERM=linux",
82 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
83 NULL
84 };
85
86 char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
87 if (!argv)
88 goto out;
89
90 module_name = kstrdup(module_name, GFP_KERNEL);
91 if (!module_name)
92 goto free_argv;
93
94 argv[0] = modprobe_path;
95 argv[1] = "-q";
96 argv[2] = "--";
97 argv[3] = module_name; /* check free_modprobe_argv() */
98 argv[4] = NULL;
99
100 return call_usermodehelper_fns(modprobe_path, argv, envp,
101 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
102free_argv:
103 kfree(argv);
104out:
105 return -ENOMEM;
106}
107
108/** 61/**
109 * __request_module - try to load a kernel module 62 * __request_module - try to load a kernel module
110 * @wait: wait (or not) for the operation to complete 63 * @wait: wait (or not) for the operation to complete
@@ -126,6 +79,11 @@ int __request_module(bool wait, const char *fmt, ...)
126 char module_name[MODULE_NAME_LEN]; 79 char module_name[MODULE_NAME_LEN];
127 unsigned int max_modprobes; 80 unsigned int max_modprobes;
128 int ret; 81 int ret;
82 char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
83 static char *envp[] = { "HOME=/",
84 "TERM=linux",
85 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
86 NULL };
129 static atomic_t kmod_concurrent = ATOMIC_INIT(0); 87 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
130#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 88#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
131 static int kmod_loop_msg; 89 static int kmod_loop_msg;
@@ -168,7 +126,9 @@ int __request_module(bool wait, const char *fmt, ...)
168 126
169 trace_module_request(module_name, wait, _RET_IP_); 127 trace_module_request(module_name, wait, _RET_IP_);
170 128
171 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 129 ret = call_usermodehelper_fns(modprobe_path, argv, envp,
130 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
131 NULL, NULL, NULL);
172 132
173 atomic_dec(&kmod_concurrent); 133 atomic_dec(&kmod_concurrent);
174 return ret; 134 return ret;
@@ -219,11 +179,9 @@ static int ____call_usermodehelper(void *data)
219 179
220 commit_creds(new); 180 commit_creds(new);
221 181
222 retval = do_execve(sub_info->path, 182 retval = kernel_execve(sub_info->path,
223 (const char __user *const __user *)sub_info->argv, 183 (const char *const *)sub_info->argv,
224 (const char __user *const __user *)sub_info->envp); 184 (const char *const *)sub_info->envp);
225 if (!retval)
226 return 0;
227 185
228 /* Exec failed? */ 186 /* Exec failed? */
229fail: 187fail:
@@ -231,32 +189,13 @@ fail:
231 do_exit(0); 189 do_exit(0);
232} 190}
233 191
234static int call_helper(void *data) 192void call_usermodehelper_freeinfo(struct subprocess_info *info)
235{
236 /* Worker thread started blocking khelper thread. */
237 kmod_thread_locker = current;
238 return ____call_usermodehelper(data);
239}
240
241static void call_usermodehelper_freeinfo(struct subprocess_info *info)
242{ 193{
243 if (info->cleanup) 194 if (info->cleanup)
244 (*info->cleanup)(info); 195 (*info->cleanup)(info);
245 kfree(info); 196 kfree(info);
246} 197}
247 198EXPORT_SYMBOL(call_usermodehelper_freeinfo);
248static void umh_complete(struct subprocess_info *sub_info)
249{
250 struct completion *comp = xchg(&sub_info->complete, NULL);
251 /*
252 * See call_usermodehelper_exec(). If xchg() returns NULL
253 * we own sub_info, the UMH_KILLABLE caller has gone away.
254 */
255 if (comp)
256 complete(comp);
257 else
258 call_usermodehelper_freeinfo(sub_info);
259}
260 199
261/* Keventd can't block, but this (a child) can. */ 200/* Keventd can't block, but this (a child) can. */
262static int wait_for_helper(void *data) 201static int wait_for_helper(void *data)
@@ -294,8 +233,8 @@ static int wait_for_helper(void *data)
294 sub_info->retval = ret; 233 sub_info->retval = ret;
295 } 234 }
296 235
297 umh_complete(sub_info); 236 complete(sub_info->complete);
298 do_exit(0); 237 return 0;
299} 238}
300 239
301/* This is run by khelper thread */ 240/* This is run by khelper thread */
@@ -303,7 +242,7 @@ static void __call_usermodehelper(struct work_struct *work)
303{ 242{
304 struct subprocess_info *sub_info = 243 struct subprocess_info *sub_info =
305 container_of(work, struct subprocess_info, work); 244 container_of(work, struct subprocess_info, work);
306 int wait = sub_info->wait & ~UMH_KILLABLE; 245 enum umh_wait wait = sub_info->wait;
307 pid_t pid; 246 pid_t pid;
308 247
309 /* CLONE_VFORK: wait until the usermode helper has execve'd 248 /* CLONE_VFORK: wait until the usermode helper has execve'd
@@ -312,12 +251,9 @@ static void __call_usermodehelper(struct work_struct *work)
312 if (wait == UMH_WAIT_PROC) 251 if (wait == UMH_WAIT_PROC)
313 pid = kernel_thread(wait_for_helper, sub_info, 252 pid = kernel_thread(wait_for_helper, sub_info,
314 CLONE_FS | CLONE_FILES | SIGCHLD); 253 CLONE_FS | CLONE_FILES | SIGCHLD);
315 else { 254 else
316 pid = kernel_thread(call_helper, sub_info, 255 pid = kernel_thread(____call_usermodehelper, sub_info,
317 CLONE_VFORK | SIGCHLD); 256 CLONE_VFORK | SIGCHLD);
318 /* Worker thread stopped blocking khelper thread. */
319 kmod_thread_locker = NULL;
320 }
321 257
322 switch (wait) { 258 switch (wait) {
323 case UMH_NO_WAIT: 259 case UMH_NO_WAIT:
@@ -331,7 +267,7 @@ static void __call_usermodehelper(struct work_struct *work)
331 case UMH_WAIT_EXEC: 267 case UMH_WAIT_EXEC:
332 if (pid < 0) 268 if (pid < 0)
333 sub_info->retval = pid; 269 sub_info->retval = pid;
334 umh_complete(sub_info); 270 complete(sub_info->complete);
335 } 271 }
336} 272}
337 273
@@ -339,126 +275,33 @@ static void __call_usermodehelper(struct work_struct *work)
339 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 275 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
340 * (used for preventing user land processes from being created after the user 276 * (used for preventing user land processes from being created after the user
341 * land has been frozen during a system-wide hibernation or suspend operation). 277 * land has been frozen during a system-wide hibernation or suspend operation).
342 * Should always be manipulated under umhelper_sem acquired for write.
343 */ 278 */
344static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; 279static int usermodehelper_disabled = 1;
345 280
346/* Number of helpers running */ 281/* Number of helpers running */
347static atomic_t running_helpers = ATOMIC_INIT(0); 282static atomic_t running_helpers = ATOMIC_INIT(0);
348 283
349/* 284/*
350 * Wait queue head used by usermodehelper_disable() to wait for all running 285 * Wait queue head used by usermodehelper_pm_callback() to wait for all running
351 * helpers to finish. 286 * helpers to finish.
352 */ 287 */
353static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); 288static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
354 289
355/* 290/*
356 * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
357 * to become 'false'.
358 */
359static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
360
361/*
362 * Time to wait for running_helpers to become zero before the setting of 291 * Time to wait for running_helpers to become zero before the setting of
363 * usermodehelper_disabled in usermodehelper_disable() fails 292 * usermodehelper_disabled in usermodehelper_pm_callback() fails
364 */ 293 */
365#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 294#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
366 295
367int usermodehelper_read_trylock(void)
368{
369 DEFINE_WAIT(wait);
370 int ret = 0;
371
372 down_read(&umhelper_sem);
373 for (;;) {
374 prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
375 TASK_INTERRUPTIBLE);
376 if (!usermodehelper_disabled)
377 break;
378
379 if (usermodehelper_disabled == UMH_DISABLED)
380 ret = -EAGAIN;
381
382 up_read(&umhelper_sem);
383
384 if (ret)
385 break;
386
387 schedule();
388 try_to_freeze();
389
390 down_read(&umhelper_sem);
391 }
392 finish_wait(&usermodehelper_disabled_waitq, &wait);
393 return ret;
394}
395EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
396
397long usermodehelper_read_lock_wait(long timeout)
398{
399 DEFINE_WAIT(wait);
400
401 if (timeout < 0)
402 return -EINVAL;
403
404 down_read(&umhelper_sem);
405 for (;;) {
406 prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
407 TASK_UNINTERRUPTIBLE);
408 if (!usermodehelper_disabled)
409 break;
410
411 up_read(&umhelper_sem);
412
413 timeout = schedule_timeout(timeout);
414 if (!timeout)
415 break;
416
417 down_read(&umhelper_sem);
418 }
419 finish_wait(&usermodehelper_disabled_waitq, &wait);
420 return timeout;
421}
422EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
423
424void usermodehelper_read_unlock(void)
425{
426 up_read(&umhelper_sem);
427}
428EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
429
430/** 296/**
431 * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. 297 * usermodehelper_disable - prevent new helpers from being started
432 * @depth: New value to assign to usermodehelper_disabled.
433 *
434 * Change the value of usermodehelper_disabled (under umhelper_sem locked for
435 * writing) and wakeup tasks waiting for it to change.
436 */ 298 */
437void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) 299int usermodehelper_disable(void)
438{
439 down_write(&umhelper_sem);
440 usermodehelper_disabled = depth;
441 wake_up(&usermodehelper_disabled_waitq);
442 up_write(&umhelper_sem);
443}
444
445/**
446 * __usermodehelper_disable - Prevent new helpers from being started.
447 * @depth: New value to assign to usermodehelper_disabled.
448 *
449 * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
450 */
451int __usermodehelper_disable(enum umh_disable_depth depth)
452{ 300{
453 long retval; 301 long retval;
454 302
455 if (!depth) 303 usermodehelper_disabled = 1;
456 return -EINVAL; 304 smp_mb();
457
458 down_write(&umhelper_sem);
459 usermodehelper_disabled = depth;
460 up_write(&umhelper_sem);
461
462 /* 305 /*
463 * From now on call_usermodehelper_exec() won't start any new 306 * From now on call_usermodehelper_exec() won't start any new
464 * helpers, so it is sufficient if running_helpers turns out to 307 * helpers, so it is sufficient if running_helpers turns out to
@@ -471,10 +314,27 @@ int __usermodehelper_disable(enum umh_disable_depth depth)
471 if (retval) 314 if (retval)
472 return 0; 315 return 0;
473 316
474 __usermodehelper_set_disable_depth(UMH_ENABLED); 317 usermodehelper_disabled = 0;
475 return -EAGAIN; 318 return -EAGAIN;
476} 319}
477 320
321/**
322 * usermodehelper_enable - allow new helpers to be started again
323 */
324void usermodehelper_enable(void)
325{
326 usermodehelper_disabled = 0;
327}
328
329/**
330 * usermodehelper_is_disabled - check if new helpers are allowed to be started
331 */
332bool usermodehelper_is_disabled(void)
333{
334 return usermodehelper_disabled;
335}
336EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
337
478static void helper_lock(void) 338static void helper_lock(void)
479{ 339{
480 atomic_inc(&running_helpers); 340 atomic_inc(&running_helpers);
@@ -498,7 +358,6 @@ static void helper_unlock(void)
498 * structure. This should be passed to call_usermodehelper_exec to 358 * structure. This should be passed to call_usermodehelper_exec to
499 * exec the process and free the structure. 359 * exec the process and free the structure.
500 */ 360 */
501static
502struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, 361struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
503 char **envp, gfp_t gfp_mask) 362 char **envp, gfp_t gfp_mask)
504{ 363{
@@ -514,6 +373,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
514 out: 373 out:
515 return sub_info; 374 return sub_info;
516} 375}
376EXPORT_SYMBOL(call_usermodehelper_setup);
517 377
518/** 378/**
519 * call_usermodehelper_setfns - set a cleanup/init function 379 * call_usermodehelper_setfns - set a cleanup/init function
@@ -531,7 +391,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
531 * Function must be runnable in either a process context or the 391 * Function must be runnable in either a process context or the
532 * context in which call_usermodehelper_exec is called. 392 * context in which call_usermodehelper_exec is called.
533 */ 393 */
534static
535void call_usermodehelper_setfns(struct subprocess_info *info, 394void call_usermodehelper_setfns(struct subprocess_info *info,
536 int (*init)(struct subprocess_info *info, struct cred *new), 395 int (*init)(struct subprocess_info *info, struct cred *new),
537 void (*cleanup)(struct subprocess_info *info), 396 void (*cleanup)(struct subprocess_info *info),
@@ -541,6 +400,7 @@ void call_usermodehelper_setfns(struct subprocess_info *info,
541 info->init = init; 400 info->init = init;
542 info->data = data; 401 info->data = data;
543} 402}
403EXPORT_SYMBOL(call_usermodehelper_setfns);
544 404
545/** 405/**
546 * call_usermodehelper_exec - start a usermode application 406 * call_usermodehelper_exec - start a usermode application
@@ -554,8 +414,8 @@ void call_usermodehelper_setfns(struct subprocess_info *info,
554 * asynchronously if wait is not set, and runs as a child of keventd. 414 * asynchronously if wait is not set, and runs as a child of keventd.
555 * (ie. it runs with full root capabilities). 415 * (ie. it runs with full root capabilities).
556 */ 416 */
557static 417int call_usermodehelper_exec(struct subprocess_info *sub_info,
558int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 418 enum umh_wait wait)
559{ 419{
560 DECLARE_COMPLETION_ONSTACK(done); 420 DECLARE_COMPLETION_ONSTACK(done);
561 int retval = 0; 421 int retval = 0;
@@ -568,16 +428,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
568 retval = -EBUSY; 428 retval = -EBUSY;
569 goto out; 429 goto out;
570 } 430 }
571 /*
572 * Worker thread must not wait for khelper thread at below
573 * wait_for_completion() if the thread was created with CLONE_VFORK
574 * flag, for khelper thread is already waiting for the thread at
575 * wait_for_completion() in do_fork().
576 */
577 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
578 retval = -EBUSY;
579 goto out;
580 }
581 431
582 sub_info->complete = &done; 432 sub_info->complete = &done;
583 sub_info->wait = wait; 433 sub_info->wait = wait;
@@ -585,52 +435,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
585 queue_work(khelper_wq, &sub_info->work); 435 queue_work(khelper_wq, &sub_info->work);
586 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 436 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
587 goto unlock; 437 goto unlock;
588
589 if (wait & UMH_KILLABLE) {
590 retval = wait_for_completion_killable(&done);
591 if (!retval)
592 goto wait_done;
593
594 /* umh_complete() will see NULL and free sub_info */
595 if (xchg(&sub_info->complete, NULL))
596 goto unlock;
597 /* fallthrough, umh_complete() was already called */
598 }
599
600 wait_for_completion(&done); 438 wait_for_completion(&done);
601wait_done:
602 retval = sub_info->retval; 439 retval = sub_info->retval;
440
603out: 441out:
604 call_usermodehelper_freeinfo(sub_info); 442 call_usermodehelper_freeinfo(sub_info);
605unlock: 443unlock:
606 helper_unlock(); 444 helper_unlock();
607 return retval; 445 return retval;
608} 446}
609 447EXPORT_SYMBOL(call_usermodehelper_exec);
610/*
611 * call_usermodehelper_fns() will not run the caller-provided cleanup function
612 * if a memory allocation failure is experienced. So the caller might need to
613 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
614 * the necessary cleanup within the caller.
615 */
616int call_usermodehelper_fns(
617 char *path, char **argv, char **envp, int wait,
618 int (*init)(struct subprocess_info *info, struct cred *new),
619 void (*cleanup)(struct subprocess_info *), void *data)
620{
621 struct subprocess_info *info;
622 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
623
624 info = call_usermodehelper_setup(path, argv, envp, gfp_mask);
625
626 if (info == NULL)
627 return -ENOMEM;
628
629 call_usermodehelper_setfns(info, init, cleanup, data);
630
631 return call_usermodehelper_exec(info, wait);
632}
633EXPORT_SYMBOL(call_usermodehelper_fns);
634 448
635static int proc_cap_handler(struct ctl_table *table, int write, 449static int proc_cap_handler(struct ctl_table *table, int write,
636 void __user *buffer, size_t *lenp, loff_t *ppos) 450 void __user *buffer, size_t *lenp, loff_t *ppos)
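This file now exports the three-step usermodehelper API (setup, setfns, exec) again, together with enum umh_wait. Most in-kernel callers go through the call_usermodehelper() convenience wrapper; a hedged sketch of that pattern follows (editor's illustration; the helper path and its arguments are invented):

    #include <linux/kernel.h>
    #include <linux/kmod.h>

    static int run_helper(void)
    {
            char *argv[] = { "/sbin/example-helper", "--reason", "thermal", NULL };
            static char *envp[] = {
                    "HOME=/",
                    "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
                    NULL
            };
            int ret;

            /* UMH_WAIT_EXEC: return once the helper has exec'd (or failed to) */
            ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
            if (ret)
                    pr_warning("usermode helper failed: %d\n", ret);
            return ret;
    }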
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396aa40..b30fd54eb98 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -36,7 +36,7 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/stddef.h> 38#include <linux/stddef.h>
39#include <linux/export.h> 39#include <linux/module.h>
40#include <linux/moduleloader.h> 40#include <linux/moduleloader.h>
41#include <linux/kallsyms.h> 41#include <linux/kallsyms.h>
42#include <linux/freezer.h> 42#include <linux/freezer.h>
@@ -78,10 +78,10 @@ static bool kprobes_all_disarmed;
78static DEFINE_MUTEX(kprobe_mutex); 78static DEFINE_MUTEX(kprobe_mutex);
79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
80static struct { 80static struct {
81 raw_spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
82} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 82} kretprobe_table_locks[KPROBE_TABLE_SIZE];
83 83
84static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 84static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
85{ 85{
86 return &(kretprobe_table_locks[hash].lock); 86 return &(kretprobe_table_locks[hash].lock);
87} 87}
@@ -561,9 +561,9 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
561{ 561{
562 LIST_HEAD(free_list); 562 LIST_HEAD(free_list);
563 563
564 mutex_lock(&kprobe_mutex);
565 /* Lock modules while optimizing kprobes */ 564 /* Lock modules while optimizing kprobes */
566 mutex_lock(&module_mutex); 565 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567 567
568 /* 568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) 569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
@@ -586,8 +586,8 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
586 /* Step 4: Free cleaned kprobes after quiescence period */ 586 /* Step 4: Free cleaned kprobes after quiescence period */
587 do_free_cleaned_kprobes(&free_list); 587 do_free_cleaned_kprobes(&free_list);
588 588
589 mutex_unlock(&module_mutex);
590 mutex_unlock(&kprobe_mutex); 589 mutex_unlock(&kprobe_mutex);
590 mutex_unlock(&module_mutex);
591 591
592 /* Step 5: Kick optimizer again if needed */ 592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) 593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
@@ -759,32 +759,20 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
759 struct kprobe *ap; 759 struct kprobe *ap;
760 struct optimized_kprobe *op; 760 struct optimized_kprobe *op;
761 761
762 /* Impossible to optimize ftrace-based kprobe */
763 if (kprobe_ftrace(p))
764 return;
765
766 /* For preparing optimization, jump_label_text_reserved() is called */
767 jump_label_lock();
768 mutex_lock(&text_mutex);
769
770 ap = alloc_aggr_kprobe(p); 762 ap = alloc_aggr_kprobe(p);
771 if (!ap) 763 if (!ap)
772 goto out; 764 return;
773 765
774 op = container_of(ap, struct optimized_kprobe, kp); 766 op = container_of(ap, struct optimized_kprobe, kp);
775 if (!arch_prepared_optinsn(&op->optinsn)) { 767 if (!arch_prepared_optinsn(&op->optinsn)) {
776 /* If failed to setup optimizing, fallback to kprobe */ 768 /* If failed to setup optimizing, fallback to kprobe */
777 arch_remove_optimized_kprobe(op); 769 arch_remove_optimized_kprobe(op);
778 kfree(op); 770 kfree(op);
779 goto out; 771 return;
780 } 772 }
781 773
782 init_aggr_kprobe(ap, p); 774 init_aggr_kprobe(ap, p);
783 optimize_kprobe(ap); /* This just kicks optimizer thread */ 775 optimize_kprobe(ap);
784
785out:
786 mutex_unlock(&text_mutex);
787 jump_label_unlock();
788} 776}
789 777
790#ifdef CONFIG_SYSCTL 778#ifdef CONFIG_SYSCTL
@@ -919,64 +907,9 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
919} 907}
920#endif /* CONFIG_OPTPROBES */ 908#endif /* CONFIG_OPTPROBES */
921 909
922#ifdef KPROBES_CAN_USE_FTRACE
923static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
924 .func = kprobe_ftrace_handler,
925 .flags = FTRACE_OPS_FL_SAVE_REGS,
926};
927static int kprobe_ftrace_enabled;
928
929/* Must ensure p->addr is really on ftrace */
930static int __kprobes prepare_kprobe(struct kprobe *p)
931{
932 if (!kprobe_ftrace(p))
933 return arch_prepare_kprobe(p);
934
935 return arch_prepare_kprobe_ftrace(p);
936}
937
938/* Caller must lock kprobe_mutex */
939static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
940{
941 int ret;
942
943 ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
944 (unsigned long)p->addr, 0, 0);
945 WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
946 kprobe_ftrace_enabled++;
947 if (kprobe_ftrace_enabled == 1) {
948 ret = register_ftrace_function(&kprobe_ftrace_ops);
949 WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
950 }
951}
952
953/* Caller must lock kprobe_mutex */
954static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
955{
956 int ret;
957
958 kprobe_ftrace_enabled--;
959 if (kprobe_ftrace_enabled == 0) {
960 ret = unregister_ftrace_function(&kprobe_ftrace_ops);
961 WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
962 }
963 ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
964 (unsigned long)p->addr, 1, 0);
965 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
966}
967#else /* !KPROBES_CAN_USE_FTRACE */
968#define prepare_kprobe(p) arch_prepare_kprobe(p)
969#define arm_kprobe_ftrace(p) do {} while (0)
970#define disarm_kprobe_ftrace(p) do {} while (0)
971#endif
972
973/* Arm a kprobe with text_mutex */ 910/* Arm a kprobe with text_mutex */
974static void __kprobes arm_kprobe(struct kprobe *kp) 911static void __kprobes arm_kprobe(struct kprobe *kp)
975{ 912{
976 if (unlikely(kprobe_ftrace(kp))) {
977 arm_kprobe_ftrace(kp);
978 return;
979 }
980 /* 913 /*
981 * Here, since __arm_kprobe() doesn't use stop_machine(), 914 * Here, since __arm_kprobe() doesn't use stop_machine(),
982 * this doesn't cause deadlock on text_mutex. So, we don't 915 * this doesn't cause deadlock on text_mutex. So, we don't
@@ -988,15 +921,11 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
988} 921}
989 922
990/* Disarm a kprobe with text_mutex */ 923/* Disarm a kprobe with text_mutex */
991static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) 924static void __kprobes disarm_kprobe(struct kprobe *kp)
992{ 925{
993 if (unlikely(kprobe_ftrace(kp))) {
994 disarm_kprobe_ftrace(kp);
995 return;
996 }
997 /* Ditto */ 926 /* Ditto */
998 mutex_lock(&text_mutex); 927 mutex_lock(&text_mutex);
999 __disarm_kprobe(kp, reopt); 928 __disarm_kprobe(kp, true);
1000 mutex_unlock(&text_mutex); 929 mutex_unlock(&text_mutex);
1001} 930}
1002 931
@@ -1084,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
1084 hlist_del(&ri->hlist); 1013 hlist_del(&ri->hlist);
1085 INIT_HLIST_NODE(&ri->hlist); 1014 INIT_HLIST_NODE(&ri->hlist);
1086 if (likely(rp)) { 1015 if (likely(rp)) {
1087 raw_spin_lock(&rp->lock); 1016 spin_lock(&rp->lock);
1088 hlist_add_head(&ri->hlist, &rp->free_instances); 1017 hlist_add_head(&ri->hlist, &rp->free_instances);
1089 raw_spin_unlock(&rp->lock); 1018 spin_unlock(&rp->lock);
1090 } else 1019 } else
1091 /* Unregistering */ 1020 /* Unregistering */
1092 hlist_add_head(&ri->hlist, head); 1021 hlist_add_head(&ri->hlist, head);
@@ -1097,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
1097__acquires(hlist_lock) 1026__acquires(hlist_lock)
1098{ 1027{
1099 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1100 raw_spinlock_t *hlist_lock; 1029 spinlock_t *hlist_lock;
1101 1030
1102 *head = &kretprobe_inst_table[hash]; 1031 *head = &kretprobe_inst_table[hash];
1103 hlist_lock = kretprobe_table_lock_ptr(hash); 1032 hlist_lock = kretprobe_table_lock_ptr(hash);
1104 raw_spin_lock_irqsave(hlist_lock, *flags); 1033 spin_lock_irqsave(hlist_lock, *flags);
1105} 1034}
1106 1035
1107static void __kprobes kretprobe_table_lock(unsigned long hash, 1036static void __kprobes kretprobe_table_lock(unsigned long hash,
1108 unsigned long *flags) 1037 unsigned long *flags)
1109__acquires(hlist_lock) 1038__acquires(hlist_lock)
1110{ 1039{
1111 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1040 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1112 raw_spin_lock_irqsave(hlist_lock, *flags); 1041 spin_lock_irqsave(hlist_lock, *flags);
1113} 1042}
1114 1043
1115void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
@@ -1117,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
1117__releases(hlist_lock) 1046__releases(hlist_lock)
1118{ 1047{
1119 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1120 raw_spinlock_t *hlist_lock; 1049 spinlock_t *hlist_lock;
1121 1050
1122 hlist_lock = kretprobe_table_lock_ptr(hash); 1051 hlist_lock = kretprobe_table_lock_ptr(hash);
1123 raw_spin_unlock_irqrestore(hlist_lock, *flags); 1052 spin_unlock_irqrestore(hlist_lock, *flags);
1124} 1053}
1125 1054
1126static void __kprobes kretprobe_table_unlock(unsigned long hash, 1055static void __kprobes kretprobe_table_unlock(unsigned long hash,
1127 unsigned long *flags) 1056 unsigned long *flags)
1128__releases(hlist_lock) 1057__releases(hlist_lock)
1129{ 1058{
1130 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1059 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1131 raw_spin_unlock_irqrestore(hlist_lock, *flags); 1060 spin_unlock_irqrestore(hlist_lock, *flags);
1132} 1061}
1133 1062
1134/* 1063/*
@@ -1148,7 +1077,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1148 /* Early boot. kretprobe_table_locks not yet initialized. */ 1077 /* Early boot. kretprobe_table_locks not yet initialized. */
1149 return; 1078 return;
1150 1079
1151 INIT_HLIST_HEAD(&empty_rp);
1152 hash = hash_ptr(tk, KPROBE_HASH_BITS); 1080 hash = hash_ptr(tk, KPROBE_HASH_BITS);
1153 head = &kretprobe_inst_table[hash]; 1081 head = &kretprobe_inst_table[hash];
1154 kretprobe_table_lock(hash, &flags); 1082 kretprobe_table_lock(hash, &flags);
@@ -1157,6 +1085,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1157 recycle_rp_inst(ri, &empty_rp); 1085 recycle_rp_inst(ri, &empty_rp);
1158 } 1086 }
1159 kretprobe_table_unlock(hash, &flags); 1087 kretprobe_table_unlock(hash, &flags);
1088 INIT_HLIST_HEAD(&empty_rp);
1160 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 1089 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
1161 hlist_del(&ri->hlist); 1090 hlist_del(&ri->hlist);
1162 kfree(ri); 1091 kfree(ri);
@@ -1215,6 +1144,12 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
1215 if (p->post_handler && !ap->post_handler) 1144 if (p->post_handler && !ap->post_handler)
1216 ap->post_handler = aggr_post_handler; 1145 ap->post_handler = aggr_post_handler;
1217 1146
1147 if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
1148 ap->flags &= ~KPROBE_FLAG_DISABLED;
1149 if (!kprobes_all_disarmed)
1150 /* Arm the breakpoint again. */
1151 __arm_kprobe(ap);
1152 }
1218 return 0; 1153 return 0;
1219} 1154}
1220 1155
@@ -1254,22 +1189,11 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
1254 int ret = 0; 1189 int ret = 0;
1255 struct kprobe *ap = orig_p; 1190 struct kprobe *ap = orig_p;
1256 1191
1257 /* For preparing optimization, jump_label_text_reserved() is called */
1258 jump_label_lock();
1259 /*
1260 * Get online CPUs to avoid text_mutex deadlock.with stop machine,
1261 * which is invoked by unoptimize_kprobe() in add_new_kprobe()
1262 */
1263 get_online_cpus();
1264 mutex_lock(&text_mutex);
1265
1266 if (!kprobe_aggrprobe(orig_p)) { 1192 if (!kprobe_aggrprobe(orig_p)) {
1267 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ 1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
1268 ap = alloc_aggr_kprobe(orig_p); 1194 ap = alloc_aggr_kprobe(orig_p);
1269 if (!ap) { 1195 if (!ap)
1270 ret = -ENOMEM; 1196 return -ENOMEM;
1271 goto out;
1272 }
1273 init_aggr_kprobe(ap, orig_p); 1197 init_aggr_kprobe(ap, orig_p);
1274 } else if (kprobe_unused(ap)) 1198 } else if (kprobe_unused(ap))
1275 /* This probe is going to die. Rescue it */ 1199 /* This probe is going to die. Rescue it */
@@ -1289,7 +1213,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
1289 * free aggr_probe. It will be used next time, or 1213 * free aggr_probe. It will be used next time, or
1290 * freed by unregister_kprobe. 1214 * freed by unregister_kprobe.
1291 */ 1215 */
1292 goto out; 1216 return ret;
1293 1217
1294 /* Prepare optimized instructions if possible. */ 1218 /* Prepare optimized instructions if possible. */
1295 prepare_optimized_kprobe(ap); 1219 prepare_optimized_kprobe(ap);
@@ -1304,20 +1228,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
1304 1228
1305 /* Copy ap's insn slot to p */ 1229 /* Copy ap's insn slot to p */
1306 copy_kprobe(ap, p); 1230 copy_kprobe(ap, p);
1307 ret = add_new_kprobe(ap, p); 1231 return add_new_kprobe(ap, p);
1308
1309out:
1310 mutex_unlock(&text_mutex);
1311 put_online_cpus();
1312 jump_label_unlock();
1313
1314 if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
1315 ap->flags &= ~KPROBE_FLAG_DISABLED;
1316 if (!kprobes_all_disarmed)
1317 /* Arm the breakpoint again. */
1318 arm_kprobe(ap);
1319 }
1320 return ret;
1321} 1232}
1322 1233
1323static int __kprobes in_kprobes_functions(unsigned long addr) 1234static int __kprobes in_kprobes_functions(unsigned long addr)
@@ -1402,96 +1313,69 @@ static inline int check_kprobe_rereg(struct kprobe *p)
1402 return ret; 1313 return ret;
1403} 1314}
1404 1315
1405static __kprobes int check_kprobe_address_safe(struct kprobe *p, 1316int __kprobes register_kprobe(struct kprobe *p)
1406 struct module **probed_mod)
1407{ 1317{
1408 int ret = 0; 1318 int ret = 0;
1409 unsigned long ftrace_addr; 1319 struct kprobe *old_p;
1320 struct module *probed_mod;
1321 kprobe_opcode_t *addr;
1410 1322
1411 /* 1323 addr = kprobe_addr(p);
1412 * If the address is located on a ftrace nop, set the 1324 if (IS_ERR(addr))
1413 * breakpoint to the following instruction. 1325 return PTR_ERR(addr);
1414 */ 1326 p->addr = addr;
1415 ftrace_addr = ftrace_location((unsigned long)p->addr); 1327
1416 if (ftrace_addr) { 1328 ret = check_kprobe_rereg(p);
1417#ifdef KPROBES_CAN_USE_FTRACE 1329 if (ret)
1418 /* Given address is not on the instruction boundary */ 1330 return ret;
1419 if ((unsigned long)p->addr != ftrace_addr)
1420 return -EILSEQ;
1421 p->flags |= KPROBE_FLAG_FTRACE;
1422#else /* !KPROBES_CAN_USE_FTRACE */
1423 return -EINVAL;
1424#endif
1425 }
1426 1331
1427 jump_label_lock(); 1332 jump_label_lock();
1428 preempt_disable(); 1333 preempt_disable();
1429
1430 /* Ensure it is not in reserved area nor out of text */
1431 if (!kernel_text_address((unsigned long) p->addr) || 1334 if (!kernel_text_address((unsigned long) p->addr) ||
1432 in_kprobes_functions((unsigned long) p->addr) || 1335 in_kprobes_functions((unsigned long) p->addr) ||
1433 jump_label_text_reserved(p->addr, p->addr)) { 1336 ftrace_text_reserved(p->addr, p->addr) ||
1434 ret = -EINVAL; 1337 jump_label_text_reserved(p->addr, p->addr))
1435 goto out; 1338 goto fail_with_jump_label;
1436 } 1339
1340 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1341 p->flags &= KPROBE_FLAG_DISABLED;
1437 1342
1438 /* Check if are we probing a module */ 1343 /*
1439 *probed_mod = __module_text_address((unsigned long) p->addr); 1344 * Check if are we probing a module.
1440 if (*probed_mod) { 1345 */
1346 probed_mod = __module_text_address((unsigned long) p->addr);
1347 if (probed_mod) {
1348 /* Return -ENOENT if fail. */
1349 ret = -ENOENT;
1441 /* 1350 /*
1442 * We must hold a refcount of the probed module while updating 1351 * We must hold a refcount of the probed module while updating
1443 * its code to prohibit unexpected unloading. 1352 * its code to prohibit unexpected unloading.
1444 */ 1353 */
1445 if (unlikely(!try_module_get(*probed_mod))) { 1354 if (unlikely(!try_module_get(probed_mod)))
1446 ret = -ENOENT; 1355 goto fail_with_jump_label;
1447 goto out;
1448 }
1449 1356
1450 /* 1357 /*
1451 * If the module freed .init.text, we couldn't insert 1358 * If the module freed .init.text, we couldn't insert
1452 * kprobes in there. 1359 * kprobes in there.
1453 */ 1360 */
1454 if (within_module_init((unsigned long)p->addr, *probed_mod) && 1361 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1455 (*probed_mod)->state != MODULE_STATE_COMING) { 1362 probed_mod->state != MODULE_STATE_COMING) {
1456 module_put(*probed_mod); 1363 module_put(probed_mod);
1457 *probed_mod = NULL; 1364 goto fail_with_jump_label;
1458 ret = -ENOENT;
1459 } 1365 }
1366 /* ret will be updated by following code */
1460 } 1367 }
1461out:
1462 preempt_enable(); 1368 preempt_enable();
1463 jump_label_unlock(); 1369 jump_label_unlock();
1464 1370
1465 return ret;
1466}
1467
1468int __kprobes register_kprobe(struct kprobe *p)
1469{
1470 int ret;
1471 struct kprobe *old_p;
1472 struct module *probed_mod;
1473 kprobe_opcode_t *addr;
1474
1475 /* Adjust probe address from symbol */
1476 addr = kprobe_addr(p);
1477 if (IS_ERR(addr))
1478 return PTR_ERR(addr);
1479 p->addr = addr;
1480
1481 ret = check_kprobe_rereg(p);
1482 if (ret)
1483 return ret;
1484
1485 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1486 p->flags &= KPROBE_FLAG_DISABLED;
1487 p->nmissed = 0; 1371 p->nmissed = 0;
1488 INIT_LIST_HEAD(&p->list); 1372 INIT_LIST_HEAD(&p->list);
1373 mutex_lock(&kprobe_mutex);
1489 1374
1490 ret = check_kprobe_address_safe(p, &probed_mod); 1375 jump_label_lock(); /* needed to call jump_label_text_reserved() */
1491 if (ret)
1492 return ret;
1493 1376
1494 mutex_lock(&kprobe_mutex); 1377 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1378 mutex_lock(&text_mutex);
1495 1379
1496 old_p = get_kprobe(p->addr); 1380 old_p = get_kprobe(p->addr);
1497 if (old_p) { 1381 if (old_p) {
@@ -1500,9 +1384,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1500 goto out; 1384 goto out;
1501 } 1385 }
1502 1386
1503 mutex_lock(&text_mutex); /* Avoiding text modification */ 1387 ret = arch_prepare_kprobe(p);
1504 ret = prepare_kprobe(p);
1505 mutex_unlock(&text_mutex);
1506 if (ret) 1388 if (ret)
1507 goto out; 1389 goto out;
1508 1390
@@ -1511,18 +1393,26 @@ int __kprobes register_kprobe(struct kprobe *p)
1511 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1393 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
1512 1394
1513 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1395 if (!kprobes_all_disarmed && !kprobe_disabled(p))
1514 arm_kprobe(p); 1396 __arm_kprobe(p);
1515 1397
1516 /* Try to optimize kprobe */ 1398 /* Try to optimize kprobe */
1517 try_to_optimize_kprobe(p); 1399 try_to_optimize_kprobe(p);
1518 1400
1519out: 1401out:
1402 mutex_unlock(&text_mutex);
1403 put_online_cpus();
1404 jump_label_unlock();
1520 mutex_unlock(&kprobe_mutex); 1405 mutex_unlock(&kprobe_mutex);
1521 1406
1522 if (probed_mod) 1407 if (probed_mod)
1523 module_put(probed_mod); 1408 module_put(probed_mod);
1524 1409
1525 return ret; 1410 return ret;
1411
1412fail_with_jump_label:
1413 preempt_enable();
1414 jump_label_unlock();
1415 return ret;
1526} 1416}
1527EXPORT_SYMBOL_GPL(register_kprobe); 1417EXPORT_SYMBOL_GPL(register_kprobe);
1528 1418
@@ -1559,7 +1449,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1559 1449
1560 /* Try to disarm and disable this/parent probe */ 1450 /* Try to disarm and disable this/parent probe */
1561 if (p == orig_p || aggr_kprobe_disabled(orig_p)) { 1451 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1562 disarm_kprobe(orig_p, true); 1452 disarm_kprobe(orig_p);
1563 orig_p->flags |= KPROBE_FLAG_DISABLED; 1453 orig_p->flags |= KPROBE_FLAG_DISABLED;
1564 } 1454 }
1565 } 1455 }
@@ -1773,22 +1663,18 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1773 1663
1774 /*TODO: consider to only swap the RA after the last pre_handler fired */ 1664 /*TODO: consider to only swap the RA after the last pre_handler fired */
1775 hash = hash_ptr(current, KPROBE_HASH_BITS); 1665 hash = hash_ptr(current, KPROBE_HASH_BITS);
1776 raw_spin_lock_irqsave(&rp->lock, flags); 1666 spin_lock_irqsave(&rp->lock, flags);
1777 if (!hlist_empty(&rp->free_instances)) { 1667 if (!hlist_empty(&rp->free_instances)) {
1778 ri = hlist_entry(rp->free_instances.first, 1668 ri = hlist_entry(rp->free_instances.first,
1779 struct kretprobe_instance, hlist); 1669 struct kretprobe_instance, hlist);
1780 hlist_del(&ri->hlist); 1670 hlist_del(&ri->hlist);
1781 raw_spin_unlock_irqrestore(&rp->lock, flags); 1671 spin_unlock_irqrestore(&rp->lock, flags);
1782 1672
1783 ri->rp = rp; 1673 ri->rp = rp;
1784 ri->task = current; 1674 ri->task = current;
1785 1675
1786 if (rp->entry_handler && rp->entry_handler(ri, regs)) { 1676 if (rp->entry_handler && rp->entry_handler(ri, regs))
1787 raw_spin_lock_irqsave(&rp->lock, flags);
1788 hlist_add_head(&ri->hlist, &rp->free_instances);
1789 raw_spin_unlock_irqrestore(&rp->lock, flags);
1790 return 0; 1677 return 0;
1791 }
1792 1678
1793 arch_prepare_kretprobe(ri, regs); 1679 arch_prepare_kretprobe(ri, regs);
1794 1680
@@ -1799,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1799 kretprobe_table_unlock(hash, &flags); 1685 kretprobe_table_unlock(hash, &flags);
1800 } else { 1686 } else {
1801 rp->nmissed++; 1687 rp->nmissed++;
1802 raw_spin_unlock_irqrestore(&rp->lock, flags); 1688 spin_unlock_irqrestore(&rp->lock, flags);
1803 } 1689 }
1804 return 0; 1690 return 0;
1805} 1691}
@@ -1835,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1835 rp->maxactive = num_possible_cpus(); 1721 rp->maxactive = num_possible_cpus();
1836#endif 1722#endif
1837 } 1723 }
1838 raw_spin_lock_init(&rp->lock); 1724 spin_lock_init(&rp->lock);
1839 INIT_HLIST_HEAD(&rp->free_instances); 1725 INIT_HLIST_HEAD(&rp->free_instances);
1840 for (i = 0; i < rp->maxactive; i++) { 1726 for (i = 0; i < rp->maxactive; i++) {
1841 inst = kmalloc(sizeof(struct kretprobe_instance) + 1727 inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -2073,7 +1959,7 @@ static int __init init_kprobes(void)
2073 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2074 INIT_HLIST_HEAD(&kprobe_table[i]); 1960 INIT_HLIST_HEAD(&kprobe_table[i]);
2075 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
2076 raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); 1962 spin_lock_init(&(kretprobe_table_locks[i].lock));
2077 } 1963 }
2078 1964
2079 /* 1965 /*
@@ -2157,11 +2043,10 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
2157 2043
2158 if (!pp) 2044 if (!pp)
2159 pp = p; 2045 pp = p;
2160 seq_printf(pi, "%s%s%s%s\n", 2046 seq_printf(pi, "%s%s%s\n",
2161 (kprobe_gone(p) ? "[GONE]" : ""), 2047 (kprobe_gone(p) ? "[GONE]" : ""),
2162 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), 2048 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
2163 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""), 2049 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
2164 (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
2165} 2050}
2166 2051
2167static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 2052static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -2240,12 +2125,14 @@ static void __kprobes arm_all_kprobes(void)
2240 goto already_enabled; 2125 goto already_enabled;
2241 2126
2242 /* Arming kprobes doesn't optimize kprobe itself */ 2127 /* Arming kprobes doesn't optimize kprobe itself */
2128 mutex_lock(&text_mutex);
2243 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2129 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2244 head = &kprobe_table[i]; 2130 head = &kprobe_table[i];
2245 hlist_for_each_entry_rcu(p, node, head, hlist) 2131 hlist_for_each_entry_rcu(p, node, head, hlist)
2246 if (!kprobe_disabled(p)) 2132 if (!kprobe_disabled(p))
2247 arm_kprobe(p); 2133 __arm_kprobe(p);
2248 } 2134 }
2135 mutex_unlock(&text_mutex);
2249 2136
2250 kprobes_all_disarmed = false; 2137 kprobes_all_disarmed = false;
2251 printk(KERN_INFO "Kprobes globally enabled\n"); 2138 printk(KERN_INFO "Kprobes globally enabled\n");
@@ -2273,13 +2160,15 @@ static void __kprobes disarm_all_kprobes(void)
2273 kprobes_all_disarmed = true; 2160 kprobes_all_disarmed = true;
2274 printk(KERN_INFO "Kprobes globally disabled\n"); 2161 printk(KERN_INFO "Kprobes globally disabled\n");
2275 2162
2163 mutex_lock(&text_mutex);
2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2164 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2277 head = &kprobe_table[i]; 2165 head = &kprobe_table[i];
2278 hlist_for_each_entry_rcu(p, node, head, hlist) { 2166 hlist_for_each_entry_rcu(p, node, head, hlist) {
2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2167 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
2280 disarm_kprobe(p, false); 2168 __disarm_kprobe(p, false);
2281 } 2169 }
2282 } 2170 }
2171 mutex_unlock(&text_mutex);
2283 mutex_unlock(&kprobe_mutex); 2172 mutex_unlock(&kprobe_mutex);
2284 2173
2285 /* Wait for disarming all kprobes by optimizer */ 2174 /* Wait for disarming all kprobes by optimizer */
@@ -2309,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2309 const char __user *user_buf, size_t count, loff_t *ppos) 2198 const char __user *user_buf, size_t count, loff_t *ppos)
2310{ 2199{
2311 char buf[32]; 2200 char buf[32];
2312 size_t buf_size; 2201 int buf_size;
2313 2202
2314 buf_size = min(count, (sizeof(buf)-1)); 2203 buf_size = min(count, (sizeof(buf)-1));
2315 if (copy_from_user(buf, user_buf, buf_size)) 2204 if (copy_from_user(buf, user_buf, buf_size))
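The kprobes.c hunks above fold the address checks and the text_mutex / CPU-hotplug locking back into register_kprobe() itself (dropping the separate check_kprobe_address_safe() helper and the ftrace-based attach path) and return the kretprobe bookkeeping to plain spinlock_t. For orientation only, here is a minimal, hypothetical module that exercises the register_kprobe() path modified in this hunk; the probed symbol name and the handler are illustrative assumptions, not part of the patch.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* Illustrative only: probing do_fork is an assumption for the example. */
static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("kprobe: hit %s at %p\n", p->symbol_name, p->addr);
        return 0;       /* let the probed instruction run normally */
}

static struct kprobe example_kp = {
        .symbol_name    = "do_fork",
        .pre_handler    = example_pre,
};

static int __init example_init(void)
{
        /* Goes through the register_kprobe() path shown above. */
        return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
        unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

As the hunk's comment notes, the only flag a caller may pass in is KPROBE_FLAG_DISABLED, so a probe could also be registered disarmed and enabled later.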
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6ada93c23a9..3b053c04dd8 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -11,11 +11,10 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/sysfs.h> 13#include <linux/sysfs.h>
14#include <linux/export.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/stat.h>
19#include <linux/sched.h> 18#include <linux/sched.h>
20#include <linux/capability.h> 19#include <linux/capability.h>
21 20
@@ -26,6 +25,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
26static struct kobj_attribute _name##_attr = \ 25static struct kobj_attribute _name##_attr = \
27 __ATTR(_name, 0644, _name##_show, _name##_store) 26 __ATTR(_name, 0644, _name##_show, _name##_store)
28 27
28#if defined(CONFIG_HOTPLUG)
29/* current uevent sequence number */ 29/* current uevent sequence number */
30static ssize_t uevent_seqnum_show(struct kobject *kobj, 30static ssize_t uevent_seqnum_show(struct kobject *kobj,
31 struct kobj_attribute *attr, char *buf) 31 struct kobj_attribute *attr, char *buf)
@@ -53,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
53 return count; 53 return count;
54} 54}
55KERNEL_ATTR_RW(uevent_helper); 55KERNEL_ATTR_RW(uevent_helper);
56 56#endif
57 57
58#ifdef CONFIG_PROFILING 58#ifdef CONFIG_PROFILING
59static ssize_t profiling_show(struct kobject *kobj, 59static ssize_t profiling_show(struct kobject *kobj,
@@ -140,23 +140,6 @@ static ssize_t fscaps_show(struct kobject *kobj,
140} 140}
141KERNEL_ATTR_RO(fscaps); 141KERNEL_ATTR_RO(fscaps);
142 142
143int rcu_expedited;
144static ssize_t rcu_expedited_show(struct kobject *kobj,
145 struct kobj_attribute *attr, char *buf)
146{
147 return sprintf(buf, "%d\n", rcu_expedited);
148}
149static ssize_t rcu_expedited_store(struct kobject *kobj,
150 struct kobj_attribute *attr,
151 const char *buf, size_t count)
152{
153 if (kstrtoint(buf, 0, &rcu_expedited))
154 return -EINVAL;
155
156 return count;
157}
158KERNEL_ATTR_RW(rcu_expedited);
159
160/* 143/*
161 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 144 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
162 */ 145 */
@@ -185,8 +168,10 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
185 168
186static struct attribute * kernel_attrs[] = { 169static struct attribute * kernel_attrs[] = {
187 &fscaps_attr.attr, 170 &fscaps_attr.attr,
171#if defined(CONFIG_HOTPLUG)
188 &uevent_seqnum_attr.attr, 172 &uevent_seqnum_attr.attr,
189 &uevent_helper_attr.attr, 173 &uevent_helper_attr.attr,
174#endif
190#ifdef CONFIG_PROFILING 175#ifdef CONFIG_PROFILING
191 &profiling_attr.attr, 176 &profiling_attr.attr,
192#endif 177#endif
@@ -196,7 +181,6 @@ static struct attribute * kernel_attrs[] = {
196 &kexec_crash_size_attr.attr, 181 &kexec_crash_size_attr.attr,
197 &vmcoreinfo_attr.attr, 182 &vmcoreinfo_attr.attr,
198#endif 183#endif
199 &rcu_expedited_attr.attr,
200 NULL 184 NULL
201}; 185};
202 186
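The ksysfs.c hunks restore the #if defined(CONFIG_HOTPLUG) guard around the uevent attributes and drop the rcu_expedited attribute. For context, a minimal sketch of the show-callback pattern these attributes follow; the attribute name and backing value are hypothetical, and __ATTR_RO() is the generic sysfs helper that the file's local KERNEL_ATTR_RO() macro (seen in the hunk header above) wraps.

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

/* Hypothetical read-only attribute, same shape as uevent_seqnum_show() above. */
static unsigned long example_value;

static ssize_t example_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        return sprintf(buf, "%lu\n", example_value);
}
static struct kobj_attribute example_attr = __ATTR_RO(example);

/* example_attr.attr would then be listed in kernel_attrs[] so the file
 * appears under /sys/kernel/ alongside the attributes shown in the hunk. */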
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 691dc2ef9ba..4ba7cccb499 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,11 +12,10 @@
12#include <linux/cpuset.h> 12#include <linux/cpuset.h>
13#include <linux/unistd.h> 13#include <linux/unistd.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/export.h> 15#include <linux/module.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
19#include <linux/ptrace.h>
20#include <trace/events/sched.h> 19#include <trace/events/sched.h>
21 20
22static DEFINE_SPINLOCK(kthread_create_lock); 21static DEFINE_SPINLOCK(kthread_create_lock);
@@ -38,20 +37,11 @@ struct kthread_create_info
38}; 37};
39 38
40struct kthread { 39struct kthread {
41 unsigned long flags; 40 int should_stop;
42 unsigned int cpu;
43 void *data; 41 void *data;
44 struct completion parked;
45 struct completion exited; 42 struct completion exited;
46}; 43};
47 44
48enum KTHREAD_BITS {
49 KTHREAD_IS_PER_CPU = 0,
50 KTHREAD_SHOULD_STOP,
51 KTHREAD_SHOULD_PARK,
52 KTHREAD_IS_PARKED,
53};
54
55#define to_kthread(tsk) \ 45#define to_kthread(tsk) \
56 container_of((tsk)->vfork_done, struct kthread, exited) 46 container_of((tsk)->vfork_done, struct kthread, exited)
57 47
@@ -62,54 +52,13 @@ enum KTHREAD_BITS {
62 * and this will return true. You should then return, and your return 52 * and this will return true. You should then return, and your return
63 * value will be passed through to kthread_stop(). 53 * value will be passed through to kthread_stop().
64 */ 54 */
65bool kthread_should_stop(void) 55int kthread_should_stop(void)
66{ 56{
67 return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); 57 return to_kthread(current)->should_stop;
68} 58}
69EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
70 60
71/** 61/**
72 * kthread_should_park - should this kthread park now?
73 *
74 * When someone calls kthread_park() on your kthread, it will be woken
75 * and this will return true. You should then do the necessary
76 * cleanup and call kthread_parkme()
77 *
78 * Similar to kthread_should_stop(), but this keeps the thread alive
79 * and in a park position. kthread_unpark() "restarts" the thread and
80 * calls the thread function again.
81 */
82bool kthread_should_park(void)
83{
84 return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
85}
86
87/**
88 * kthread_freezable_should_stop - should this freezable kthread return now?
89 * @was_frozen: optional out parameter, indicates whether %current was frozen
90 *
91 * kthread_should_stop() for freezable kthreads, which will enter
92 * refrigerator if necessary. This function is safe from kthread_stop() /
93 * freezer deadlock and freezable kthreads should use this function instead
94 * of calling try_to_freeze() directly.
95 */
96bool kthread_freezable_should_stop(bool *was_frozen)
97{
98 bool frozen = false;
99
100 might_sleep();
101
102 if (unlikely(freezing(current)))
103 frozen = __refrigerator(true);
104
105 if (was_frozen)
106 *was_frozen = frozen;
107
108 return kthread_should_stop();
109}
110EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
111
112/**
113 * kthread_data - return data value specified on kthread creation 62 * kthread_data - return data value specified on kthread creation
114 * @task: kthread task in question 63 * @task: kthread task in question
115 * 64 *
@@ -122,24 +71,6 @@ void *kthread_data(struct task_struct *task)
122 return to_kthread(task)->data; 71 return to_kthread(task)->data;
123} 72}
124 73
125static void __kthread_parkme(struct kthread *self)
126{
127 __set_current_state(TASK_INTERRUPTIBLE);
128 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
129 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
130 complete(&self->parked);
131 schedule();
132 __set_current_state(TASK_INTERRUPTIBLE);
133 }
134 clear_bit(KTHREAD_IS_PARKED, &self->flags);
135 __set_current_state(TASK_RUNNING);
136}
137
138void kthread_parkme(void)
139{
140 __kthread_parkme(to_kthread(current));
141}
142
143static int kthread(void *_create) 74static int kthread(void *_create)
144{ 75{
145 /* Copy data: it's on kthread's stack */ 76 /* Copy data: it's on kthread's stack */
@@ -149,10 +80,9 @@ static int kthread(void *_create)
149 struct kthread self; 80 struct kthread self;
150 int ret; 81 int ret;
151 82
152 self.flags = 0; 83 self.should_stop = 0;
153 self.data = data; 84 self.data = data;
154 init_completion(&self.exited); 85 init_completion(&self.exited);
155 init_completion(&self.parked);
156 current->vfork_done = &self.exited; 86 current->vfork_done = &self.exited;
157 87
158 /* OK, tell user we're spawned, wait for stop or wakeup */ 88 /* OK, tell user we're spawned, wait for stop or wakeup */
@@ -162,11 +92,9 @@ static int kthread(void *_create)
162 schedule(); 92 schedule();
163 93
164 ret = -EINTR; 94 ret = -EINTR;
165 95 if (!self.should_stop)
166 if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
167 __kthread_parkme(&self);
168 ret = threadfn(data); 96 ret = threadfn(data);
169 } 97
170 /* we can't just return, we must preserve "self" on stack */ 98 /* we can't just return, we must preserve "self" on stack */
171 do_exit(ret); 99 do_exit(ret);
172} 100}
@@ -219,7 +147,8 @@ static void create_kthread(struct kthread_create_info *create)
219 * Returns a task_struct or ERR_PTR(-ENOMEM). 147 * Returns a task_struct or ERR_PTR(-ENOMEM).
220 */ 148 */
221struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), 149struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
222 void *data, int node, 150 void *data,
151 int node,
223 const char namefmt[], 152 const char namefmt[],
224 ...) 153 ...)
225{ 154{
@@ -256,13 +185,6 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
256} 185}
257EXPORT_SYMBOL(kthread_create_on_node); 186EXPORT_SYMBOL(kthread_create_on_node);
258 187
259static void __kthread_bind(struct task_struct *p, unsigned int cpu)
260{
261 /* It's safe because the task is inactive. */
262 do_set_cpus_allowed(p, cpumask_of(cpu));
263 p->flags |= PF_THREAD_BOUND;
264}
265
266/** 188/**
267 * kthread_bind - bind a just-created kthread to a cpu. 189 * kthread_bind - bind a just-created kthread to a cpu.
268 * @p: thread created by kthread_create(). 190 * @p: thread created by kthread_create().
@@ -279,110 +201,12 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
279 WARN_ON(1); 201 WARN_ON(1);
280 return; 202 return;
281 } 203 }
282 __kthread_bind(p, cpu);
283}
284EXPORT_SYMBOL(kthread_bind);
285
286/**
287 * kthread_create_on_cpu - Create a cpu bound kthread
288 * @threadfn: the function to run until signal_pending(current).
289 * @data: data ptr for @threadfn.
290 * @cpu: The cpu on which the thread should be bound,
291 * @namefmt: printf-style name for the thread. Format is restricted
292 * to "name.*%u". Code fills in cpu number.
293 *
294 * Description: This helper function creates and names a kernel thread
295 * The thread will be woken and put into park mode.
296 */
297struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
298 void *data, unsigned int cpu,
299 const char *namefmt)
300{
301 struct task_struct *p;
302
303 p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
304 cpu);
305 if (IS_ERR(p))
306 return p;
307 set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
308 to_kthread(p)->cpu = cpu;
309 /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
310 kthread_park(p);
311 return p;
312}
313 204
314static struct kthread *task_get_live_kthread(struct task_struct *k) 205 /* It's safe because the task is inactive. */
315{ 206 do_set_cpus_allowed(p, cpumask_of(cpu));
316 struct kthread *kthread; 207 p->flags |= PF_THREAD_BOUND;
317
318 get_task_struct(k);
319 kthread = to_kthread(k);
320 /* It might have exited */
321 barrier();
322 if (k->vfork_done != NULL)
323 return kthread;
324 return NULL;
325}
326
327/**
328 * kthread_unpark - unpark a thread created by kthread_create().
329 * @k: thread created by kthread_create().
330 *
331 * Sets kthread_should_park() for @k to return false, wakes it, and
332 * waits for it to return. If the thread is marked percpu then its
333 * bound to the cpu again.
334 */
335void kthread_unpark(struct task_struct *k)
336{
337 struct kthread *kthread = task_get_live_kthread(k);
338
339 if (kthread) {
340 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
341 /*
342 * We clear the IS_PARKED bit here as we don't wait
343 * until the task has left the park code. So if we'd
344 * park before that happens we'd see the IS_PARKED bit
345 * which might be about to be cleared.
346 */
347 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
348 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
349 __kthread_bind(k, kthread->cpu);
350 wake_up_process(k);
351 }
352 }
353 put_task_struct(k);
354}
355
356/**
357 * kthread_park - park a thread created by kthread_create().
358 * @k: thread created by kthread_create().
359 *
360 * Sets kthread_should_park() for @k to return true, wakes it, and
361 * waits for it to return. This can also be called after kthread_create()
362 * instead of calling wake_up_process(): the thread will park without
363 * calling threadfn().
364 *
365 * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
366 * If called by the kthread itself just the park bit is set.
367 */
368int kthread_park(struct task_struct *k)
369{
370 struct kthread *kthread = task_get_live_kthread(k);
371 int ret = -ENOSYS;
372
373 if (kthread) {
374 if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
375 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
376 if (k != current) {
377 wake_up_process(k);
378 wait_for_completion(&kthread->parked);
379 }
380 }
381 ret = 0;
382 }
383 put_task_struct(k);
384 return ret;
385} 208}
209EXPORT_SYMBOL(kthread_bind);
386 210
387/** 211/**
388 * kthread_stop - stop a thread created by kthread_create(). 212 * kthread_stop - stop a thread created by kthread_create().
@@ -401,13 +225,16 @@ int kthread_park(struct task_struct *k)
401 */ 225 */
402int kthread_stop(struct task_struct *k) 226int kthread_stop(struct task_struct *k)
403{ 227{
404 struct kthread *kthread = task_get_live_kthread(k); 228 struct kthread *kthread;
405 int ret; 229 int ret;
406 230
407 trace_sched_kthread_stop(k); 231 trace_sched_kthread_stop(k);
408 if (kthread) { 232 get_task_struct(k);
409 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); 233
410 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 234 kthread = to_kthread(k);
235 barrier(); /* it might have exited */
236 if (k->vfork_done != NULL) {
237 kthread->should_stop = 1;
411 wake_up_process(k); 238 wake_up_process(k);
412 wait_for_completion(&kthread->exited); 239 wait_for_completion(&kthread->exited);
413 } 240 }
@@ -428,9 +255,9 @@ int kthreadd(void *unused)
428 set_task_comm(tsk, "kthreadd"); 255 set_task_comm(tsk, "kthreadd");
429 ignore_signals(tsk); 256 ignore_signals(tsk);
430 set_cpus_allowed_ptr(tsk, cpu_all_mask); 257 set_cpus_allowed_ptr(tsk, cpu_all_mask);
431 set_mems_allowed(node_states[N_MEMORY]); 258 set_mems_allowed(node_states[N_HIGH_MEMORY]);
432 259
433 current->flags |= PF_NOFREEZE; 260 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
434 261
435 for (;;) { 262 for (;;) {
436 set_current_state(TASK_INTERRUPTIBLE); 263 set_current_state(TASK_INTERRUPTIBLE);
@@ -508,12 +335,16 @@ repeat:
508 struct kthread_work, node); 335 struct kthread_work, node);
509 list_del_init(&work->node); 336 list_del_init(&work->node);
510 } 337 }
511 worker->current_work = work;
512 spin_unlock_irq(&worker->lock); 338 spin_unlock_irq(&worker->lock);
513 339
514 if (work) { 340 if (work) {
515 __set_current_state(TASK_RUNNING); 341 __set_current_state(TASK_RUNNING);
516 work->func(work); 342 work->func(work);
343 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
344 work->done_seq = work->queue_seq;
345 smp_mb(); /* mb worker-b1 paired with flush-b0 */
346 if (atomic_read(&work->flushing))
347 wake_up_all(&work->done);
517 } else if (!freezing(current)) 348 } else if (!freezing(current))
518 schedule(); 349 schedule();
519 350
@@ -522,19 +353,6 @@ repeat:
522} 353}
523EXPORT_SYMBOL_GPL(kthread_worker_fn); 354EXPORT_SYMBOL_GPL(kthread_worker_fn);
524 355
525/* insert @work before @pos in @worker */
526static void insert_kthread_work(struct kthread_worker *worker,
527 struct kthread_work *work,
528 struct list_head *pos)
529{
530 lockdep_assert_held(&worker->lock);
531
532 list_add_tail(&work->node, pos);
533 work->worker = worker;
534 if (likely(worker->task))
535 wake_up_process(worker->task);
536}
537
538/** 356/**
539 * queue_kthread_work - queue a kthread_work 357 * queue_kthread_work - queue a kthread_work
540 * @worker: target kthread_worker 358 * @worker: target kthread_worker
@@ -552,7 +370,10 @@ bool queue_kthread_work(struct kthread_worker *worker,
552 370
553 spin_lock_irqsave(&worker->lock, flags); 371 spin_lock_irqsave(&worker->lock, flags);
554 if (list_empty(&work->node)) { 372 if (list_empty(&work->node)) {
555 insert_kthread_work(worker, work, &worker->work_list); 373 list_add_tail(&work->node, &worker->work_list);
374 work->queue_seq++;
375 if (likely(worker->task))
376 wake_up_process(worker->task);
556 ret = true; 377 ret = true;
557 } 378 }
558 spin_unlock_irqrestore(&worker->lock, flags); 379 spin_unlock_irqrestore(&worker->lock, flags);
@@ -560,18 +381,6 @@ bool queue_kthread_work(struct kthread_worker *worker,
560} 381}
561EXPORT_SYMBOL_GPL(queue_kthread_work); 382EXPORT_SYMBOL_GPL(queue_kthread_work);
562 383
563struct kthread_flush_work {
564 struct kthread_work work;
565 struct completion done;
566};
567
568static void kthread_flush_work_fn(struct kthread_work *work)
569{
570 struct kthread_flush_work *fwork =
571 container_of(work, struct kthread_flush_work, work);
572 complete(&fwork->done);
573}
574
575/** 384/**
576 * flush_kthread_work - flush a kthread_work 385 * flush_kthread_work - flush a kthread_work
577 * @work: work to flush 386 * @work: work to flush
@@ -580,38 +389,40 @@ static void kthread_flush_work_fn(struct kthread_work *work)
580 */ 389 */
581void flush_kthread_work(struct kthread_work *work) 390void flush_kthread_work(struct kthread_work *work)
582{ 391{
583 struct kthread_flush_work fwork = { 392 int seq = work->queue_seq;
584 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
585 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
586 };
587 struct kthread_worker *worker;
588 bool noop = false;
589 393
590retry: 394 atomic_inc(&work->flushing);
591 worker = work->worker;
592 if (!worker)
593 return;
594
595 spin_lock_irq(&worker->lock);
596 if (work->worker != worker) {
597 spin_unlock_irq(&worker->lock);
598 goto retry;
599 }
600 395
601 if (!list_empty(&work->node)) 396 /*
602 insert_kthread_work(worker, &fwork.work, work->node.next); 397 * mb flush-b0 paired with worker-b1, to make sure either
603 else if (worker->current_work == work) 398 * worker sees the above increment or we see done_seq update.
604 insert_kthread_work(worker, &fwork.work, worker->work_list.next); 399 */
605 else 400 smp_mb__after_atomic_inc();
606 noop = true;
607 401
608 spin_unlock_irq(&worker->lock); 402 /* A - B <= 0 tests whether B is in front of A regardless of overflow */
403 wait_event(work->done, seq - work->done_seq <= 0);
404 atomic_dec(&work->flushing);
609 405
610 if (!noop) 406 /*
611 wait_for_completion(&fwork.done); 407 * rmb flush-b1 paired with worker-b0, to make sure our caller
408 * sees every change made by work->func().
409 */
410 smp_mb__after_atomic_dec();
612} 411}
613EXPORT_SYMBOL_GPL(flush_kthread_work); 412EXPORT_SYMBOL_GPL(flush_kthread_work);
614 413
414struct kthread_flush_work {
415 struct kthread_work work;
416 struct completion done;
417};
418
419static void kthread_flush_work_fn(struct kthread_work *work)
420{
421 struct kthread_flush_work *fwork =
422 container_of(work, struct kthread_flush_work, work);
423 complete(&fwork->done);
424}
425
615/** 426/**
616 * flush_kthread_worker - flush all current works on a kthread_worker 427 * flush_kthread_worker - flush all current works on a kthread_worker
617 * @worker: worker to flush 428 * @worker: worker to flush
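The kthread.c hunks revert the parking infrastructure (kthread_should_park(), kthread_park()/kthread_unpark(), kthread_create_on_cpu()) and return kthread_should_stop() to a plain should_stop integer. The caller-visible stop protocol is unchanged either way; a minimal sketch with hypothetical names follows.

#include <linux/kthread.h>
#include <linux/delay.h>

/* Hypothetical worker loop: kthread_stop() sets the stop condition and wakes
 * the task; the loop observes kthread_should_stop() and returns, and that
 * return value is handed back to kthread_stop(). */
static int example_thread(void *data)
{
        while (!kthread_should_stop()) {
                /* do one unit of work on "data" here */
                msleep_interruptible(100);
        }
        return 0;
}

/* Caller side (error handling elided):
 *
 *      struct task_struct *task;
 *
 *      task = kthread_run(example_thread, NULL, "example/%d", 0);
 *      ...
 *      kthread_stop(task);
 */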
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a462b317f9a..376066e1041 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -53,12 +53,12 @@
53#include <linux/notifier.h> 53#include <linux/notifier.h>
54#include <linux/spinlock.h> 54#include <linux/spinlock.h>
55#include <linux/proc_fs.h> 55#include <linux/proc_fs.h>
56#include <linux/export.h> 56#include <linux/module.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
60 60
61static DEFINE_RAW_SPINLOCK(latency_lock); 61static DEFINE_SPINLOCK(latency_lock);
62 62
63#define MAXLR 128 63#define MAXLR 128
64static struct latency_record latency_record[MAXLR]; 64static struct latency_record latency_record[MAXLR];
@@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p)
72 if (!latencytop_enabled) 72 if (!latencytop_enabled)
73 return; 73 return;
74 74
75 raw_spin_lock_irqsave(&latency_lock, flags); 75 spin_lock_irqsave(&latency_lock, flags);
76 memset(&p->latency_record, 0, sizeof(p->latency_record)); 76 memset(&p->latency_record, 0, sizeof(p->latency_record));
77 p->latency_record_count = 0; 77 p->latency_record_count = 0;
78 raw_spin_unlock_irqrestore(&latency_lock, flags); 78 spin_unlock_irqrestore(&latency_lock, flags);
79} 79}
80 80
81static void clear_global_latency_tracing(void) 81static void clear_global_latency_tracing(void)
82{ 82{
83 unsigned long flags; 83 unsigned long flags;
84 84
85 raw_spin_lock_irqsave(&latency_lock, flags); 85 spin_lock_irqsave(&latency_lock, flags);
86 memset(&latency_record, 0, sizeof(latency_record)); 86 memset(&latency_record, 0, sizeof(latency_record));
87 raw_spin_unlock_irqrestore(&latency_lock, flags); 87 spin_unlock_irqrestore(&latency_lock, flags);
88} 88}
89 89
90static void __sched 90static void __sched
@@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
190 lat.max = usecs; 190 lat.max = usecs;
191 store_stacktrace(tsk, &lat); 191 store_stacktrace(tsk, &lat);
192 192
193 raw_spin_lock_irqsave(&latency_lock, flags); 193 spin_lock_irqsave(&latency_lock, flags);
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
@@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
232 232
233out_unlock: 233out_unlock:
234 raw_spin_unlock_irqrestore(&latency_lock, flags); 234 spin_unlock_irqrestore(&latency_lock, flags);
235} 235}
236 236
237static int lstats_show(struct seq_file *m, void *v) 237static int lstats_show(struct seq_file *m, void *v)
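The latencytop.c hunks convert latency_lock from a raw spinlock back to an ordinary spinlock_t. The distinction only matters on PREEMPT_RT-style kernels, where spinlock_t can become a sleeping lock while raw_spinlock_t always spins; the irqsave locking pattern itself is identical. A minimal sketch with a hypothetical lock and counter:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);   /* hypothetical, mirrors latency_lock */
static unsigned long example_counter;

static void example_update(void)
{
        unsigned long flags;

        /* Same pattern as clear_all_latency_tracing() above: disable local
         * interrupts and take the lock, so the critical section is safe
         * against both other CPUs and local interrupt handlers. */
        spin_lock_irqsave(&example_lock, flags);
        example_counter++;
        spin_unlock_irqrestore(&example_lock, flags);
}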
diff --git a/kernel/lglock.c b/kernel/lglock.c
deleted file mode 100644
index 6535a667a5a..00000000000
--- a/kernel/lglock.c
+++ /dev/null
@@ -1,89 +0,0 @@
1/* See include/linux/lglock.h for description */
2#include <linux/module.h>
3#include <linux/lglock.h>
4#include <linux/cpu.h>
5#include <linux/string.h>
6
7/*
8 * Note there is no uninit, so lglocks cannot be defined in
9 * modules (but it's fine to use them from there)
10 * Could be added though, just undo lg_lock_init
11 */
12
13void lg_lock_init(struct lglock *lg, char *name)
14{
15 LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
16}
17EXPORT_SYMBOL(lg_lock_init);
18
19void lg_local_lock(struct lglock *lg)
20{
21 arch_spinlock_t *lock;
22
23 preempt_disable();
24 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
25 lock = this_cpu_ptr(lg->lock);
26 arch_spin_lock(lock);
27}
28EXPORT_SYMBOL(lg_local_lock);
29
30void lg_local_unlock(struct lglock *lg)
31{
32 arch_spinlock_t *lock;
33
34 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
35 lock = this_cpu_ptr(lg->lock);
36 arch_spin_unlock(lock);
37 preempt_enable();
38}
39EXPORT_SYMBOL(lg_local_unlock);
40
41void lg_local_lock_cpu(struct lglock *lg, int cpu)
42{
43 arch_spinlock_t *lock;
44
45 preempt_disable();
46 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
47 lock = per_cpu_ptr(lg->lock, cpu);
48 arch_spin_lock(lock);
49}
50EXPORT_SYMBOL(lg_local_lock_cpu);
51
52void lg_local_unlock_cpu(struct lglock *lg, int cpu)
53{
54 arch_spinlock_t *lock;
55
56 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
57 lock = per_cpu_ptr(lg->lock, cpu);
58 arch_spin_unlock(lock);
59 preempt_enable();
60}
61EXPORT_SYMBOL(lg_local_unlock_cpu);
62
63void lg_global_lock(struct lglock *lg)
64{
65 int i;
66
67 preempt_disable();
68 rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
69 for_each_possible_cpu(i) {
70 arch_spinlock_t *lock;
71 lock = per_cpu_ptr(lg->lock, i);
72 arch_spin_lock(lock);
73 }
74}
75EXPORT_SYMBOL(lg_global_lock);
76
77void lg_global_unlock(struct lglock *lg)
78{
79 int i;
80
81 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
82 for_each_possible_cpu(i) {
83 arch_spinlock_t *lock;
84 lock = per_cpu_ptr(lg->lock, i);
85 arch_spin_unlock(lock);
86 }
87 preempt_enable();
88}
89EXPORT_SYMBOL(lg_global_unlock);
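The deleted kernel/lglock.c implemented local/global locks: lg_local_lock() takes only the current CPU's per-CPU lock, while lg_global_lock() takes every CPU's lock, so per-CPU writers stay cheap and a global pass pays the full cost. A hedged sketch of how a caller would use the removed helpers follows; the lock and list names are hypothetical, and DEFINE_LGLOCK() is assumed from include/linux/lglock.h, which is not part of this diff.

#include <linux/lglock.h>
#include <linux/percpu.h>
#include <linux/list.h>
#include <linux/cpumask.h>

/* Hypothetical per-CPU lists protected by one lglock; the list heads are
 * assumed to be initialized elsewhere, and lg_lock_init(&example_lglock,
 * "example_lglock") would be called once at init for lockdep. */
static DEFINE_PER_CPU(struct list_head, example_list);
DEFINE_LGLOCK(example_lglock);

static void example_add(struct list_head *entry)
{
        /* Fast path: only this CPU's underlying arch spinlock is taken, and
         * lg_local_lock() has already disabled preemption (see above). */
        lg_local_lock(&example_lglock);
        list_add(entry, this_cpu_ptr(&example_list));
        lg_local_unlock(&example_lglock);
}

static void example_walk_all(void (*fn)(struct list_head *))
{
        int cpu;
        struct list_head *pos;

        /* Slow path: every CPU's lock is taken, excluding all local adders. */
        lg_global_lock(&example_lglock);
        for_each_possible_cpu(cpu)
                list_for_each(pos, per_cpu_ptr(&example_list, cpu))
                        fn(pos);
        lg_global_unlock(&example_lglock);
}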
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7981e5b2350..447960603fb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -97,13 +97,8 @@ static int graph_lock(void)
97 97
98static inline int graph_unlock(void) 98static inline int graph_unlock(void)
99{ 99{
100 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { 100 if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
101 /*
102 * The lockdep graph lock isn't locked while we expect it to
103 * be, we're confused now, bye!
104 */
105 return DEBUG_LOCKS_WARN_ON(1); 101 return DEBUG_LOCKS_WARN_ON(1);
106 }
107 102
108 current->lockdep_recursion--; 103 current->lockdep_recursion--;
109 arch_spin_unlock(&lockdep_lock); 104 arch_spin_unlock(&lockdep_lock);
@@ -140,9 +135,6 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
140static inline struct lock_class *hlock_class(struct held_lock *hlock) 135static inline struct lock_class *hlock_class(struct held_lock *hlock)
141{ 136{
142 if (!hlock->class_idx) { 137 if (!hlock->class_idx) {
143 /*
144 * Someone passed in garbage, we give up.
145 */
146 DEBUG_LOCKS_WARN_ON(1); 138 DEBUG_LOCKS_WARN_ON(1);
147 return NULL; 139 return NULL;
148 } 140 }
@@ -431,7 +423,6 @@ unsigned int max_lockdep_depth;
431 * about it later on, in lockdep_info(). 423 * about it later on, in lockdep_info().
432 */ 424 */
433static int lockdep_init_error; 425static int lockdep_init_error;
434static const char *lock_init_error;
435static unsigned long lockdep_init_trace_data[20]; 426static unsigned long lockdep_init_trace_data[20];
436static struct stack_trace lockdep_init_trace = { 427static struct stack_trace lockdep_init_trace = {
437 .max_entries = ARRAY_SIZE(lockdep_init_trace_data), 428 .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
@@ -500,32 +491,36 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
500 usage[i] = '\0'; 491 usage[i] = '\0';
501} 492}
502 493
503static void __print_lock_name(struct lock_class *class) 494static int __print_lock_name(struct lock_class *class)
504{ 495{
505 char str[KSYM_NAME_LEN]; 496 char str[KSYM_NAME_LEN];
506 const char *name; 497 const char *name;
507 498
508 name = class->name; 499 name = class->name;
509 if (!name) { 500 if (!name)
510 name = __get_key_name(class->key, str); 501 name = __get_key_name(class->key, str);
511 printk("%s", name); 502
512 } else { 503 return printk("%s", name);
513 printk("%s", name);
514 if (class->name_version > 1)
515 printk("#%d", class->name_version);
516 if (class->subclass)
517 printk("/%d", class->subclass);
518 }
519} 504}
520 505
521static void print_lock_name(struct lock_class *class) 506static void print_lock_name(struct lock_class *class)
522{ 507{
523 char usage[LOCK_USAGE_CHARS]; 508 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
509 const char *name;
524 510
525 get_usage_chars(class, usage); 511 get_usage_chars(class, usage);
526 512
527 printk(" ("); 513 name = class->name;
528 __print_lock_name(class); 514 if (!name) {
515 name = __get_key_name(class->key, str);
516 printk(" (%s", name);
517 } else {
518 printk(" (%s", name);
519 if (class->name_version > 1)
520 printk("#%d", class->name_version);
521 if (class->subclass)
522 printk("/%d", class->subclass);
523 }
529 printk("){%s}", usage); 524 printk("){%s}", usage);
530} 525}
531 526
@@ -565,12 +560,11 @@ static void lockdep_print_held_locks(struct task_struct *curr)
565 } 560 }
566} 561}
567 562
568static void print_kernel_ident(void) 563static void print_kernel_version(void)
569{ 564{
570 printk("%s %.*s %s\n", init_utsname()->release, 565 printk("%s %.*s\n", init_utsname()->release,
571 (int)strcspn(init_utsname()->version, " "), 566 (int)strcspn(init_utsname()->version, " "),
572 init_utsname()->version, 567 init_utsname()->version);
573 print_tainted());
574} 568}
575 569
576static int very_verbose(struct lock_class *class) 570static int very_verbose(struct lock_class *class)
@@ -654,7 +648,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
654 if (unlikely(!lockdep_initialized)) { 648 if (unlikely(!lockdep_initialized)) {
655 lockdep_init(); 649 lockdep_init();
656 lockdep_init_error = 1; 650 lockdep_init_error = 1;
657 lock_init_error = lock->name;
658 save_stack_trace(&lockdep_init_trace); 651 save_stack_trace(&lockdep_init_trace);
659 } 652 }
660#endif 653#endif
@@ -695,10 +688,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
695 */ 688 */
696 list_for_each_entry(class, hash_head, hash_entry) { 689 list_for_each_entry(class, hash_head, hash_entry) {
697 if (class->key == key) { 690 if (class->key == key) {
698 /*
699 * Huh! same key, different name? Did someone trample
700 * on some memory? We're most confused.
701 */
702 WARN_ON_ONCE(class->name != lock->name); 691 WARN_ON_ONCE(class->name != lock->name);
703 return class; 692 return class;
704 } 693 }
@@ -722,7 +711,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
722 711
723 class = look_up_lock_class(lock, subclass); 712 class = look_up_lock_class(lock, subclass);
724 if (likely(class)) 713 if (likely(class))
725 goto out_set_class_cache; 714 return class;
726 715
727 /* 716 /*
728 * Debug-check: all keys must be persistent! 717 * Debug-check: all keys must be persistent!
@@ -807,16 +796,11 @@ out_unlock_set:
807 graph_unlock(); 796 graph_unlock();
808 raw_local_irq_restore(flags); 797 raw_local_irq_restore(flags);
809 798
810out_set_class_cache:
811 if (!subclass || force) 799 if (!subclass || force)
812 lock->class_cache[0] = class; 800 lock->class_cache[0] = class;
813 else if (subclass < NR_LOCKDEP_CACHING_CLASSES) 801 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
814 lock->class_cache[subclass] = class; 802 lock->class_cache[subclass] = class;
815 803
816 /*
817 * Hash collision, did we smoke some? We found a class with a matching
818 * hash but the subclass -- which is hashed in -- didn't match.
819 */
820 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 804 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
821 return NULL; 805 return NULL;
822 806
@@ -943,7 +927,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
943 unsigned long nr; 927 unsigned long nr;
944 928
945 nr = lock - list_entries; 929 nr = lock - list_entries;
946 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ 930 WARN_ON(nr >= nr_list_entries);
947 lock->parent = parent; 931 lock->parent = parent;
948 lock->class->dep_gen_id = lockdep_dependency_gen_id; 932 lock->class->dep_gen_id = lockdep_dependency_gen_id;
949} 933}
@@ -953,7 +937,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
953 unsigned long nr; 937 unsigned long nr;
954 938
955 nr = lock - list_entries; 939 nr = lock - list_entries;
956 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ 940 WARN_ON(nr >= nr_list_entries);
957 return lock->class->dep_gen_id == lockdep_dependency_gen_id; 941 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
958} 942}
959 943
@@ -1146,11 +1130,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1146 if (debug_locks_silent) 1130 if (debug_locks_silent)
1147 return 0; 1131 return 0;
1148 1132
1149 printk("\n"); 1133 printk("\n=======================================================\n");
1150 printk("======================================================\n"); 1134 printk( "[ INFO: possible circular locking dependency detected ]\n");
1151 printk("[ INFO: possible circular locking dependency detected ]\n"); 1135 print_kernel_version();
1152 print_kernel_ident(); 1136 printk( "-------------------------------------------------------\n");
1153 printk("-------------------------------------------------------\n");
1154 printk("%s/%d is trying to acquire lock:\n", 1137 printk("%s/%d is trying to acquire lock:\n",
1155 curr->comm, task_pid_nr(curr)); 1138 curr->comm, task_pid_nr(curr));
1156 print_lock(check_src); 1139 print_lock(check_src);
@@ -1214,9 +1197,6 @@ static noinline int print_bfs_bug(int ret)
1214 if (!debug_locks_off_graph_unlock()) 1197 if (!debug_locks_off_graph_unlock())
1215 return 0; 1198 return 0;
1216 1199
1217 /*
1218 * Breadth-first-search failed, graph got corrupted?
1219 */
1220 WARN(1, "lockdep bfs error:%d\n", ret); 1200 WARN(1, "lockdep bfs error:%d\n", ret);
1221 1201
1222 return 0; 1202 return 0;
@@ -1484,12 +1464,11 @@ print_bad_irq_dependency(struct task_struct *curr,
1484 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1464 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1485 return 0; 1465 return 0;
1486 1466
1487 printk("\n"); 1467 printk("\n======================================================\n");
1488 printk("======================================================\n"); 1468 printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1489 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1490 irqclass, irqclass); 1469 irqclass, irqclass);
1491 print_kernel_ident(); 1470 print_kernel_version();
1492 printk("------------------------------------------------------\n"); 1471 printk( "------------------------------------------------------\n");
1493 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1472 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1494 curr->comm, task_pid_nr(curr), 1473 curr->comm, task_pid_nr(curr),
1495 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1474 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1714,11 +1693,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1714 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1693 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1715 return 0; 1694 return 0;
1716 1695
1717 printk("\n"); 1696 printk("\n=============================================\n");
1718 printk("=============================================\n"); 1697 printk( "[ INFO: possible recursive locking detected ]\n");
1719 printk("[ INFO: possible recursive locking detected ]\n"); 1698 print_kernel_version();
1720 print_kernel_ident(); 1699 printk( "---------------------------------------------\n");
1721 printk("---------------------------------------------\n");
1722 printk("%s/%d is trying to acquire lock:\n", 1700 printk("%s/%d is trying to acquire lock:\n",
1723 curr->comm, task_pid_nr(curr)); 1701 curr->comm, task_pid_nr(curr));
1724 print_lock(next); 1702 print_lock(next);
@@ -1967,11 +1945,6 @@ out_bug:
1967 if (!debug_locks_off_graph_unlock()) 1945 if (!debug_locks_off_graph_unlock())
1968 return 0; 1946 return 0;
1969 1947
1970 /*
1971 * Clearly we all shouldn't be here, but since we made it we
1972 * can reliable say we messed up our state. See the above two
1973 * gotos for reasons why we could possibly end up here.
1974 */
1975 WARN_ON(1); 1948 WARN_ON(1);
1976 1949
1977 return 0; 1950 return 0;
@@ -2003,11 +1976,6 @@ static inline int lookup_chain_cache(struct task_struct *curr,
2003 struct held_lock *hlock_curr, *hlock_next; 1976 struct held_lock *hlock_curr, *hlock_next;
2004 int i, j; 1977 int i, j;
2005 1978
2006 /*
2007 * We might need to take the graph lock, ensure we've got IRQs
2008 * disabled to make this an IRQ-safe lock.. for recursion reasons
2009 * lockdep won't complain about its own locking errors.
2010 */
2011 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1979 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2012 return 0; 1980 return 0;
2013 /* 1981 /*
@@ -2159,10 +2127,6 @@ static void check_chain_key(struct task_struct *curr)
2159 hlock = curr->held_locks + i; 2127 hlock = curr->held_locks + i;
2160 if (chain_key != hlock->prev_chain_key) { 2128 if (chain_key != hlock->prev_chain_key) {
2161 debug_locks_off(); 2129 debug_locks_off();
2162 /*
2163 * We got mighty confused, our chain keys don't match
2164 * with what we expect, someone trample on our task state?
2165 */
2166 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", 2130 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
2167 curr->lockdep_depth, i, 2131 curr->lockdep_depth, i,
2168 (unsigned long long)chain_key, 2132 (unsigned long long)chain_key,
@@ -2170,9 +2134,6 @@ static void check_chain_key(struct task_struct *curr)
2170 return; 2134 return;
2171 } 2135 }
2172 id = hlock->class_idx - 1; 2136 id = hlock->class_idx - 1;
2173 /*
2174 * Whoops ran out of static storage again?
2175 */
2176 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 2137 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
2177 return; 2138 return;
2178 2139
@@ -2184,10 +2145,6 @@ static void check_chain_key(struct task_struct *curr)
2184 } 2145 }
2185 if (chain_key != curr->curr_chain_key) { 2146 if (chain_key != curr->curr_chain_key) {
2186 debug_locks_off(); 2147 debug_locks_off();
2187 /*
2188 * More smoking hash instead of calculating it, damn see these
2189 * numbers float.. I bet that a pink elephant stepped on my memory.
2190 */
2191 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", 2148 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
2192 curr->lockdep_depth, i, 2149 curr->lockdep_depth, i,
2193 (unsigned long long)chain_key, 2150 (unsigned long long)chain_key,
@@ -2221,11 +2178,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2221 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2178 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2222 return 0; 2179 return 0;
2223 2180
2224 printk("\n"); 2181 printk("\n=================================\n");
2225 printk("=================================\n"); 2182 printk( "[ INFO: inconsistent lock state ]\n");
2226 printk("[ INFO: inconsistent lock state ]\n"); 2183 print_kernel_version();
2227 print_kernel_ident(); 2184 printk( "---------------------------------\n");
2228 printk("---------------------------------\n");
2229 2185
2230 printk("inconsistent {%s} -> {%s} usage.\n", 2186 printk("inconsistent {%s} -> {%s} usage.\n",
2231 usage_str[prev_bit], usage_str[new_bit]); 2187 usage_str[prev_bit], usage_str[new_bit]);
@@ -2286,11 +2242,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2286 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2242 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2287 return 0; 2243 return 0;
2288 2244
2289 printk("\n"); 2245 printk("\n=========================================================\n");
2290 printk("=========================================================\n"); 2246 printk( "[ INFO: possible irq lock inversion dependency detected ]\n");
2291 printk("[ INFO: possible irq lock inversion dependency detected ]\n"); 2247 print_kernel_version();
2292 print_kernel_ident(); 2248 printk( "---------------------------------------------------------\n");
2293 printk("---------------------------------------------------------\n");
2294 printk("%s/%d just changed the state of lock:\n", 2249 printk("%s/%d just changed the state of lock:\n",
2295 curr->comm, task_pid_nr(curr)); 2250 curr->comm, task_pid_nr(curr));
2296 print_lock(this); 2251 print_lock(this);
@@ -2571,24 +2526,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
2571 return; 2526 return;
2572 } 2527 }
2573 2528
2574 /*
2575 * We're enabling irqs and according to our state above irqs weren't
2576 * already enabled, yet we find the hardware thinks they are in fact
2577 * enabled.. someone messed up their IRQ state tracing.
2578 */
2579 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2529 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2580 return; 2530 return;
2581 2531
2582 /*
2583 * See the fine text that goes along with this variable definition.
2584 */
2585 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 2532 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2586 return; 2533 return;
2587 2534
2588 /*
2589 * Can't allow enabling interrupts while in an interrupt handler,
2590 * that's general bad form and such. Recursion, limited stack etc..
2591 */
2592 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) 2535 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2593 return; 2536 return;
2594 2537
@@ -2616,10 +2559,6 @@ void trace_hardirqs_off_caller(unsigned long ip)
2616 if (unlikely(!debug_locks || current->lockdep_recursion)) 2559 if (unlikely(!debug_locks || current->lockdep_recursion))
2617 return; 2560 return;
2618 2561
2619 /*
2620 * So we're supposed to get called after you mask local IRQs, but for
2621 * some reason the hardware doesn't quite think you did a proper job.
2622 */
2623 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2562 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2624 return; 2563 return;
2625 2564
@@ -2652,10 +2591,6 @@ void trace_softirqs_on(unsigned long ip)
2652 if (unlikely(!debug_locks || current->lockdep_recursion)) 2591 if (unlikely(!debug_locks || current->lockdep_recursion))
2653 return; 2592 return;
2654 2593
2655 /*
2656 * We fancy IRQs being disabled here, see softirq.c, avoids
2657 * funny state and nesting things.
2658 */
2659 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2594 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2660 return; 2595 return;
2661 2596
@@ -2692,9 +2627,6 @@ void trace_softirqs_off(unsigned long ip)
2692 if (unlikely(!debug_locks || current->lockdep_recursion)) 2627 if (unlikely(!debug_locks || current->lockdep_recursion))
2693 return; 2628 return;
2694 2629
2695 /*
2696 * We fancy IRQs being disabled here, see softirq.c
2697 */
2698 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2630 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2699 return; 2631 return;
2700 2632
@@ -2706,9 +2638,6 @@ void trace_softirqs_off(unsigned long ip)
2706 curr->softirq_disable_ip = ip; 2638 curr->softirq_disable_ip = ip;
2707 curr->softirq_disable_event = ++curr->irq_events; 2639 curr->softirq_disable_event = ++curr->irq_events;
2708 debug_atomic_inc(softirqs_off_events); 2640 debug_atomic_inc(softirqs_off_events);
2709 /*
2710 * Whoops, we wanted softirqs off, so why aren't they?
2711 */
2712 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2641 DEBUG_LOCKS_WARN_ON(!softirq_count());
2713 } else 2642 } else
2714 debug_atomic_inc(redundant_softirqs_off); 2643 debug_atomic_inc(redundant_softirqs_off);
@@ -2733,9 +2662,6 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2733 if (!(gfp_mask & __GFP_FS)) 2662 if (!(gfp_mask & __GFP_FS))
2734 return; 2663 return;
2735 2664
2736 /*
2737 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
2738 */
2739 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) 2665 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2740 return; 2666 return;
2741 2667
@@ -2848,13 +2774,13 @@ static int separate_irq_context(struct task_struct *curr,
2848 return 0; 2774 return 0;
2849} 2775}
2850 2776
2851#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ 2777#else
2852 2778
2853static inline 2779static inline
2854int mark_lock_irq(struct task_struct *curr, struct held_lock *this, 2780int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
2855 enum lock_usage_bit new_bit) 2781 enum lock_usage_bit new_bit)
2856{ 2782{
2857 WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */ 2783 WARN_ON(1);
2858 return 1; 2784 return 1;
2859} 2785}
2860 2786
@@ -2874,7 +2800,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
2874{ 2800{
2875} 2801}
2876 2802
2877#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ 2803#endif
2878 2804
2879/* 2805/*
2880 * Mark a lock with a usage bit, and validate the state transition: 2806 * Mark a lock with a usage bit, and validate the state transition:
@@ -2960,9 +2886,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2960 lock->cpu = raw_smp_processor_id(); 2886 lock->cpu = raw_smp_processor_id();
2961#endif 2887#endif
2962 2888
2963 /*
2964 * Can't be having no nameless bastards around this place!
2965 */
2966 if (DEBUG_LOCKS_WARN_ON(!name)) { 2889 if (DEBUG_LOCKS_WARN_ON(!name)) {
2967 lock->name = "NULL"; 2890 lock->name = "NULL";
2968 return; 2891 return;
@@ -2970,9 +2893,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2970 2893
2971 lock->name = name; 2894 lock->name = name;
2972 2895
2973 /*
2974 * No key, no joy, we need to hash something.
2975 */
2976 if (DEBUG_LOCKS_WARN_ON(!key)) 2896 if (DEBUG_LOCKS_WARN_ON(!key))
2977 return; 2897 return;
2978 /* 2898 /*
@@ -2980,9 +2900,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2980 */ 2900 */
2981 if (!static_obj(key)) { 2901 if (!static_obj(key)) {
2982 printk("BUG: key %p not in .data!\n", key); 2902 printk("BUG: key %p not in .data!\n", key);
2983 /*
2984 * What it says above ^^^^^, I suggest you read it.
2985 */
2986 DEBUG_LOCKS_WARN_ON(1); 2903 DEBUG_LOCKS_WARN_ON(1);
2987 return; 2904 return;
2988 } 2905 }
@@ -2998,42 +2915,6 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2998 2915
2999struct lock_class_key __lockdep_no_validate__; 2916struct lock_class_key __lockdep_no_validate__;
3000 2917
3001static int
3002print_lock_nested_lock_not_held(struct task_struct *curr,
3003 struct held_lock *hlock,
3004 unsigned long ip)
3005{
3006 if (!debug_locks_off())
3007 return 0;
3008 if (debug_locks_silent)
3009 return 0;
3010
3011 printk("\n");
3012 printk("==================================\n");
3013 printk("[ BUG: Nested lock was not taken ]\n");
3014 print_kernel_ident();
3015 printk("----------------------------------\n");
3016
3017 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
3018 print_lock(hlock);
3019
3020 printk("\nbut this task is not holding:\n");
3021 printk("%s\n", hlock->nest_lock->name);
3022
3023 printk("\nstack backtrace:\n");
3024 dump_stack();
3025
3026 printk("\nother info that might help us debug this:\n");
3027 lockdep_print_held_locks(curr);
3028
3029 printk("\nstack backtrace:\n");
3030 dump_stack();
3031
3032 return 0;
3033}
3034
3035static int __lock_is_held(struct lockdep_map *lock);
3036
3037/* 2918/*
3038 * This gets called for every mutex_lock*()/spin_lock*() operation. 2919 * This gets called for every mutex_lock*()/spin_lock*() operation.
3039 * We maintain the dependency maps and validate the locking attempt: 2920 * We maintain the dependency maps and validate the locking attempt:
@@ -3057,11 +2938,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3057 if (unlikely(!debug_locks)) 2938 if (unlikely(!debug_locks))
3058 return 0; 2939 return 0;
3059 2940
3060 /*
3061 * Lockdep should run with IRQs disabled, otherwise we could
3062 * get an interrupt which would want to take locks, which would
3063 * end up in lockdep and have you got a head-ache already?
3064 */
3065 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2941 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3066 return 0; 2942 return 0;
3067 2943
@@ -3093,9 +2969,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3093 * dependency checks are done) 2969 * dependency checks are done)
3094 */ 2970 */
3095 depth = curr->lockdep_depth; 2971 depth = curr->lockdep_depth;
3096 /*
3097 * Ran out of static storage for our per-task lock stack again have we?
3098 */
3099 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 2972 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
3100 return 0; 2973 return 0;
3101 2974
@@ -3114,10 +2987,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3114 } 2987 }
3115 2988
3116 hlock = curr->held_locks + depth; 2989 hlock = curr->held_locks + depth;
3117 /*
3118 * Plain impossible, we just registered it and checked it weren't no
3119 * NULL like.. I bet this mushroom I ate was good!
3120 */
3121 if (DEBUG_LOCKS_WARN_ON(!class)) 2990 if (DEBUG_LOCKS_WARN_ON(!class))
3122 return 0; 2991 return 0;
3123 hlock->class_idx = class_idx; 2992 hlock->class_idx = class_idx;
@@ -3152,17 +3021,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3152 * the hash, not class->key. 3021 * the hash, not class->key.
3153 */ 3022 */
3154 id = class - lock_classes; 3023 id = class - lock_classes;
3155 /*
3156 * Whoops, we did it again.. ran straight out of our static allocation.
3157 */
3158 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 3024 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
3159 return 0; 3025 return 0;
3160 3026
3161 chain_key = curr->curr_chain_key; 3027 chain_key = curr->curr_chain_key;
3162 if (!depth) { 3028 if (!depth) {
3163 /*
3164 * How can we have a chain hash when we ain't got no keys?!
3165 */
3166 if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) 3029 if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
3167 return 0; 3030 return 0;
3168 chain_head = 1; 3031 chain_head = 1;
@@ -3175,9 +3038,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3175 } 3038 }
3176 chain_key = iterate_chain_key(chain_key, id); 3039 chain_key = iterate_chain_key(chain_key, id);
3177 3040
3178 if (nest_lock && !__lock_is_held(nest_lock))
3179 return print_lock_nested_lock_not_held(curr, hlock, ip);
3180
3181 if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) 3041 if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
3182 return 0; 3042 return 0;
3183 3043
@@ -3211,11 +3071,9 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3211 if (debug_locks_silent) 3071 if (debug_locks_silent)
3212 return 0; 3072 return 0;
3213 3073
3214 printk("\n"); 3074 printk("\n=====================================\n");
3215 printk("=====================================\n"); 3075 printk( "[ BUG: bad unlock balance detected! ]\n");
3216 printk("[ BUG: bad unlock balance detected! ]\n"); 3076 printk( "-------------------------------------\n");
3217 print_kernel_ident();
3218 printk("-------------------------------------\n");
3219 printk("%s/%d is trying to release lock (", 3077 printk("%s/%d is trying to release lock (",
3220 curr->comm, task_pid_nr(curr)); 3078 curr->comm, task_pid_nr(curr));
3221 print_lockdep_cache(lock); 3079 print_lockdep_cache(lock);
@@ -3239,9 +3097,6 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3239{ 3097{
3240 if (unlikely(!debug_locks)) 3098 if (unlikely(!debug_locks))
3241 return 0; 3099 return 0;
3242 /*
3243 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
3244 */
3245 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3100 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3246 return 0; 3101 return 0;
3247 3102
@@ -3271,11 +3126,6 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3271 if (!class) 3126 if (!class)
3272 return 0; 3127 return 0;
3273 3128
3274 /*
3275 * References, but not a lock we're actually ref-counting?
3276 * State got messed up, follow the sites that change ->references
3277 * and try to make sense of it.
3278 */
3279 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) 3129 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
3280 return 0; 3130 return 0;
3281 3131
@@ -3298,10 +3148,6 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3298 int i; 3148 int i;
3299 3149
3300 depth = curr->lockdep_depth; 3150 depth = curr->lockdep_depth;
3301 /*
3302 * This function is about (re)setting the class of a held lock,
3303 * yet we're not actually holding any locks. Naughty user!
3304 */
3305 if (DEBUG_LOCKS_WARN_ON(!depth)) 3151 if (DEBUG_LOCKS_WARN_ON(!depth))
3306 return 0; 3152 return 0;
3307 3153
@@ -3337,10 +3183,6 @@ found_it:
3337 return 0; 3183 return 0;
3338 } 3184 }
3339 3185
3340 /*
3341 * I took it apart and put it back together again, except now I have
3342 * these 'spare' parts.. where shall I put them.
3343 */
3344 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) 3186 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
3345 return 0; 3187 return 0;
3346 return 1; 3188 return 1;
@@ -3365,10 +3207,6 @@ lock_release_non_nested(struct task_struct *curr,
3365 * of held locks: 3207 * of held locks:
3366 */ 3208 */
3367 depth = curr->lockdep_depth; 3209 depth = curr->lockdep_depth;
3368 /*
3369 * So we're all set to release this lock.. wait what lock? We don't
3370 * own any locks, you've been drinking again?
3371 */
3372 if (DEBUG_LOCKS_WARN_ON(!depth)) 3210 if (DEBUG_LOCKS_WARN_ON(!depth))
3373 return 0; 3211 return 0;
3374 3212
@@ -3421,10 +3259,6 @@ found_it:
3421 return 0; 3259 return 0;
3422 } 3260 }
3423 3261
3424 /*
3425 * We had N bottles of beer on the wall, we drank one, but now
3426 * there's not N-1 bottles of beer left on the wall...
3427 */
3428 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) 3262 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
3429 return 0; 3263 return 0;
3430 return 1; 3264 return 1;
@@ -3455,9 +3289,6 @@ static int lock_release_nested(struct task_struct *curr,
3455 return lock_release_non_nested(curr, lock, ip); 3289 return lock_release_non_nested(curr, lock, ip);
3456 curr->lockdep_depth--; 3290 curr->lockdep_depth--;
3457 3291
3458 /*
3459 * No more locks, but somehow we've got hash left over, who left it?
3460 */
3461 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) 3292 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
3462 return 0; 3293 return 0;
3463 3294
@@ -3540,13 +3371,10 @@ static void check_flags(unsigned long flags)
3540 * check if not in hardirq contexts: 3371 * check if not in hardirq contexts:
3541 */ 3372 */
3542 if (!hardirq_count()) { 3373 if (!hardirq_count()) {
3543 if (softirq_count()) { 3374 if (softirq_count())
3544 /* like the above, but with softirqs */
3545 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); 3375 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
3546 } else { 3376 else
3547 /* lick the above, does it taste good? */
3548 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); 3377 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
3549 }
3550 } 3378 }
3551 3379
3552 if (!debug_locks) 3380 if (!debug_locks)
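
Context for the check_flags() hunk above: hardirq_count() and softirq_count() are not separate counters but bit-fields of preempt_count(). A rough sketch of the relevant definitions, with mask names taken from include/linux/hardirq.h of this era and treated as assumptions rather than a quote:

#include <linux/hardirq.h>

#define sketch_hardirq_count()	(preempt_count() & HARDIRQ_MASK)
#define sketch_softirq_count()	(preempt_count() & SOFTIRQ_MASK)

/*
 * So "!hardirq_count() && softirq_count()" means we are running in softirq
 * context, where lockdep expects current->softirqs_enabled to be false;
 * the remaining branch covers plain process context with softirqs enabled.
 */
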
@@ -3656,11 +3484,9 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3656 if (debug_locks_silent) 3484 if (debug_locks_silent)
3657 return 0; 3485 return 0;
3658 3486
3659 printk("\n"); 3487 printk("\n=================================\n");
3660 printk("=================================\n"); 3488 printk( "[ BUG: bad contention detected! ]\n");
3661 printk("[ BUG: bad contention detected! ]\n"); 3489 printk( "---------------------------------\n");
3662 print_kernel_ident();
3663 printk("---------------------------------\n");
3664 printk("%s/%d is trying to contend lock (", 3490 printk("%s/%d is trying to contend lock (",
3665 curr->comm, task_pid_nr(curr)); 3491 curr->comm, task_pid_nr(curr));
3666 print_lockdep_cache(lock); 3492 print_lockdep_cache(lock);
@@ -3686,10 +3512,6 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3686 int i, contention_point, contending_point; 3512 int i, contention_point, contending_point;
3687 3513
3688 depth = curr->lockdep_depth; 3514 depth = curr->lockdep_depth;
3689 /*
3690 * Whee, we contended on this lock, except it seems we're not
3691 * actually trying to acquire anything much at all..
3692 */
3693 if (DEBUG_LOCKS_WARN_ON(!depth)) 3515 if (DEBUG_LOCKS_WARN_ON(!depth))
3694 return; 3516 return;
3695 3517
@@ -3739,10 +3561,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3739 int i, cpu; 3561 int i, cpu;
3740 3562
3741 depth = curr->lockdep_depth; 3563 depth = curr->lockdep_depth;
3742 /*
3743 * Yay, we acquired ownership of this lock we didn't try to
3744 * acquire, how the heck did that happen?
3745 */
3746 if (DEBUG_LOCKS_WARN_ON(!depth)) 3564 if (DEBUG_LOCKS_WARN_ON(!depth))
3747 return; 3565 return;
3748 3566
@@ -3947,12 +3765,8 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3947 match |= class == lock->class_cache[j]; 3765 match |= class == lock->class_cache[j];
3948 3766
3949 if (unlikely(match)) { 3767 if (unlikely(match)) {
3950 if (debug_locks_off_graph_unlock()) { 3768 if (debug_locks_off_graph_unlock())
3951 /*
3952 * We all just reset everything, how did it match?
3953 */
3954 WARN_ON(1); 3769 WARN_ON(1);
3955 }
3956 goto out_restore; 3770 goto out_restore;
3957 } 3771 }
3958 } 3772 }
@@ -4015,8 +3829,7 @@ void __init lockdep_info(void)
4015 3829
4016#ifdef CONFIG_DEBUG_LOCKDEP 3830#ifdef CONFIG_DEBUG_LOCKDEP
4017 if (lockdep_init_error) { 3831 if (lockdep_init_error) {
4018 printk("WARNING: lockdep init error! lock-%s was acquired" 3832 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n");
4019 "before lockdep_init\n", lock_init_error);
4020 printk("Call stack leading to lockdep invocation was:\n"); 3833 printk("Call stack leading to lockdep invocation was:\n");
4021 print_stack_trace(&lockdep_init_trace, 0); 3834 print_stack_trace(&lockdep_init_trace, 0);
4022 } 3835 }
@@ -4032,11 +3845,9 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
4032 if (debug_locks_silent) 3845 if (debug_locks_silent)
4033 return; 3846 return;
4034 3847
4035 printk("\n"); 3848 printk("\n=========================\n");
4036 printk("=========================\n"); 3849 printk( "[ BUG: held lock freed! ]\n");
4037 printk("[ BUG: held lock freed! ]\n"); 3850 printk( "-------------------------\n");
4038 print_kernel_ident();
4039 printk("-------------------------\n");
4040 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 3851 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
4041 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 3852 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
4042 print_lock(hlock); 3853 print_lock(hlock);
@@ -4090,11 +3901,9 @@ static void print_held_locks_bug(struct task_struct *curr)
4090 if (debug_locks_silent) 3901 if (debug_locks_silent)
4091 return; 3902 return;
4092 3903
4093 printk("\n"); 3904 printk("\n=====================================\n");
4094 printk("=====================================\n"); 3905 printk( "[ BUG: lock held at task exit time! ]\n");
4095 printk("[ BUG: lock held at task exit time! ]\n"); 3906 printk( "-------------------------------------\n");
4096 print_kernel_ident();
4097 printk("-------------------------------------\n");
4098 printk("%s/%d is exiting with locks still held!\n", 3907 printk("%s/%d is exiting with locks still held!\n",
4099 curr->comm, task_pid_nr(curr)); 3908 curr->comm, task_pid_nr(curr));
4100 lockdep_print_held_locks(curr); 3909 lockdep_print_held_locks(curr);
@@ -4188,18 +3997,16 @@ void lockdep_sys_exit(void)
4188 if (unlikely(curr->lockdep_depth)) { 3997 if (unlikely(curr->lockdep_depth)) {
4189 if (!debug_locks_off()) 3998 if (!debug_locks_off())
4190 return; 3999 return;
4191 printk("\n"); 4000 printk("\n================================================\n");
4192 printk("================================================\n"); 4001 printk( "[ BUG: lock held when returning to user space! ]\n");
4193 printk("[ BUG: lock held when returning to user space! ]\n"); 4002 printk( "------------------------------------------------\n");
4194 print_kernel_ident();
4195 printk("------------------------------------------------\n");
4196 printk("%s/%d is leaving the kernel with locks still held!\n", 4003 printk("%s/%d is leaving the kernel with locks still held!\n",
4197 curr->comm, curr->pid); 4004 curr->comm, curr->pid);
4198 lockdep_print_held_locks(curr); 4005 lockdep_print_held_locks(curr);
4199 } 4006 }
4200} 4007}
4201 4008
4202void lockdep_rcu_suspicious(const char *file, const int line, const char *s) 4009void lockdep_rcu_dereference(const char *file, const int line)
4203{ 4010{
4204 struct task_struct *curr = current; 4011 struct task_struct *curr = current;
4205 4012
@@ -4208,44 +4015,15 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4208 return; 4015 return;
4209#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ 4016#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4210 /* Note: the following can be executed concurrently, so be careful. */ 4017 /* Note: the following can be executed concurrently, so be careful. */
4211 printk("\n"); 4018 printk("\n===================================================\n");
4212 printk("===============================\n"); 4019 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
4213 printk("[ INFO: suspicious RCU usage. ]\n"); 4020 printk( "---------------------------------------------------\n");
4214 print_kernel_ident(); 4021 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
4215 printk("-------------------------------\n"); 4022 file, line);
4216 printk("%s:%d %s!\n", file, line, s);
4217 printk("\nother info that might help us debug this:\n\n"); 4023 printk("\nother info that might help us debug this:\n\n");
4218 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4024 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4219 !rcu_lockdep_current_cpu_online()
4220 ? "RCU used illegally from offline CPU!\n"
4221 : rcu_is_cpu_idle()
4222 ? "RCU used illegally from idle CPU!\n"
4223 : "",
4224 rcu_scheduler_active, debug_locks);
4225
4226 /*
4227 * If a CPU is in the RCU-free window in idle (ie: in the section
4228 * between rcu_idle_enter() and rcu_idle_exit(), then RCU
4229 * considers that CPU to be in an "extended quiescent state",
4230 * which means that RCU will be completely ignoring that CPU.
4231 * Therefore, rcu_read_lock() and friends have absolutely no
4232 * effect on a CPU running in that state. In other words, even if
4233 * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
4234 * delete data structures out from under it. RCU really has no
4235 * choice here: we need to keep an RCU-free window in idle where
4236 * the CPU may possibly enter into low power mode. This way we can
4237 * notice an extended quiescent state to other CPUs that started a grace
4238 * period. Otherwise we would delay any grace period as long as we run
4239 * in the idle task.
4240 *
4241 * So complain bitterly if someone does call rcu_read_lock(),
4242 * rcu_read_lock_bh() and so on from extended quiescent states.
4243 */
4244 if (rcu_is_cpu_idle())
4245 printk("RCU used illegally from extended quiescent state!\n");
4246
4247 lockdep_print_held_locks(curr); 4025 lockdep_print_held_locks(curr);
4248 printk("\nstack backtrace:\n"); 4026 printk("\nstack backtrace:\n");
4249 dump_stack(); 4027 dump_stack();
4250} 4028}
4251EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); 4029EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
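
The last hunk above renames lockdep_rcu_suspicious(file, line, s) back to lockdep_rcu_dereference(file, line), dropping the caller-supplied message and the idle/offline-CPU diagnostics. A hedged sketch of how the two entry points are reached from include/linux/rcupdate.h; the macro bodies are reconstructed from memory, so treat the details as assumptions:

/* Newer form: the condition and a human-readable hint travel together. */
#define rcu_lockdep_assert(c, s)					\
	do {								\
		static bool __warned;					\
		if (debug_lockdep_rcu_enabled() && !__warned && !(c)) {	\
			__warned = true;				\
			lockdep_rcu_suspicious(__FILE__, __LINE__, s);	\
		}							\
	} while (0)

/* Older form restored by this patch: only file and line reach the report. */
#define rcu_lockdep_assert_old(c)					\
	do {								\
		static bool __warned;					\
		if (debug_lockdep_rcu_enabled() && !__warned && !(c)) {	\
			__warned = true;				\
			lockdep_rcu_dereference(__FILE__, __LINE__);	\
		}							\
	} while (0)
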
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index b2c71c5873e..71edd2f60c0 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -11,7 +11,7 @@
11 * Code for /proc/lockdep and /proc/lockdep_stats: 11 * Code for /proc/lockdep and /proc/lockdep_stats:
12 * 12 *
13 */ 13 */
14#include <linux/export.h> 14#include <linux/module.h>
15#include <linux/proc_fs.h> 15#include <linux/proc_fs.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/kallsyms.h> 17#include <linux/kallsyms.h>
@@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v)
39 39
40static void print_name(struct seq_file *m, struct lock_class *class) 40static void print_name(struct seq_file *m, struct lock_class *class)
41{ 41{
42 char str[KSYM_NAME_LEN]; 42 char str[128];
43 const char *name = class->name; 43 const char *name = class->name;
44 44
45 if (!name) { 45 if (!name) {
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
deleted file mode 100644
index 246b4c6e613..00000000000
--- a/kernel/modsign_certificate.S
+++ /dev/null
@@ -1,19 +0,0 @@
1/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
2#ifndef SYMBOL_PREFIX
3#define ASM_SYMBOL(sym) sym
4#else
5#define PASTE2(x,y) x##y
6#define PASTE(x,y) PASTE2(x,y)
7#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
8#endif
9
10#define GLOBAL(name) \
11 .globl ASM_SYMBOL(name); \
12 ASM_SYMBOL(name):
13
14 .section ".init.data","aw"
15
16GLOBAL(modsign_certificate_list)
17 .incbin "signing_key.x509"
18 .incbin "extra_certificates"
19GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
deleted file mode 100644
index 2b6e69909c3..00000000000
--- a/kernel/modsign_pubkey.c
+++ /dev/null
@@ -1,104 +0,0 @@
1/* Public keys for module signature verification
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/cred.h>
15#include <linux/err.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19struct key *modsign_keyring;
20
21extern __initdata const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[];
23
24/*
25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
26 * if modsign.pub changes.
27 */
28static __initdata const char annoy_ccache[] = __TIME__ "foo";
29
30/*
31 * Load the compiled-in keys
32 */
33static __init int module_verify_init(void)
34{
35 pr_notice("Initialise module verification\n");
36
37 modsign_keyring = keyring_alloc(".module_sign",
38 KUIDT_INIT(0), KGIDT_INIT(0),
39 current_cred(),
40 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
41 KEY_USR_VIEW | KEY_USR_READ),
42 KEY_ALLOC_NOT_IN_QUOTA, NULL);
43 if (IS_ERR(modsign_keyring))
44 panic("Can't allocate module signing keyring\n");
45
46 return 0;
47}
48
49/*
50 * Must be initialised before we try and load the keys into the keyring.
51 */
52device_initcall(module_verify_init);
53
54/*
55 * Load the compiled-in keys
56 */
57static __init int load_module_signing_keys(void)
58{
59 key_ref_t key;
60 const u8 *p, *end;
61 size_t plen;
62
63 pr_notice("Loading module verification certificates\n");
64
65 end = modsign_certificate_list_end;
66 p = modsign_certificate_list;
67 while (p < end) {
68 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
69 * than 256 bytes in size.
70 */
71 if (end - p < 4)
72 goto dodgy_cert;
73 if (p[0] != 0x30 &&
74 p[1] != 0x82)
75 goto dodgy_cert;
76 plen = (p[2] << 8) | p[3];
77 plen += 4;
78 if (plen > end - p)
79 goto dodgy_cert;
80
81 key = key_create_or_update(make_key_ref(modsign_keyring, 1),
82 "asymmetric",
83 NULL,
84 p,
85 plen,
86 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
87 KEY_USR_VIEW,
88 KEY_ALLOC_NOT_IN_QUOTA);
89 if (IS_ERR(key))
90 pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
91 PTR_ERR(key));
92 else
93 pr_notice("MODSIGN: Loaded cert '%s'\n",
94 key_ref_to_ptr(key)->description);
95 p += plen;
96 }
97
98 return 0;
99
100dodgy_cert:
101 pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
102 return 0;
103}
104late_initcall(load_module_signing_keys);
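
The deleted loader above walks a blob of concatenated DER certificates by reading each ASN.1 SEQUENCE header: tag 0x30, long-form length marker 0x82, then a two-byte big-endian length, which is why every certificate must be at least 256 bytes. A standalone sketch of that length calculation, written with || so that both header bytes are actually required (the stricter reading of the check in the deleted code):

#include <stddef.h>
#include <stdint.h>

/*
 * Return the total on-disk size of the DER certificate starting at p,
 * or -1 if the header does not look like "SEQUENCE, 2-byte length" or
 * the certificate would run past the end of the blob.
 */
static ptrdiff_t der_cert_len(const uint8_t *p, const uint8_t *end)
{
	size_t plen;

	if (end - p < 4)
		return -1;
	if (p[0] != 0x30 || p[1] != 0x82)
		return -1;
	plen = (((size_t)p[2] << 8) | p[3]) + 4;	/* payload + 4 header bytes */
	if ((ptrdiff_t)plen > end - p)
		return -1;
	return (ptrdiff_t)plen;
}
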
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
deleted file mode 100644
index 24f9247b7d0..00000000000
--- a/kernel/module-internal.h
+++ /dev/null
@@ -1,14 +0,0 @@
1/* Module internals
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12extern struct key *modsign_keyring;
13
14extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index 250092c1d57..e0ddcece2be 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -16,12 +16,11 @@
16 along with this program; if not, write to the Free Software 16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/ 18*/
19#include <linux/export.h> 19#include <linux/module.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/ftrace_event.h> 21#include <linux/ftrace_event.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/file.h>
25#include <linux/fs.h> 24#include <linux/fs.h>
26#include <linux/sysfs.h> 25#include <linux/sysfs.h>
27#include <linux/kernel.h> 26#include <linux/kernel.h>
@@ -29,7 +28,6 @@
29#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
30#include <linux/elf.h> 29#include <linux/elf.h>
31#include <linux/proc_fs.h> 30#include <linux/proc_fs.h>
32#include <linux/security.h>
33#include <linux/seq_file.h> 31#include <linux/seq_file.h>
34#include <linux/syscalls.h> 32#include <linux/syscalls.h>
35#include <linux/fcntl.h> 33#include <linux/fcntl.h>
@@ -60,13 +58,16 @@
60#include <linux/jump_label.h> 58#include <linux/jump_label.h>
61#include <linux/pfn.h> 59#include <linux/pfn.h>
62#include <linux/bsearch.h> 60#include <linux/bsearch.h>
63#include <linux/fips.h>
64#include <uapi/linux/module.h>
65#include "module-internal.h"
66 61
67#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
68#include <trace/events/module.h> 63#include <trace/events/module.h>
69 64
65#if 0
66#define DEBUGP printk
67#else
68#define DEBUGP(fmt , a...)
69#endif
70
70#ifndef ARCH_SHF_SMALL 71#ifndef ARCH_SHF_SMALL
71#define ARCH_SHF_SMALL 0 72#define ARCH_SHF_SMALL 0
72#endif 73#endif
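
The hunk above reinstates the old compile-time debug switch in place of pr_debug(). A small sketch of the trade-off, with the pr_debug() behaviour described as commonly configured rather than verified against this exact tree: under the default #if 0 branch every DEBUGP() call vanishes at preprocessing time, whereas pr_debug() can stay in the object code and be toggled at run time when CONFIG_DYNAMIC_DEBUG is enabled.

#if 0
#define DEBUGP printk			/* flip to 1 locally to get the messages */
#else
#define DEBUGP(fmt, a...)		/* default: expands to nothing, zero cost */
#endif

static void example_use(const char *name)
{
	DEBUGP("Failed to find symbol %s\n", name);	/* compiled out by default */
}
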
@@ -107,47 +108,9 @@ static LIST_HEAD(modules);
107struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ 108struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
108#endif /* CONFIG_KGDB_KDB */ 109#endif /* CONFIG_KGDB_KDB */
109 110
110#ifdef CONFIG_MODULE_SIG
111#ifdef CONFIG_MODULE_SIG_FORCE
112static bool sig_enforce = true;
113#else
114static bool sig_enforce = false;
115
116static int param_set_bool_enable_only(const char *val,
117 const struct kernel_param *kp)
118{
119 int err;
120 bool test;
121 struct kernel_param dummy_kp = *kp;
122
123 dummy_kp.arg = &test;
124
125 err = param_set_bool(val, &dummy_kp);
126 if (err)
127 return err;
128
129 /* Don't let them unset it once it's set! */
130 if (!test && sig_enforce)
131 return -EROFS;
132
133 if (test)
134 sig_enforce = true;
135 return 0;
136}
137
138static const struct kernel_param_ops param_ops_bool_enable_only = {
139 .set = param_set_bool_enable_only,
140 .get = param_get_bool,
141};
142#define param_check_bool_enable_only param_check_bool
143
144module_param(sig_enforce, bool_enable_only, 0644);
145#endif /* !CONFIG_MODULE_SIG_FORCE */
146#endif /* CONFIG_MODULE_SIG */
147 111
148/* Block module loading/unloading? */ 112/* Block module loading/unloading? */
149int modules_disabled = 0; 113int modules_disabled = 0;
150core_param(nomodule, modules_disabled, bint, 0);
151 114
152/* Waiting for a module to finish initializing? */ 115/* Waiting for a module to finish initializing? */
153static DECLARE_WAIT_QUEUE_HEAD(module_wq); 116static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -175,10 +138,10 @@ struct load_info {
175 unsigned long len; 138 unsigned long len;
176 Elf_Shdr *sechdrs; 139 Elf_Shdr *sechdrs;
177 char *secstrings, *strtab; 140 char *secstrings, *strtab;
141 unsigned long *strmap;
178 unsigned long symoffs, stroffs; 142 unsigned long symoffs, stroffs;
179 struct _ddebug *debug; 143 struct _ddebug *debug;
180 unsigned int num_debug; 144 unsigned int num_debug;
181 bool sig_ok;
182 struct { 145 struct {
183 unsigned int sym, str, mod, vers, info, pcpu; 146 unsigned int sym, str, mod, vers, info, pcpu;
184 } index; 147 } index;
@@ -375,6 +338,9 @@ static bool check_symbol(const struct symsearch *syms,
375 printk(KERN_WARNING "Symbol %s is being used " 338 printk(KERN_WARNING "Symbol %s is being used "
376 "by a non-GPL module, which will not " 339 "by a non-GPL module, which will not "
377 "be allowed in the future\n", fsa->name); 340 "be allowed in the future\n", fsa->name);
341 printk(KERN_WARNING "Please see the file "
342 "Documentation/feature-removal-schedule.txt "
343 "in the kernel source tree for more details.\n");
378 } 344 }
379 } 345 }
380 346
@@ -444,7 +410,7 @@ const struct kernel_symbol *find_symbol(const char *name,
444 return fsa.sym; 410 return fsa.sym;
445 } 411 }
446 412
447 pr_debug("Failed to find symbol %s\n", name); 413 DEBUGP("Failed to find symbol %s\n", name);
448 return NULL; 414 return NULL;
449} 415}
450EXPORT_SYMBOL_GPL(find_symbol); 416EXPORT_SYMBOL_GPL(find_symbol);
@@ -634,11 +600,11 @@ static int already_uses(struct module *a, struct module *b)
634 600
635 list_for_each_entry(use, &b->source_list, source_list) { 601 list_for_each_entry(use, &b->source_list, source_list) {
636 if (use->source == a) { 602 if (use->source == a) {
637 pr_debug("%s uses %s!\n", a->name, b->name); 603 DEBUGP("%s uses %s!\n", a->name, b->name);
638 return 1; 604 return 1;
639 } 605 }
640 } 606 }
641 pr_debug("%s does not use %s!\n", a->name, b->name); 607 DEBUGP("%s does not use %s!\n", a->name, b->name);
642 return 0; 608 return 0;
643} 609}
644 610
@@ -653,7 +619,7 @@ static int add_module_usage(struct module *a, struct module *b)
653{ 619{
654 struct module_use *use; 620 struct module_use *use;
655 621
656 pr_debug("Allocating new usage for %s.\n", a->name); 622 DEBUGP("Allocating new usage for %s.\n", a->name);
657 use = kmalloc(sizeof(*use), GFP_ATOMIC); 623 use = kmalloc(sizeof(*use), GFP_ATOMIC);
658 if (!use) { 624 if (!use) {
659 printk(KERN_WARNING "%s: out of memory loading\n", a->name); 625 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
@@ -697,7 +663,7 @@ static void module_unload_free(struct module *mod)
697 mutex_lock(&module_mutex); 663 mutex_lock(&module_mutex);
698 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { 664 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
699 struct module *i = use->target; 665 struct module *i = use->target;
700 pr_debug("%s unusing %s\n", mod->name, i->name); 666 DEBUGP("%s unusing %s\n", mod->name, i->name);
701 module_put(i); 667 module_put(i);
702 list_del(&use->source_list); 668 list_del(&use->source_list);
703 list_del(&use->target_list); 669 list_del(&use->target_list);
@@ -760,9 +726,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
760 } 726 }
761} 727}
762 728
763unsigned long module_refcount(struct module *mod) 729unsigned int module_refcount(struct module *mod)
764{ 730{
765 unsigned long incs = 0, decs = 0; 731 unsigned int incs = 0, decs = 0;
766 int cpu; 732 int cpu;
767 733
768 for_each_possible_cpu(cpu) 734 for_each_possible_cpu(cpu)
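
The hunk above only changes the return type; the body, which the context lines cut off, just folds per-CPU counters. A sketch of the whole calculation, assuming the struct module_ref { incs, decs } layout this tree uses (the field names are an assumption, not quoted from the diff):

#include <linux/module.h>
#include <linux/percpu.h>

static unsigned int module_refcount_sketch(struct module *mod)
{
	unsigned int incs = 0, decs = 0;
	int cpu;

	/* Each CPU only ever increments its own pair, so no locking is
	 * needed here; the live reference count is the difference of the
	 * two sums. */
	for_each_possible_cpu(cpu) {
		incs += per_cpu_ptr(mod->refptr, cpu)->incs;
		decs += per_cpu_ptr(mod->refptr, cpu)->decs;
	}
	return incs - decs;
}

The real function also orders the two passes (decrements summed before increments, separated by a read barrier) so that a concurrent put/get pair cannot make the count appear negative; the sketch omits that detail.
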
@@ -795,7 +761,7 @@ static void wait_for_zero_refcount(struct module *mod)
795 /* Since we might sleep for some time, release the mutex first */ 761 /* Since we might sleep for some time, release the mutex first */
796 mutex_unlock(&module_mutex); 762 mutex_unlock(&module_mutex);
797 for (;;) { 763 for (;;) {
798 pr_debug("Looking at refcount...\n"); 764 DEBUGP("Looking at refcount...\n");
799 set_current_state(TASK_UNINTERRUPTIBLE); 765 set_current_state(TASK_UNINTERRUPTIBLE);
800 if (module_refcount(mod) == 0) 766 if (module_refcount(mod) == 0)
801 break; 767 break;
@@ -838,7 +804,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
838 if (mod->state != MODULE_STATE_LIVE) { 804 if (mod->state != MODULE_STATE_LIVE) {
839 /* FIXME: if (force), slam module count and wake up 805 /* FIXME: if (force), slam module count and wake up
840 waiter --RR */ 806 waiter --RR */
841 pr_debug("%s already dying\n", mod->name); 807 DEBUGP("%s already dying\n", mod->name);
842 ret = -EBUSY; 808 ret = -EBUSY;
843 goto out; 809 goto out;
844 } 810 }
@@ -888,7 +854,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
888 struct module_use *use; 854 struct module_use *use;
889 int printed_something = 0; 855 int printed_something = 0;
890 856
891 seq_printf(m, " %lu ", module_refcount(mod)); 857 seq_printf(m, " %u ", module_refcount(mod));
892 858
893 /* Always include a trailing , so userspace can differentiate 859 /* Always include a trailing , so userspace can differentiate
894 between this and the old multi-field proc format. */ 860 between this and the old multi-field proc format. */
@@ -938,41 +904,13 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
938static ssize_t show_refcnt(struct module_attribute *mattr, 904static ssize_t show_refcnt(struct module_attribute *mattr,
939 struct module_kobject *mk, char *buffer) 905 struct module_kobject *mk, char *buffer)
940{ 906{
941 return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); 907 return sprintf(buffer, "%u\n", module_refcount(mk->mod));
942}
943
944static struct module_attribute modinfo_refcnt =
945 __ATTR(refcnt, 0444, show_refcnt, NULL);
946
947void __module_get(struct module *module)
948{
949 if (module) {
950 preempt_disable();
951 __this_cpu_inc(module->refptr->incs);
952 trace_module_get(module, _RET_IP_);
953 preempt_enable();
954 }
955} 908}
956EXPORT_SYMBOL(__module_get);
957
958bool try_module_get(struct module *module)
959{
960 bool ret = true;
961
962 if (module) {
963 preempt_disable();
964
965 if (likely(module_is_live(module))) {
966 __this_cpu_inc(module->refptr->incs);
967 trace_module_get(module, _RET_IP_);
968 } else
969 ret = false;
970 909
971 preempt_enable(); 910static struct module_attribute refcnt = {
972 } 911 .attr = { .name = "refcnt", .mode = 0444 },
973 return ret; 912 .show = show_refcnt,
974} 913};
975EXPORT_SYMBOL(try_module_get);
976 914
977void module_put(struct module *module) 915void module_put(struct module *module)
978{ 916{
@@ -1013,26 +951,6 @@ static inline int module_unload_init(struct module *mod)
1013} 951}
1014#endif /* CONFIG_MODULE_UNLOAD */ 952#endif /* CONFIG_MODULE_UNLOAD */
1015 953
1016static size_t module_flags_taint(struct module *mod, char *buf)
1017{
1018 size_t l = 0;
1019
1020 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
1021 buf[l++] = 'P';
1022 if (mod->taints & (1 << TAINT_OOT_MODULE))
1023 buf[l++] = 'O';
1024 if (mod->taints & (1 << TAINT_FORCED_MODULE))
1025 buf[l++] = 'F';
1026 if (mod->taints & (1 << TAINT_CRAP))
1027 buf[l++] = 'C';
1028 /*
1029 * TAINT_FORCED_RMMOD: could be added.
1030 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
1031 * apply to modules.
1032 */
1033 return l;
1034}
1035
1036static ssize_t show_initstate(struct module_attribute *mattr, 954static ssize_t show_initstate(struct module_attribute *mattr,
1037 struct module_kobject *mk, char *buffer) 955 struct module_kobject *mk, char *buffer)
1038{ 956{
@@ -1052,8 +970,10 @@ static ssize_t show_initstate(struct module_attribute *mattr,
1052 return sprintf(buffer, "%s\n", state); 970 return sprintf(buffer, "%s\n", state);
1053} 971}
1054 972
1055static struct module_attribute modinfo_initstate = 973static struct module_attribute initstate = {
1056 __ATTR(initstate, 0444, show_initstate, NULL); 974 .attr = { .name = "initstate", .mode = 0444 },
975 .show = show_initstate,
976};
1057 977
1058static ssize_t store_uevent(struct module_attribute *mattr, 978static ssize_t store_uevent(struct module_attribute *mattr,
1059 struct module_kobject *mk, 979 struct module_kobject *mk,
@@ -1066,50 +986,18 @@ static ssize_t store_uevent(struct module_attribute *mattr,
1066 return count; 986 return count;
1067} 987}
1068 988
1069struct module_attribute module_uevent = 989struct module_attribute module_uevent = {
1070 __ATTR(uevent, 0200, NULL, store_uevent); 990 .attr = { .name = "uevent", .mode = 0200 },
1071 991 .store = store_uevent,
1072static ssize_t show_coresize(struct module_attribute *mattr, 992};
1073 struct module_kobject *mk, char *buffer)
1074{
1075 return sprintf(buffer, "%u\n", mk->mod->core_size);
1076}
1077
1078static struct module_attribute modinfo_coresize =
1079 __ATTR(coresize, 0444, show_coresize, NULL);
1080
1081static ssize_t show_initsize(struct module_attribute *mattr,
1082 struct module_kobject *mk, char *buffer)
1083{
1084 return sprintf(buffer, "%u\n", mk->mod->init_size);
1085}
1086
1087static struct module_attribute modinfo_initsize =
1088 __ATTR(initsize, 0444, show_initsize, NULL);
1089
1090static ssize_t show_taint(struct module_attribute *mattr,
1091 struct module_kobject *mk, char *buffer)
1092{
1093 size_t l;
1094
1095 l = module_flags_taint(mk->mod, buffer);
1096 buffer[l++] = '\n';
1097 return l;
1098}
1099
1100static struct module_attribute modinfo_taint =
1101 __ATTR(taint, 0444, show_taint, NULL);
1102 993
1103static struct module_attribute *modinfo_attrs[] = { 994static struct module_attribute *modinfo_attrs[] = {
1104 &module_uevent,
1105 &modinfo_version, 995 &modinfo_version,
1106 &modinfo_srcversion, 996 &modinfo_srcversion,
1107 &modinfo_initstate, 997 &initstate,
1108 &modinfo_coresize, 998 &module_uevent,
1109 &modinfo_initsize,
1110 &modinfo_taint,
1111#ifdef CONFIG_MODULE_UNLOAD 999#ifdef CONFIG_MODULE_UNLOAD
1112 &modinfo_refcnt, 1000 &refcnt,
1113#endif 1001#endif
1114 NULL, 1002 NULL,
1115}; 1003};
@@ -1169,7 +1057,7 @@ static int check_version(Elf_Shdr *sechdrs,
1169 1057
1170 if (versions[i].crc == maybe_relocated(*crc, crc_owner)) 1058 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1171 return 1; 1059 return 1;
1172 pr_debug("Found checksum %lX vs module %lX\n", 1060 DEBUGP("Found checksum %lX vs module %lX\n",
1173 maybe_relocated(*crc, crc_owner), versions[i].crc); 1061 maybe_relocated(*crc, crc_owner), versions[i].crc);
1174 goto bad_version; 1062 goto bad_version;
1175 } 1063 }
@@ -1946,7 +1834,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1946 case SHN_COMMON: 1834 case SHN_COMMON:
1947 /* We compiled with -fno-common. These are not 1835 /* We compiled with -fno-common. These are not
1948 supposed to happen. */ 1836 supposed to happen. */
1949 pr_debug("Common symbol: %s\n", name); 1837 DEBUGP("Common symbol: %s\n", name);
1950 printk("%s: please compile with -fno-common\n", 1838 printk("%s: please compile with -fno-common\n",
1951 mod->name); 1839 mod->name);
1952 ret = -ENOEXEC; 1840 ret = -ENOEXEC;
@@ -1954,7 +1842,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1954 1842
1955 case SHN_ABS: 1843 case SHN_ABS:
1956 /* Don't need to do anything */ 1844 /* Don't need to do anything */
1957 pr_debug("Absolute symbol: 0x%08lx\n", 1845 DEBUGP("Absolute symbol: 0x%08lx\n",
1958 (long)sym[i].st_value); 1846 (long)sym[i].st_value);
1959 break; 1847 break;
1960 1848
@@ -1989,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1989 return ret; 1877 return ret;
1990} 1878}
1991 1879
1880int __weak apply_relocate(Elf_Shdr *sechdrs,
1881 const char *strtab,
1882 unsigned int symindex,
1883 unsigned int relsec,
1884 struct module *me)
1885{
1886 pr_err("module %s: REL relocation unsupported\n", me->name);
1887 return -ENOEXEC;
1888}
1889
1890int __weak apply_relocate_add(Elf_Shdr *sechdrs,
1891 const char *strtab,
1892 unsigned int symindex,
1893 unsigned int relsec,
1894 struct module *me)
1895{
1896 pr_err("module %s: RELA relocation unsupported\n", me->name);
1897 return -ENOEXEC;
1898}
1899
1992static int apply_relocations(struct module *mod, const struct load_info *info) 1900static int apply_relocations(struct module *mod, const struct load_info *info)
1993{ 1901{
1994 unsigned int i; 1902 unsigned int i;
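
The added apply_relocate()/apply_relocate_add() stubs rely on weak linkage: the generic -ENOEXEC version is linked in only when the architecture does not provide its own strong symbol. A tiny illustrative sketch of the pattern, using a hypothetical hook name (arch_frob() is made up for the example) and two separate source files:

#include <linux/errno.h>
#include <linux/compiler.h>

/* kernel/generic.c (hypothetical): fallback, used only when no
 * architecture defines the symbol itself. */
int __weak arch_frob(void)
{
	return -ENOSYS;
}

/* arch/foo/kernel/frob.c (hypothetical): a strong definition of the same
 * symbol; at link time it overrides the weak one with no Kconfig glue. */
int arch_frob(void)
{
	return 0;
}
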
@@ -2058,7 +1966,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2058 for (i = 0; i < info->hdr->e_shnum; i++) 1966 for (i = 0; i < info->hdr->e_shnum; i++)
2059 info->sechdrs[i].sh_entsize = ~0UL; 1967 info->sechdrs[i].sh_entsize = ~0UL;
2060 1968
2061 pr_debug("Core section allocation order:\n"); 1969 DEBUGP("Core section allocation order:\n");
2062 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1970 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
2063 for (i = 0; i < info->hdr->e_shnum; ++i) { 1971 for (i = 0; i < info->hdr->e_shnum; ++i) {
2064 Elf_Shdr *s = &info->sechdrs[i]; 1972 Elf_Shdr *s = &info->sechdrs[i];
@@ -2070,7 +1978,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2070 || strstarts(sname, ".init")) 1978 || strstarts(sname, ".init"))
2071 continue; 1979 continue;
2072 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1980 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
2073 pr_debug("\t%s\n", sname); 1981 DEBUGP("\t%s\n", name);
2074 } 1982 }
2075 switch (m) { 1983 switch (m) {
2076 case 0: /* executable */ 1984 case 0: /* executable */
@@ -2087,7 +1995,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2087 } 1995 }
2088 } 1996 }
2089 1997
2090 pr_debug("Init section allocation order:\n"); 1998 DEBUGP("Init section allocation order:\n");
2091 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1999 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
2092 for (i = 0; i < info->hdr->e_shnum; ++i) { 2000 for (i = 0; i < info->hdr->e_shnum; ++i) {
2093 Elf_Shdr *s = &info->sechdrs[i]; 2001 Elf_Shdr *s = &info->sechdrs[i];
@@ -2100,7 +2008,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2100 continue; 2008 continue;
2101 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 2009 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
2102 | INIT_OFFSET_MASK); 2010 | INIT_OFFSET_MASK);
2103 pr_debug("\t%s\n", sname); 2011 DEBUGP("\t%s\n", sname);
2104 } 2012 }
2105 switch (m) { 2013 switch (m) {
2106 case 0: /* executable */ 2014 case 0: /* executable */
@@ -2270,48 +2178,45 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2270 return true; 2178 return true;
2271} 2179}
2272 2180
2273/*
2274 * We only allocate and copy the strings needed by the parts of symtab
2275 * we keep. This is simple, but has the effect of making multiple
2276 * copies of duplicates. We could be more sophisticated, see
2277 * linux-kernel thread starting with
2278 * <73defb5e4bca04a6431392cc341112b1@localhost>.
2279 */
2280static void layout_symtab(struct module *mod, struct load_info *info) 2181static void layout_symtab(struct module *mod, struct load_info *info)
2281{ 2182{
2282 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2183 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2283 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2184 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2284 const Elf_Sym *src; 2185 const Elf_Sym *src;
2285 unsigned int i, nsrc, ndst, strtab_size = 0; 2186 unsigned int i, nsrc, ndst;
2286 2187
2287 /* Put symbol section at end of init part of module. */ 2188 /* Put symbol section at end of init part of module. */
2288 symsect->sh_flags |= SHF_ALLOC; 2189 symsect->sh_flags |= SHF_ALLOC;
2289 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 2190 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
2290 info->index.sym) | INIT_OFFSET_MASK; 2191 info->index.sym) | INIT_OFFSET_MASK;
2291 pr_debug("\t%s\n", info->secstrings + symsect->sh_name); 2192 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
2292 2193
2293 src = (void *)info->hdr + symsect->sh_offset; 2194 src = (void *)info->hdr + symsect->sh_offset;
2294 nsrc = symsect->sh_size / sizeof(*src); 2195 nsrc = symsect->sh_size / sizeof(*src);
2295 2196 for (ndst = i = 1; i < nsrc; ++i, ++src)
2296 /* Compute total space required for the core symbols' strtab. */ 2197 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2297 for (ndst = i = 0; i < nsrc; i++) { 2198 unsigned int j = src->st_name;
2298 if (i == 0 || 2199
2299 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { 2200 while (!__test_and_set_bit(j, info->strmap)
2300 strtab_size += strlen(&info->strtab[src[i].st_name])+1; 2201 && info->strtab[j])
2301 ndst++; 2202 ++j;
2203 ++ndst;
2302 } 2204 }
2303 }
2304 2205
2305 /* Append room for core symbols at end of core part. */ 2206 /* Append room for core symbols at end of core part. */
2306 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2207 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
2307 info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); 2208 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
2308 mod->core_size += strtab_size;
2309 2209
2310 /* Put string table section at end of init part of module. */ 2210 /* Put string table section at end of init part of module. */
2311 strsect->sh_flags |= SHF_ALLOC; 2211 strsect->sh_flags |= SHF_ALLOC;
2312 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 2212 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
2313 info->index.str) | INIT_OFFSET_MASK; 2213 info->index.str) | INIT_OFFSET_MASK;
2314 pr_debug("\t%s\n", info->secstrings + strsect->sh_name); 2214 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
2215
2216 /* Append room for core symbols' strings at end of core part. */
2217 info->stroffs = mod->core_size;
2218 __set_bit(0, info->strmap);
2219 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
2315} 2220}
2316 2221
2317static void add_kallsyms(struct module *mod, const struct load_info *info) 2222static void add_kallsyms(struct module *mod, const struct load_info *info)
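
Both this hunk and the add_kallsyms() hunk below pivot on info->strmap, which the left-hand code drops in favour of an explicit strtab_size. The trick in the right-hand (restored) code is worth spelling out, as a sketch under the assumption that strmap carries one bit per byte of the original string table:

#include <linux/bitmap.h>

/*
 * layout_symtab() sets a bit for every byte (including the trailing NUL)
 * of each kept symbol's name.  The compacted table is then exactly
 * bitmap_weight(strmap, strtab_len) bytes long, and a name that started
 * at old_off starts in the compacted table at the number of set bits
 * strictly below old_off, which is what add_kallsyms() stores in st_name.
 */
static unsigned long compact_offset(const unsigned long *strmap,
				    unsigned long old_off)
{
	return bitmap_weight(strmap, old_off);
}
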
@@ -2332,18 +2237,22 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2332 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); 2237 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2333 2238
2334 mod->core_symtab = dst = mod->module_core + info->symoffs; 2239 mod->core_symtab = dst = mod->module_core + info->symoffs;
2335 mod->core_strtab = s = mod->module_core + info->stroffs;
2336 src = mod->symtab; 2240 src = mod->symtab;
2337 for (ndst = i = 0; i < mod->num_symtab; i++) { 2241 *dst = *src;
2338 if (i == 0 || 2242 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2339 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { 2243 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2340 dst[ndst] = src[i]; 2244 continue;
2341 dst[ndst++].st_name = s - mod->core_strtab; 2245 dst[ndst] = *src;
2342 s += strlcpy(s, &mod->strtab[src[i].st_name], 2246 dst[ndst].st_name = bitmap_weight(info->strmap,
2343 KSYM_NAME_LEN) + 1; 2247 dst[ndst].st_name);
2344 } 2248 ++ndst;
2345 } 2249 }
2346 mod->core_num_syms = ndst; 2250 mod->core_num_syms = ndst;
2251
2252 mod->core_strtab = s = mod->module_core + info->stroffs;
2253 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2254 if (test_bit(i, info->strmap))
2255 *++s = mod->strtab[i];
2347} 2256}
2348#else 2257#else
2349static inline void layout_symtab(struct module *mod, struct load_info *info) 2258static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -2374,7 +2283,7 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2374 2283
2375void * __weak module_alloc(unsigned long size) 2284void * __weak module_alloc(unsigned long size)
2376{ 2285{
2377 return vmalloc_exec(size); 2286 return size == 0 ? NULL : vmalloc_exec(size);
2378} 2287}
2379 2288
2380static void *module_alloc_update_bounds(unsigned long size) 2289static void *module_alloc_update_bounds(unsigned long size)
@@ -2420,136 +2329,48 @@ static inline void kmemleak_load_module(const struct module *mod,
2420} 2329}
2421#endif 2330#endif
2422 2331
2423#ifdef CONFIG_MODULE_SIG
2424static int module_sig_check(struct load_info *info)
2425{
2426 int err = -ENOKEY;
2427 const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
2428 const void *mod = info->hdr;
2429
2430 if (info->len > markerlen &&
2431 memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
2432 /* We truncate the module to discard the signature */
2433 info->len -= markerlen;
2434 err = mod_verify_sig(mod, &info->len);
2435 }
2436
2437 if (!err) {
2438 info->sig_ok = true;
2439 return 0;
2440 }
2441
2442 /* Not having a signature is only an error if we're strict. */
2443 if (err < 0 && fips_enabled)
2444 panic("Module verification failed with error %d in FIPS mode\n",
2445 err);
2446 if (err == -ENOKEY && !sig_enforce)
2447 err = 0;
2448
2449 return err;
2450}
2451#else /* !CONFIG_MODULE_SIG */
2452static int module_sig_check(struct load_info *info)
2453{
2454 return 0;
2455}
2456#endif /* !CONFIG_MODULE_SIG */
2457
2458/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
2459static int elf_header_check(struct load_info *info)
2460{
2461 if (info->len < sizeof(*(info->hdr)))
2462 return -ENOEXEC;
2463
2464 if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
2465 || info->hdr->e_type != ET_REL
2466 || !elf_check_arch(info->hdr)
2467 || info->hdr->e_shentsize != sizeof(Elf_Shdr))
2468 return -ENOEXEC;
2469
2470 if (info->hdr->e_shoff >= info->len
2471 || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
2472 info->len - info->hdr->e_shoff))
2473 return -ENOEXEC;
2474
2475 return 0;
2476}
2477
2478/* Sets info->hdr and info->len. */ 2332/* Sets info->hdr and info->len. */
2479static int copy_module_from_user(const void __user *umod, unsigned long len, 2333static int copy_and_check(struct load_info *info,
2480 struct load_info *info) 2334 const void __user *umod, unsigned long len,
2335 const char __user *uargs)
2481{ 2336{
2482 int err; 2337 int err;
2338 Elf_Ehdr *hdr;
2483 2339
2484 info->len = len; 2340 if (len < sizeof(*hdr))
2485 if (info->len < sizeof(*(info->hdr)))
2486 return -ENOEXEC; 2341 return -ENOEXEC;
2487 2342
2488 err = security_kernel_module_from_file(NULL);
2489 if (err)
2490 return err;
2491
2492 /* Suck in entire file: we'll want most of it. */ 2343 /* Suck in entire file: we'll want most of it. */
2493 info->hdr = vmalloc(info->len); 2344 /* vmalloc barfs on "unusual" numbers. Check here */
2494 if (!info->hdr) 2345 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2495 return -ENOMEM; 2346 return -ENOMEM;
2496 2347
2497 if (copy_from_user(info->hdr, umod, info->len) != 0) { 2348 if (copy_from_user(hdr, umod, len) != 0) {
2498 vfree(info->hdr); 2349 err = -EFAULT;
2499 return -EFAULT; 2350 goto free_hdr;
2500 } 2351 }
2501 2352
2502 return 0; 2353 /* Sanity checks against insmoding binaries or wrong arch,
2503} 2354 weird elf version */
2504 2355 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
2505/* Sets info->hdr and info->len. */ 2356 || hdr->e_type != ET_REL
2506static int copy_module_from_fd(int fd, struct load_info *info) 2357 || !elf_check_arch(hdr)
2507{ 2358 || hdr->e_shentsize != sizeof(Elf_Shdr)) {
2508 struct file *file; 2359 err = -ENOEXEC;
2509 int err; 2360 goto free_hdr;
2510 struct kstat stat;
2511 loff_t pos;
2512 ssize_t bytes = 0;
2513
2514 file = fget(fd);
2515 if (!file)
2516 return -ENOEXEC;
2517
2518 err = security_kernel_module_from_file(file);
2519 if (err)
2520 goto out;
2521
2522 err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat);
2523 if (err)
2524 goto out;
2525
2526 if (stat.size > INT_MAX) {
2527 err = -EFBIG;
2528 goto out;
2529 }
2530 info->hdr = vmalloc(stat.size);
2531 if (!info->hdr) {
2532 err = -ENOMEM;
2533 goto out;
2534 } 2361 }
2535 2362
2536 pos = 0; 2363 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
2537 while (pos < stat.size) { 2364 err = -ENOEXEC;
2538 bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, 2365 goto free_hdr;
2539 stat.size - pos);
2540 if (bytes < 0) {
2541 vfree(info->hdr);
2542 err = bytes;
2543 goto out;
2544 }
2545 if (bytes == 0)
2546 break;
2547 pos += bytes;
2548 } 2366 }
2549 info->len = pos;
2550 2367
2551out: 2368 info->hdr = hdr;
2552 fput(file); 2369 info->len = len;
2370 return 0;
2371
2372free_hdr:
2373 vfree(hdr);
2553 return err; 2374 return err;
2554} 2375}
2555 2376
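
Both sides of this hunk end up doing the same basic vetting of the user-supplied image before any section header is dereferenced; the left-hand code keeps it in a separate elf_header_check(), the right-hand code inlines it into copy_and_check(). A consolidated sketch of those checks, assuming kernel context, as a restatement of what the diff shows rather than new policy:

#include <linux/elf.h>
#include <linux/errno.h>
#include <linux/string.h>

static int sketch_elf_header_check(const Elf_Ehdr *hdr, unsigned long len)
{
	if (len < sizeof(*hdr))
		return -ENOEXEC;			/* too short to even hold the header */

	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0	/* "\177ELF" magic */
	    || hdr->e_type != ET_REL			/* modules are relocatable objects */
	    || !elf_check_arch(hdr)			/* built for this architecture */
	    || hdr->e_shentsize != sizeof(Elf_Shdr))	/* section headers of the expected size */
		return -ENOEXEC;

	if (hdr->e_shoff >= len
	    || hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff)
		return -ENOEXEC;			/* section header table must fit in the image */

	return 0;
}
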
@@ -2558,7 +2379,7 @@ static void free_copy(struct load_info *info)
2558 vfree(info->hdr); 2379 vfree(info->hdr);
2559} 2380}
2560 2381
2561static int rewrite_section_headers(struct load_info *info, int flags) 2382static int rewrite_section_headers(struct load_info *info)
2562{ 2383{
2563 unsigned int i; 2384 unsigned int i;
2564 2385
@@ -2586,10 +2407,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
2586 } 2407 }
2587 2408
2588 /* Track but don't keep modinfo and version sections. */ 2409 /* Track but don't keep modinfo and version sections. */
2589 if (flags & MODULE_INIT_IGNORE_MODVERSIONS) 2410 info->index.vers = find_sec(info, "__versions");
2590 info->index.vers = 0; /* Pretend no __versions section! */
2591 else
2592 info->index.vers = find_sec(info, "__versions");
2593 info->index.info = find_sec(info, ".modinfo"); 2411 info->index.info = find_sec(info, ".modinfo");
2594 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; 2412 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2595 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; 2413 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -2604,7 +2422,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
2604 * Return the temporary module pointer (we'll replace it with the final 2422 * Return the temporary module pointer (we'll replace it with the final
2605 * one when we move the module sections around). 2423 * one when we move the module sections around).
2606 */ 2424 */
2607static struct module *setup_load_info(struct load_info *info, int flags) 2425static struct module *setup_load_info(struct load_info *info)
2608{ 2426{
2609 unsigned int i; 2427 unsigned int i;
2610 int err; 2428 int err;
@@ -2615,7 +2433,7 @@ static struct module *setup_load_info(struct load_info *info, int flags)
2615 info->secstrings = (void *)info->hdr 2433 info->secstrings = (void *)info->hdr
2616 + info->sechdrs[info->hdr->e_shstrndx].sh_offset; 2434 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2617 2435
2618 err = rewrite_section_headers(info, flags); 2436 err = rewrite_section_headers(info);
2619 if (err) 2437 if (err)
2620 return ERR_PTR(err); 2438 return ERR_PTR(err);
2621 2439
@@ -2653,14 +2471,11 @@ static struct module *setup_load_info(struct load_info *info, int flags)
2653 return mod; 2471 return mod;
2654} 2472}
2655 2473
2656static int check_modinfo(struct module *mod, struct load_info *info, int flags) 2474static int check_modinfo(struct module *mod, struct load_info *info)
2657{ 2475{
2658 const char *modmagic = get_modinfo(info, "vermagic"); 2476 const char *modmagic = get_modinfo(info, "vermagic");
2659 int err; 2477 int err;
2660 2478
2661 if (flags & MODULE_INIT_IGNORE_VERMAGIC)
2662 modmagic = NULL;
2663
2664 /* This is allowed: modprobe --force will invalidate it. */ 2479 /* This is allowed: modprobe --force will invalidate it. */
2665 if (!modmagic) { 2480 if (!modmagic) {
2666 err = try_to_force_load(mod, "bad vermagic"); 2481 err = try_to_force_load(mod, "bad vermagic");
@@ -2672,9 +2487,6 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2672 return -ENOEXEC; 2487 return -ENOEXEC;
2673 } 2488 }
2674 2489
2675 if (!get_modinfo(info, "intree"))
2676 add_taint_module(mod, TAINT_OOT_MODULE);
2677
2678 if (get_modinfo(info, "staging")) { 2490 if (get_modinfo(info, "staging")) {
2679 add_taint_module(mod, TAINT_CRAP); 2491 add_taint_module(mod, TAINT_CRAP);
2680 printk(KERN_WARNING "%s: module is from the staging directory," 2492 printk(KERN_WARNING "%s: module is from the staging directory,"
@@ -2716,7 +2528,7 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2716 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); 2528 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
2717#endif 2529#endif
2718#ifdef CONFIG_CONSTRUCTORS 2530#ifdef CONFIG_CONSTRUCTORS
2719 mod->ctors = section_objs(info, ".ctors", 2531 mod->ctors = section_objs(info, CONFIG_GCOV_CTORS,
2720 sizeof(*mod->ctors), &mod->num_ctors); 2532 sizeof(*mod->ctors), &mod->num_ctors);
2721#endif 2533#endif
2722 2534
@@ -2790,26 +2602,23 @@ static int move_module(struct module *mod, struct load_info *info)
2790 memset(ptr, 0, mod->core_size); 2602 memset(ptr, 0, mod->core_size);
2791 mod->module_core = ptr; 2603 mod->module_core = ptr;
2792 2604
2793 if (mod->init_size) { 2605 ptr = module_alloc_update_bounds(mod->init_size);
2794 ptr = module_alloc_update_bounds(mod->init_size); 2606 /*
2795 /* 2607 * The pointer to this block is stored in the module structure
2796 * The pointer to this block is stored in the module structure 2608 * which is inside the block. This block doesn't need to be
2797 * which is inside the block. This block doesn't need to be 2609 * scanned as it contains data and code that will be freed
2798 * scanned as it contains data and code that will be freed 2610 * after the module is initialized.
2799 * after the module is initialized. 2611 */
2800 */ 2612 kmemleak_ignore(ptr);
2801 kmemleak_ignore(ptr); 2613 if (!ptr && mod->init_size) {
2802 if (!ptr) { 2614 module_free(mod, mod->module_core);
2803 module_free(mod, mod->module_core); 2615 return -ENOMEM;
2804 return -ENOMEM; 2616 }
2805 } 2617 memset(ptr, 0, mod->init_size);
2806 memset(ptr, 0, mod->init_size); 2618 mod->module_init = ptr;
2807 mod->module_init = ptr;
2808 } else
2809 mod->module_init = NULL;
2810 2619
2811 /* Transfer each section which specifies SHF_ALLOC */ 2620 /* Transfer each section which specifies SHF_ALLOC */
2812 pr_debug("final section addresses:\n"); 2621 DEBUGP("final section addresses:\n");
2813 for (i = 0; i < info->hdr->e_shnum; i++) { 2622 for (i = 0; i < info->hdr->e_shnum; i++) {
2814 void *dest; 2623 void *dest;
2815 Elf_Shdr *shdr = &info->sechdrs[i]; 2624 Elf_Shdr *shdr = &info->sechdrs[i];
@@ -2827,8 +2636,8 @@ static int move_module(struct module *mod, struct load_info *info)
2827 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); 2636 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2828 /* Update sh_addr to point to copy in image. */ 2637 /* Update sh_addr to point to copy in image. */
2829 shdr->sh_addr = (unsigned long)dest; 2638 shdr->sh_addr = (unsigned long)dest;
2830 pr_debug("\t0x%lx %s\n", 2639 DEBUGP("\t0x%lx %s\n",
2831 (long)shdr->sh_addr, info->secstrings + shdr->sh_name); 2640 shdr->sh_addr, info->secstrings + shdr->sh_name);
2832 } 2641 }
2833 2642
2834 return 0; 2643 return 0;
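The two move_module() hunks above follow the usual two-pass pattern: layout_sections() first assigns each SHF_ALLOC section an offset inside one core (or init) region, the region is allocated once, and every allocatable section is then copied to its slot with sh_addr rewritten to point at the copy. A minimal userspace sketch of that pattern follows; the sections, sizes, data and 8-byte alignment are invented for the demo.

/* Userspace sketch of "lay out, allocate once, copy each allocatable
 * section" as done by layout_sections()/move_module() above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct section {
    const char *name;
    const unsigned char *data;
    size_t size;
    int alloc;      /* stand-in for SHF_ALLOC */
    size_t offset;  /* assigned in pass 1 */
};

int main(void)
{
    struct section secs[] = {
        { ".text",   (const unsigned char *)"\x90\x90\xc3", 3, 1, 0 },
        { ".rodata", (const unsigned char *)"hi",           2, 1, 0 },
        { ".debug",  (const unsigned char *)"dbg",          3, 0, 0 },
    };
    size_t i, core_size = 0;
    unsigned char *core;

    /* Pass 1: assign offsets inside one core region (8-byte aligned). */
    for (i = 0; i < sizeof(secs) / sizeof(secs[0]); i++) {
        if (!secs[i].alloc)
            continue;
        core_size = (core_size + 7) & ~(size_t)7;
        secs[i].offset = core_size;
        core_size += secs[i].size;
    }

    /* Pass 2: allocate the region and copy each section to its slot. */
    core = calloc(1, core_size);
    if (!core)
        return 1;
    for (i = 0; i < sizeof(secs) / sizeof(secs[0]); i++) {
        if (!secs[i].alloc)
            continue;
        memcpy(core + secs[i].offset, secs[i].data, secs[i].size);
        printf("%-8s -> core+%zu (%zu bytes)\n",
               secs[i].name, secs[i].offset, secs[i].size);
    }
    free(core);
    return 0;
}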
@@ -2848,10 +2657,6 @@ static int check_module_license_and_versions(struct module *mod)
2848 if (strcmp(mod->name, "driverloader") == 0) 2657 if (strcmp(mod->name, "driverloader") == 0)
2849 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2658 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2850 2659
2851 /* lve claims to be GPL but upstream won't provide source */
2852 if (strcmp(mod->name, "lve") == 0)
2853 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2854
2855#ifdef CONFIG_MODVERSIONS 2660#ifdef CONFIG_MODVERSIONS
2856 if ((mod->num_syms && !mod->crcs) 2661 if ((mod->num_syms && !mod->crcs)
2857 || (mod->num_gpl_syms && !mod->gpl_crcs) 2662 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2899,18 +2704,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2899 return 0; 2704 return 0;
2900} 2705}
2901 2706
2902static struct module *layout_and_allocate(struct load_info *info, int flags) 2707static struct module *layout_and_allocate(struct load_info *info)
2903{ 2708{
2904 /* Module within temporary copy. */ 2709 /* Module within temporary copy. */
2905 struct module *mod; 2710 struct module *mod;
2906 Elf_Shdr *pcpusec; 2711 Elf_Shdr *pcpusec;
2907 int err; 2712 int err;
2908 2713
2909 mod = setup_load_info(info, flags); 2714 mod = setup_load_info(info);
2910 if (IS_ERR(mod)) 2715 if (IS_ERR(mod))
2911 return mod; 2716 return mod;
2912 2717
2913 err = check_modinfo(mod, info, flags); 2718 err = check_modinfo(mod, info);
2914 if (err) 2719 if (err)
2915 return ERR_PTR(err); 2720 return ERR_PTR(err);
2916 2721
@@ -2934,18 +2739,27 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
 2934 this is done generically; there don't appear to be any 2739 this is done generically; there don't appear to be any
2935 special cases for the architectures. */ 2740 special cases for the architectures. */
2936 layout_sections(mod, info); 2741 layout_sections(mod, info);
2742
2743 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2744 * sizeof(long), GFP_KERNEL);
2745 if (!info->strmap) {
2746 err = -ENOMEM;
2747 goto free_percpu;
2748 }
2937 layout_symtab(mod, info); 2749 layout_symtab(mod, info);
2938 2750
2939 /* Allocate and move to the final place */ 2751 /* Allocate and move to the final place */
2940 err = move_module(mod, info); 2752 err = move_module(mod, info);
2941 if (err) 2753 if (err)
2942 goto free_percpu; 2754 goto free_strmap;
2943 2755
2944 /* Module has been copied to its final place now: return it. */ 2756 /* Module has been copied to its final place now: return it. */
2945 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2757 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2946 kmemleak_load_module(mod, info); 2758 kmemleak_load_module(mod, info);
2947 return mod; 2759 return mod;
2948 2760
2761free_strmap:
2762 kfree(info->strmap);
2949free_percpu: 2763free_percpu:
2950 percpu_modfree(mod); 2764 percpu_modfree(mod);
2951out: 2765out:
@@ -2955,6 +2769,7 @@ out:
2955/* mod is no longer valid after this! */ 2769/* mod is no longer valid after this! */
2956static void module_deallocate(struct module *mod, struct load_info *info) 2770static void module_deallocate(struct module *mod, struct load_info *info)
2957{ 2771{
2772 kfree(info->strmap);
2958 percpu_modfree(mod); 2773 percpu_modfree(mod);
2959 module_free(mod, mod->module_init); 2774 module_free(mod, mod->module_init);
2960 module_free(mod, mod->module_core); 2775 module_free(mod, mod->module_core);
@@ -2983,142 +2798,31 @@ static int post_relocation(struct module *mod, const struct load_info *info)
2983 return module_finalize(info->hdr, info->sechdrs, mod); 2798 return module_finalize(info->hdr, info->sechdrs, mod);
2984} 2799}
2985 2800
2986/* Is this module of this name done loading? No locks held. */
2987static bool finished_loading(const char *name)
2988{
2989 struct module *mod;
2990 bool ret;
2991
2992 mutex_lock(&module_mutex);
2993 mod = find_module(name);
2994 ret = !mod || mod->state != MODULE_STATE_COMING;
2995 mutex_unlock(&module_mutex);
2996
2997 return ret;
2998}
2999
3000/* Call module constructors. */
3001static void do_mod_ctors(struct module *mod)
3002{
3003#ifdef CONFIG_CONSTRUCTORS
3004 unsigned long i;
3005
3006 for (i = 0; i < mod->num_ctors; i++)
3007 mod->ctors[i]();
3008#endif
3009}
3010
3011/* This is where the real work happens */
3012static int do_init_module(struct module *mod)
3013{
3014 int ret = 0;
3015
3016 blocking_notifier_call_chain(&module_notify_list,
3017 MODULE_STATE_COMING, mod);
3018
3019 /* Set RO and NX regions for core */
3020 set_section_ro_nx(mod->module_core,
3021 mod->core_text_size,
3022 mod->core_ro_size,
3023 mod->core_size);
3024
3025 /* Set RO and NX regions for init */
3026 set_section_ro_nx(mod->module_init,
3027 mod->init_text_size,
3028 mod->init_ro_size,
3029 mod->init_size);
3030
3031 do_mod_ctors(mod);
3032 /* Start the module */
3033 if (mod->init != NULL)
3034 ret = do_one_initcall(mod->init);
3035 if (ret < 0) {
3036 /* Init routine failed: abort. Try to protect us from
3037 buggy refcounters. */
3038 mod->state = MODULE_STATE_GOING;
3039 synchronize_sched();
3040 module_put(mod);
3041 blocking_notifier_call_chain(&module_notify_list,
3042 MODULE_STATE_GOING, mod);
3043 free_module(mod);
3044 wake_up_all(&module_wq);
3045 return ret;
3046 }
3047 if (ret > 0) {
3048 printk(KERN_WARNING
3049"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3050"%s: loading module anyway...\n",
3051 __func__, mod->name, ret,
3052 __func__);
3053 dump_stack();
3054 }
3055
3056 /* Now it's a first class citizen! */
3057 mod->state = MODULE_STATE_LIVE;
3058 blocking_notifier_call_chain(&module_notify_list,
3059 MODULE_STATE_LIVE, mod);
3060
3061 /* We need to finish all async code before the module init sequence is done */
3062 async_synchronize_full();
3063
3064 mutex_lock(&module_mutex);
3065 /* Drop initial reference. */
3066 module_put(mod);
3067 trim_init_extable(mod);
3068#ifdef CONFIG_KALLSYMS
3069 mod->num_symtab = mod->core_num_syms;
3070 mod->symtab = mod->core_symtab;
3071 mod->strtab = mod->core_strtab;
3072#endif
3073 unset_module_init_ro_nx(mod);
3074 module_free(mod, mod->module_init);
3075 mod->module_init = NULL;
3076 mod->init_size = 0;
3077 mod->init_ro_size = 0;
3078 mod->init_text_size = 0;
3079 mutex_unlock(&module_mutex);
3080 wake_up_all(&module_wq);
3081
3082 return 0;
3083}
3084
3085static int may_init_module(void)
3086{
3087 if (!capable(CAP_SYS_MODULE) || modules_disabled)
3088 return -EPERM;
3089
3090 return 0;
3091}
3092
3093/* Allocate and load the module: note that size of section 0 is always 2801/* Allocate and load the module: note that size of section 0 is always
3094 zero, and we rely on this for optional sections. */ 2802 zero, and we rely on this for optional sections. */
3095static int load_module(struct load_info *info, const char __user *uargs, 2803static struct module *load_module(void __user *umod,
3096 int flags) 2804 unsigned long len,
2805 const char __user *uargs)
3097{ 2806{
3098 struct module *mod, *old; 2807 struct load_info info = { NULL, };
2808 struct module *mod;
3099 long err; 2809 long err;
3100 2810
3101 err = module_sig_check(info); 2811 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
3102 if (err) 2812 umod, len, uargs);
3103 goto free_copy;
3104 2813
3105 err = elf_header_check(info); 2814 /* Copy in the blobs from userspace, check they are vaguely sane. */
2815 err = copy_and_check(&info, umod, len, uargs);
3106 if (err) 2816 if (err)
3107 goto free_copy; 2817 return ERR_PTR(err);
3108 2818
3109 /* Figure out module layout, and allocate all the memory. */ 2819 /* Figure out module layout, and allocate all the memory. */
3110 mod = layout_and_allocate(info, flags); 2820 mod = layout_and_allocate(&info);
3111 if (IS_ERR(mod)) { 2821 if (IS_ERR(mod)) {
3112 err = PTR_ERR(mod); 2822 err = PTR_ERR(mod);
3113 goto free_copy; 2823 goto free_copy;
3114 } 2824 }
3115 2825
3116#ifdef CONFIG_MODULE_SIG
3117 mod->sig_ok = info->sig_ok;
3118 if (!mod->sig_ok)
3119 add_taint_module(mod, TAINT_FORCED_MODULE);
3120#endif
3121
3122 /* Now module is in final location, initialize linked lists, etc. */ 2826 /* Now module is in final location, initialize linked lists, etc. */
3123 err = module_unload_init(mod); 2827 err = module_unload_init(mod);
3124 if (err) 2828 if (err)
@@ -3126,25 +2830,25 @@ static int load_module(struct load_info *info, const char __user *uargs,
3126 2830
3127 /* Now we've got everything in the final locations, we can 2831 /* Now we've got everything in the final locations, we can
3128 * find optional sections. */ 2832 * find optional sections. */
3129 find_module_sections(mod, info); 2833 find_module_sections(mod, &info);
3130 2834
3131 err = check_module_license_and_versions(mod); 2835 err = check_module_license_and_versions(mod);
3132 if (err) 2836 if (err)
3133 goto free_unload; 2837 goto free_unload;
3134 2838
3135 /* Set up MODINFO_ATTR fields */ 2839 /* Set up MODINFO_ATTR fields */
3136 setup_modinfo(mod, info); 2840 setup_modinfo(mod, &info);
3137 2841
3138 /* Fix up syms, so that st_value is a pointer to location. */ 2842 /* Fix up syms, so that st_value is a pointer to location. */
3139 err = simplify_symbols(mod, info); 2843 err = simplify_symbols(mod, &info);
3140 if (err < 0) 2844 if (err < 0)
3141 goto free_modinfo; 2845 goto free_modinfo;
3142 2846
3143 err = apply_relocations(mod, info); 2847 err = apply_relocations(mod, &info);
3144 if (err < 0) 2848 if (err < 0)
3145 goto free_modinfo; 2849 goto free_modinfo;
3146 2850
3147 err = post_relocation(mod, info); 2851 err = post_relocation(mod, &info);
3148 if (err < 0) 2852 if (err < 0)
3149 goto free_modinfo; 2853 goto free_modinfo;
3150 2854
@@ -3167,61 +2871,52 @@ static int load_module(struct load_info *info, const char __user *uargs,
3167 * function to insert in a way safe to concurrent readers. 2871 * function to insert in a way safe to concurrent readers.
3168 * The mutex protects against concurrent writers. 2872 * The mutex protects against concurrent writers.
3169 */ 2873 */
3170again:
3171 mutex_lock(&module_mutex); 2874 mutex_lock(&module_mutex);
3172 if ((old = find_module(mod->name)) != NULL) { 2875 if (find_module(mod->name)) {
3173 if (old->state == MODULE_STATE_COMING) {
3174 /* Wait in case it fails to load. */
3175 mutex_unlock(&module_mutex);
3176 err = wait_event_interruptible(module_wq,
3177 finished_loading(mod->name));
3178 if (err)
3179 goto free_arch_cleanup;
3180 goto again;
3181 }
3182 err = -EEXIST; 2876 err = -EEXIST;
3183 goto unlock; 2877 goto unlock;
3184 } 2878 }
3185 2879
3186 /* This has to be done once we're sure module name is unique. */ 2880 /* This has to be done once we're sure module name is unique. */
3187 dynamic_debug_setup(info->debug, info->num_debug); 2881 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2882 dynamic_debug_setup(info.debug, info.num_debug);
3188 2883
3189 /* Find duplicate symbols */ 2884 /* Find duplicate symbols */
3190 err = verify_export_symbols(mod); 2885 err = verify_export_symbols(mod);
3191 if (err < 0) 2886 if (err < 0)
3192 goto ddebug; 2887 goto ddebug;
3193 2888
3194 module_bug_finalize(info->hdr, info->sechdrs, mod); 2889 module_bug_finalize(info.hdr, info.sechdrs, mod);
3195 list_add_rcu(&mod->list, &modules); 2890 list_add_rcu(&mod->list, &modules);
3196 mutex_unlock(&module_mutex); 2891 mutex_unlock(&module_mutex);
3197 2892
3198 /* Module is ready to execute: parsing args may do that. */ 2893 /* Module is ready to execute: parsing args may do that. */
3199 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 2894 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
3200 -32768, 32767, &ddebug_dyndbg_module_param_cb);
3201 if (err < 0) 2895 if (err < 0)
3202 goto unlink; 2896 goto unlink;
3203 2897
 3204 /* Link in to sysfs. */ 2898 /* Link in to sysfs. */
3205 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); 2899 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
3206 if (err < 0) 2900 if (err < 0)
3207 goto unlink; 2901 goto unlink;
3208 2902
3209 /* Get rid of temporary copy. */ 2903 /* Get rid of temporary copy and strmap. */
3210 free_copy(info); 2904 kfree(info.strmap);
2905 free_copy(&info);
3211 2906
3212 /* Done! */ 2907 /* Done! */
3213 trace_module_load(mod); 2908 trace_module_load(mod);
3214 2909 return mod;
3215 return do_init_module(mod);
3216 2910
3217 unlink: 2911 unlink:
3218 mutex_lock(&module_mutex); 2912 mutex_lock(&module_mutex);
3219 /* Unlink carefully: kallsyms could be walking list. */ 2913 /* Unlink carefully: kallsyms could be walking list. */
3220 list_del_rcu(&mod->list); 2914 list_del_rcu(&mod->list);
3221 module_bug_cleanup(mod); 2915 module_bug_cleanup(mod);
3222 wake_up_all(&module_wq); 2916
3223 ddebug: 2917 ddebug:
3224 dynamic_debug_remove(info->debug); 2918 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2919 dynamic_debug_remove(info.debug);
3225 unlock: 2920 unlock:
3226 mutex_unlock(&module_mutex); 2921 mutex_unlock(&module_mutex);
3227 synchronize_sched(); 2922 synchronize_sched();
@@ -3233,52 +2928,106 @@ again:
3233 free_unload: 2928 free_unload:
3234 module_unload_free(mod); 2929 module_unload_free(mod);
3235 free_module: 2930 free_module:
3236 module_deallocate(mod, info); 2931 module_deallocate(mod, &info);
3237 free_copy: 2932 free_copy:
3238 free_copy(info); 2933 free_copy(&info);
3239 return err; 2934 return ERR_PTR(err);
3240} 2935}
3241 2936
2937/* Call module constructors. */
2938static void do_mod_ctors(struct module *mod)
2939{
2940#ifdef CONFIG_CONSTRUCTORS
2941 unsigned long i;
2942
2943 for (i = 0; i < mod->num_ctors; i++)
2944 mod->ctors[i]();
2945#endif
2946}
2947
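do_mod_ctors() above simply walks the module's constructor table and calls each entry. A standalone sketch of the same loop is shown below, with the constructor table built by hand instead of being taken from the module's constructor section.

/* Userspace sketch of do_mod_ctors(): call an array of constructor
 * function pointers in order.  The table here is hand-built. */
#include <stdio.h>

typedef void (*ctor_fn_t)(void);

static void ctor_a(void) { puts("ctor_a"); }
static void ctor_b(void) { puts("ctor_b"); }

static void run_ctors(ctor_fn_t *ctors, unsigned long num_ctors)
{
    unsigned long i;

    for (i = 0; i < num_ctors; i++)
        ctors[i]();
}

int main(void)
{
    ctor_fn_t ctors[] = { ctor_a, ctor_b };

    run_ctors(ctors, sizeof(ctors) / sizeof(ctors[0]));
    return 0;
}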
2948/* This is where the real work happens */
3242SYSCALL_DEFINE3(init_module, void __user *, umod, 2949SYSCALL_DEFINE3(init_module, void __user *, umod,
3243 unsigned long, len, const char __user *, uargs) 2950 unsigned long, len, const char __user *, uargs)
3244{ 2951{
3245 int err; 2952 struct module *mod;
3246 struct load_info info = { }; 2953 int ret = 0;
3247 2954
3248 err = may_init_module(); 2955 /* Must have permission */
3249 if (err) 2956 if (!capable(CAP_SYS_MODULE) || modules_disabled)
3250 return err; 2957 return -EPERM;
3251 2958
3252 pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n", 2959 /* Do all the hard work */
3253 umod, len, uargs); 2960 mod = load_module(umod, len, uargs);
2961 if (IS_ERR(mod))
2962 return PTR_ERR(mod);
3254 2963
3255 err = copy_module_from_user(umod, len, &info); 2964 blocking_notifier_call_chain(&module_notify_list,
3256 if (err) 2965 MODULE_STATE_COMING, mod);
3257 return err;
3258 2966
3259 return load_module(&info, uargs, 0); 2967 /* Set RO and NX regions for core */
3260} 2968 set_section_ro_nx(mod->module_core,
2969 mod->core_text_size,
2970 mod->core_ro_size,
2971 mod->core_size);
3261 2972
3262SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) 2973 /* Set RO and NX regions for init */
3263{ 2974 set_section_ro_nx(mod->module_init,
3264 int err; 2975 mod->init_text_size,
3265 struct load_info info = { }; 2976 mod->init_ro_size,
2977 mod->init_size);
3266 2978
3267 err = may_init_module(); 2979 do_mod_ctors(mod);
3268 if (err) 2980 /* Start the module */
3269 return err; 2981 if (mod->init != NULL)
2982 ret = do_one_initcall(mod->init);
2983 if (ret < 0) {
2984 /* Init routine failed: abort. Try to protect us from
2985 buggy refcounters. */
2986 mod->state = MODULE_STATE_GOING;
2987 synchronize_sched();
2988 module_put(mod);
2989 blocking_notifier_call_chain(&module_notify_list,
2990 MODULE_STATE_GOING, mod);
2991 free_module(mod);
2992 wake_up(&module_wq);
2993 return ret;
2994 }
2995 if (ret > 0) {
2996 printk(KERN_WARNING
2997"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2998"%s: loading module anyway...\n",
2999 __func__, mod->name, ret,
3000 __func__);
3001 dump_stack();
3002 }
3270 3003
3271 pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); 3004 /* Now it's a first class citizen! Wake up anyone waiting for it. */
3005 mod->state = MODULE_STATE_LIVE;
3006 wake_up(&module_wq);
3007 blocking_notifier_call_chain(&module_notify_list,
3008 MODULE_STATE_LIVE, mod);
3272 3009
3273 if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS 3010 /* We need to finish all async code before the module init sequence is done */
3274 |MODULE_INIT_IGNORE_VERMAGIC)) 3011 async_synchronize_full();
3275 return -EINVAL;
3276 3012
3277 err = copy_module_from_fd(fd, &info); 3013 mutex_lock(&module_mutex);
3278 if (err) 3014 /* Drop initial reference. */
3279 return err; 3015 module_put(mod);
3016 trim_init_extable(mod);
3017#ifdef CONFIG_KALLSYMS
3018 mod->num_symtab = mod->core_num_syms;
3019 mod->symtab = mod->core_symtab;
3020 mod->strtab = mod->core_strtab;
3021#endif
3022 unset_module_init_ro_nx(mod);
3023 module_free(mod, mod->module_init);
3024 mod->module_init = NULL;
3025 mod->init_size = 0;
3026 mod->init_ro_size = 0;
3027 mod->init_text_size = 0;
3028 mutex_unlock(&module_mutex);
3280 3029
3281 return load_module(&info, uargs, flags); 3030 return 0;
3282} 3031}
3283 3032
3284static inline int within(unsigned long addr, void *start, unsigned long size) 3033static inline int within(unsigned long addr, void *start, unsigned long size)
@@ -3506,7 +3255,18 @@ static char *module_flags(struct module *mod, char *buf)
3506 mod->state == MODULE_STATE_GOING || 3255 mod->state == MODULE_STATE_GOING ||
3507 mod->state == MODULE_STATE_COMING) { 3256 mod->state == MODULE_STATE_COMING) {
3508 buf[bx++] = '('; 3257 buf[bx++] = '(';
3509 bx += module_flags_taint(mod, buf + bx); 3258 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
3259 buf[bx++] = 'P';
3260 if (mod->taints & (1 << TAINT_FORCED_MODULE))
3261 buf[bx++] = 'F';
3262 if (mod->taints & (1 << TAINT_CRAP))
3263 buf[bx++] = 'C';
3264 /*
3265 * TAINT_FORCED_RMMOD: could be added.
3266 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
3267 * apply to modules.
3268 */
3269
3510 /* Show a - for module-is-being-unloaded */ 3270 /* Show a - for module-is-being-unloaded */
3511 if (mod->state == MODULE_STATE_GOING) 3271 if (mod->state == MODULE_STATE_GOING)
3512 buf[bx++] = '-'; 3272 buf[bx++] = '-';
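The hunk above trades the shared module_flags_taint() helper for open-coded taint characters ('P', 'F', 'C'). A userspace sketch of that formatting is below; the TAINT_* bit numbers are assumed to match this kernel generation and are hard-coded for the demo.

/* Sketch of the taint-flag formatting shown above, as plain C. */
#include <stdio.h>

#define TAINT_PROPRIETARY_MODULE 0
#define TAINT_FORCED_MODULE      1
#define TAINT_CRAP               10

static int format_module_flags(unsigned int taints, char *buf)
{
    int bx = 0;

    buf[bx++] = '(';
    if (taints & (1u << TAINT_PROPRIETARY_MODULE))
        buf[bx++] = 'P';
    if (taints & (1u << TAINT_FORCED_MODULE))
        buf[bx++] = 'F';
    if (taints & (1u << TAINT_CRAP))
        buf[bx++] = 'C';
    buf[bx++] = ')';
    buf[bx] = '\0';
    return bx;
}

int main(void)
{
    char buf[8];

    format_module_flags((1u << TAINT_PROPRIETARY_MODULE) |
                        (1u << TAINT_CRAP), buf);
    printf("%s\n", buf);    /* prints "(PC)" */
    return 0;
}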
@@ -3727,3 +3487,50 @@ void module_layout(struct module *mod,
3727} 3487}
3728EXPORT_SYMBOL(module_layout); 3488EXPORT_SYMBOL(module_layout);
3729#endif 3489#endif
3490
3491#ifdef CONFIG_TRACEPOINTS
3492void module_update_tracepoints(void)
3493{
3494 struct module *mod;
3495
3496 mutex_lock(&module_mutex);
3497 list_for_each_entry(mod, &modules, list)
3498 if (!mod->taints)
3499 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3500 mod->tracepoints_ptrs + mod->num_tracepoints);
3501 mutex_unlock(&module_mutex);
3502}
3503
3504/*
3505 * Returns 0 if current not found.
3506 * Returns 1 if current found.
3507 */
3508int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3509{
3510 struct module *iter_mod;
3511 int found = 0;
3512
3513 mutex_lock(&module_mutex);
3514 list_for_each_entry(iter_mod, &modules, list) {
3515 if (!iter_mod->taints) {
3516 /*
3517 * Sorted module list
3518 */
3519 if (iter_mod < iter->module)
3520 continue;
3521 else if (iter_mod > iter->module)
3522 iter->tracepoint = NULL;
3523 found = tracepoint_get_iter_range(&iter->tracepoint,
3524 iter_mod->tracepoints_ptrs,
3525 iter_mod->tracepoints_ptrs
3526 + iter_mod->num_tracepoints);
3527 if (found) {
3528 iter->module = iter_mod;
3529 break;
3530 }
3531 }
3532 }
3533 mutex_unlock(&module_mutex);
3534 return found;
3535}
3536#endif
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
deleted file mode 100644
index f2970bddc5e..00000000000
--- a/kernel/module_signing.c
+++ /dev/null
@@ -1,249 +0,0 @@
1/* Module signature checker
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/err.h>
14#include <crypto/public_key.h>
15#include <crypto/hash.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19/*
20 * Module signature information block.
21 *
22 * The constituents of the signature section are, in order:
23 *
24 * - Signer's name
25 * - Key identifier
26 * - Signature data
27 * - Information block
28 */
29struct module_signature {
30 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
31 u8 hash; /* Digest algorithm [enum pkey_hash_algo] */
32 u8 id_type; /* Key identifier type [enum pkey_id_type] */
33 u8 signer_len; /* Length of signer's name */
34 u8 key_id_len; /* Length of key identifier */
35 u8 __pad[3];
36 __be32 sig_len; /* Length of signature data */
37};
38
39/*
40 * Digest the module contents.
41 */
42static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
43 const void *mod,
44 unsigned long modlen)
45{
46 struct public_key_signature *pks;
47 struct crypto_shash *tfm;
48 struct shash_desc *desc;
49 size_t digest_size, desc_size;
50 int ret;
51
52 pr_devel("==>%s()\n", __func__);
53
54 /* Allocate the hashing algorithm we're going to need and find out how
55 * big the hash operational data will be.
56 */
57 tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0);
58 if (IS_ERR(tfm))
59 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
60
61 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
62 digest_size = crypto_shash_digestsize(tfm);
63
64 /* We allocate the hash operational data storage on the end of our
65 * context data and the digest output buffer on the end of that.
66 */
67 ret = -ENOMEM;
68 pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
69 if (!pks)
70 goto error_no_pks;
71
72 pks->pkey_hash_algo = hash;
73 pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
74 pks->digest_size = digest_size;
75
76 desc = (void *)pks + sizeof(*pks);
77 desc->tfm = tfm;
78 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
79
80 ret = crypto_shash_init(desc);
81 if (ret < 0)
82 goto error;
83
84 ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
85 if (ret < 0)
86 goto error;
87
88 crypto_free_shash(tfm);
89 pr_devel("<==%s() = ok\n", __func__);
90 return pks;
91
92error:
93 kfree(pks);
94error_no_pks:
95 crypto_free_shash(tfm);
96 pr_devel("<==%s() = %d\n", __func__, ret);
97 return ERR_PTR(ret);
98}
99
100/*
101 * Extract an MPI array from the signature data. This represents the actual
102 * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
103 * size of the MPI in bytes.
104 *
105 * RSA signatures only have one MPI, so currently we only read one.
106 */
107static int mod_extract_mpi_array(struct public_key_signature *pks,
108 const void *data, size_t len)
109{
110 size_t nbytes;
111 MPI mpi;
112
113 if (len < 3)
114 return -EBADMSG;
115 nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
116 data += 2;
117 len -= 2;
118 if (len != nbytes)
119 return -EBADMSG;
120
121 mpi = mpi_read_raw_data(data, nbytes);
122 if (!mpi)
123 return -ENOMEM;
124 pks->mpi[0] = mpi;
125 pks->nr_mpi = 1;
126 return 0;
127}
128
129/*
130 * Request an asymmetric key.
131 */
132static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
133 const u8 *key_id, size_t key_id_len)
134{
135 key_ref_t key;
136 size_t i;
137 char *id, *q;
138
139 pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
140
141 /* Construct an identifier. */
142 id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
143 if (!id)
144 return ERR_PTR(-ENOKEY);
145
146 memcpy(id, signer, signer_len);
147
148 q = id + signer_len;
149 *q++ = ':';
150 *q++ = ' ';
151 for (i = 0; i < key_id_len; i++) {
152 *q++ = hex_asc[*key_id >> 4];
153 *q++ = hex_asc[*key_id++ & 0x0f];
154 }
155
156 *q = 0;
157
158 pr_debug("Look up: \"%s\"\n", id);
159
160 key = keyring_search(make_key_ref(modsign_keyring, 1),
161 &key_type_asymmetric, id);
162 if (IS_ERR(key))
163 pr_warn("Request for unknown module key '%s' err %ld\n",
164 id, PTR_ERR(key));
165 kfree(id);
166
167 if (IS_ERR(key)) {
168 switch (PTR_ERR(key)) {
169 /* Hide some search errors */
170 case -EACCES:
171 case -ENOTDIR:
172 case -EAGAIN:
173 return ERR_PTR(-ENOKEY);
174 default:
175 return ERR_CAST(key);
176 }
177 }
178
179 pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
180 return key_ref_to_ptr(key);
181}
182
183/*
184 * Verify the signature on a module.
185 */
186int mod_verify_sig(const void *mod, unsigned long *_modlen)
187{
188 struct public_key_signature *pks;
189 struct module_signature ms;
190 struct key *key;
191 const void *sig;
192 size_t modlen = *_modlen, sig_len;
193 int ret;
194
195 pr_devel("==>%s(,%zu)\n", __func__, modlen);
196
197 if (modlen <= sizeof(ms))
198 return -EBADMSG;
199
200 memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
201 modlen -= sizeof(ms);
202
203 sig_len = be32_to_cpu(ms.sig_len);
204 if (sig_len >= modlen)
205 return -EBADMSG;
206 modlen -= sig_len;
207 if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
208 return -EBADMSG;
209 modlen -= (size_t)ms.signer_len + ms.key_id_len;
210
211 *_modlen = modlen;
212 sig = mod + modlen;
213
214 /* For the moment, only support RSA and X.509 identifiers */
215 if (ms.algo != PKEY_ALGO_RSA ||
216 ms.id_type != PKEY_ID_X509)
217 return -ENOPKG;
218
219 if (ms.hash >= PKEY_HASH__LAST ||
220 !pkey_hash_algo[ms.hash])
221 return -ENOPKG;
222
223 key = request_asymmetric_key(sig, ms.signer_len,
224 sig + ms.signer_len, ms.key_id_len);
225 if (IS_ERR(key))
226 return PTR_ERR(key);
227
228 pks = mod_make_digest(ms.hash, mod, modlen);
229 if (IS_ERR(pks)) {
230 ret = PTR_ERR(pks);
231 goto error_put_key;
232 }
233
234 ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
235 sig_len);
236 if (ret < 0)
237 goto error_free_pks;
238
239 ret = verify_signature(key, pks);
240 pr_devel("verify_signature() = %d\n", ret);
241
242error_free_pks:
243 mpi_free(pks->rsa.s);
244 kfree(pks);
245error_put_key:
246 key_put(key);
247 pr_devel("<==%s() = %d\n", __func__, ret);
248 return ret;
249}
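The arithmetic in mod_verify_sig() above strips the struct module_signature trailer off the end of a signed .ko and locates the signer name, key identifier and signature data immediately in front of it. Below is a userspace sketch of just that slicing, with the crypto verification omitted and the test buffer fabricated; the layout and field names follow the struct shown in the deleted file.

/* Userspace sketch of the signature-trailer parsing in mod_verify_sig(). */
#include <arpa/inet.h>  /* ntohl()/htonl() for the big-endian sig_len */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct module_signature {
    uint8_t  algo;        /* public-key algorithm */
    uint8_t  hash;        /* digest algorithm */
    uint8_t  id_type;     /* key identifier type */
    uint8_t  signer_len;  /* length of signer's name */
    uint8_t  key_id_len;  /* length of key identifier */
    uint8_t  pad[3];
    uint32_t sig_len;     /* big-endian length of signature data */
};

/* Shrink *modlen to the digested payload and return the offsets of the
 * signer name, key identifier and signature data. */
static int split_signed_module(const unsigned char *mod, size_t *modlen,
                               size_t *signer_off, size_t *key_id_off,
                               size_t *sig_off, size_t *sig_len)
{
    struct module_signature ms;
    size_t len = *modlen;

    if (len <= sizeof(ms))
        return -1;
    memcpy(&ms, mod + len - sizeof(ms), sizeof(ms));
    len -= sizeof(ms);

    *sig_len = ntohl(ms.sig_len);
    if (*sig_len >= len)
        return -1;
    len -= *sig_len;
    if ((size_t)ms.signer_len + ms.key_id_len >= len)
        return -1;
    len -= (size_t)ms.signer_len + ms.key_id_len;

    *modlen = len;                  /* data that was digested */
    *signer_off = len;
    *key_id_off = len + ms.signer_len;
    *sig_off = len + ms.signer_len + ms.key_id_len;
    return 0;
}

int main(void)
{
    /* Fabricated 4-byte "module" + trailer, just to exercise the parser. */
    unsigned char buf[4 + 6 + 2 + 8 + sizeof(struct module_signature)];
    struct module_signature ms = { .signer_len = 6, .key_id_len = 2,
                                   .sig_len = htonl(8) };
    size_t modlen = sizeof(buf), signer, key_id, sig, sig_len;

    memset(buf, 0, sizeof(buf));
    memcpy(buf + sizeof(buf) - sizeof(ms), &ms, sizeof(ms));
    if (split_signed_module(buf, &modlen, &signer, &key_id, &sig, &sig_len) == 0)
        printf("payload=%zu signer@%zu key_id@%zu sig@%zu (%zu bytes)\n",
               modlen, signer, key_id, sig, sig_len);
    return 0;
}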
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 7e3443fe1f4..73da83aff41 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -14,7 +14,7 @@
14 */ 14 */
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/export.h> 17#include <linux/module.h>
18#include <linux/poison.h> 18#include <linux/poison.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a307cc9c952..d607ed5dd44 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,7 +19,7 @@
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/export.h> 22#include <linux/module.h>
23#include <linux/spinlock.h> 23#include <linux/spinlock.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/debug_locks.h> 25#include <linux/debug_locks.h>
@@ -240,7 +240,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
240 240
241 /* didn't get the lock, go to sleep: */ 241 /* didn't get the lock, go to sleep: */
242 spin_unlock_mutex(&lock->wait_lock, flags); 242 spin_unlock_mutex(&lock->wait_lock, flags);
243 schedule_preempt_disabled(); 243 preempt_enable_no_resched();
244 schedule();
245 preempt_disable();
244 spin_lock_mutex(&lock->wait_lock, flags); 246 spin_lock_mutex(&lock->wait_lock, flags);
245 } 247 }
246 248
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2d5cc4ccff7..8d7b435806c 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,6 +1,6 @@
1#include <linux/kdebug.h> 1#include <linux/kdebug.h>
2#include <linux/kprobes.h> 2#include <linux/kprobes.h>
3#include <linux/export.h> 3#include <linux/module.h>
4#include <linux/notifier.h> 4#include <linux/notifier.h>
5#include <linux/rcupdate.h> 5#include <linux/rcupdate.h>
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 78e2ecb2016..9aeab4b98c6 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,7 @@
14 */ 14 */
15 15
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/export.h> 17#include <linux/module.h>
18#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
19#include <linux/init_task.h> 19#include <linux/init_task.h>
20#include <linux/mnt_namespace.h> 20#include <linux/mnt_namespace.h>
@@ -57,8 +57,7 @@ static inline struct nsproxy *create_nsproxy(void)
57 * leave it to the caller to do proper locking and attach it to task. 57 * leave it to the caller to do proper locking and attach it to task.
58 */ 58 */
59static struct nsproxy *create_new_namespaces(unsigned long flags, 59static struct nsproxy *create_new_namespaces(unsigned long flags,
60 struct task_struct *tsk, struct user_namespace *user_ns, 60 struct task_struct *tsk, struct fs_struct *new_fs)
61 struct fs_struct *new_fs)
62{ 61{
63 struct nsproxy *new_nsp; 62 struct nsproxy *new_nsp;
64 int err; 63 int err;
@@ -67,31 +66,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
67 if (!new_nsp) 66 if (!new_nsp)
68 return ERR_PTR(-ENOMEM); 67 return ERR_PTR(-ENOMEM);
69 68
70 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); 69 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
71 if (IS_ERR(new_nsp->mnt_ns)) { 70 if (IS_ERR(new_nsp->mnt_ns)) {
72 err = PTR_ERR(new_nsp->mnt_ns); 71 err = PTR_ERR(new_nsp->mnt_ns);
73 goto out_ns; 72 goto out_ns;
74 } 73 }
75 74
76 new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); 75 new_nsp->uts_ns = copy_utsname(flags, tsk);
77 if (IS_ERR(new_nsp->uts_ns)) { 76 if (IS_ERR(new_nsp->uts_ns)) {
78 err = PTR_ERR(new_nsp->uts_ns); 77 err = PTR_ERR(new_nsp->uts_ns);
79 goto out_uts; 78 goto out_uts;
80 } 79 }
81 80
82 new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); 81 new_nsp->ipc_ns = copy_ipcs(flags, tsk);
83 if (IS_ERR(new_nsp->ipc_ns)) { 82 if (IS_ERR(new_nsp->ipc_ns)) {
84 err = PTR_ERR(new_nsp->ipc_ns); 83 err = PTR_ERR(new_nsp->ipc_ns);
85 goto out_ipc; 84 goto out_ipc;
86 } 85 }
87 86
88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); 87 new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
89 if (IS_ERR(new_nsp->pid_ns)) { 88 if (IS_ERR(new_nsp->pid_ns)) {
90 err = PTR_ERR(new_nsp->pid_ns); 89 err = PTR_ERR(new_nsp->pid_ns);
91 goto out_pid; 90 goto out_pid;
92 } 91 }
93 92
94 new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); 93 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
95 if (IS_ERR(new_nsp->net_ns)) { 94 if (IS_ERR(new_nsp->net_ns)) {
96 err = PTR_ERR(new_nsp->net_ns); 95 err = PTR_ERR(new_nsp->net_ns);
97 goto out_net; 96 goto out_net;
@@ -123,7 +122,6 @@ out_ns:
123int copy_namespaces(unsigned long flags, struct task_struct *tsk) 122int copy_namespaces(unsigned long flags, struct task_struct *tsk)
124{ 123{
125 struct nsproxy *old_ns = tsk->nsproxy; 124 struct nsproxy *old_ns = tsk->nsproxy;
126 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
127 struct nsproxy *new_ns; 125 struct nsproxy *new_ns;
128 int err = 0; 126 int err = 0;
129 127
@@ -136,7 +134,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
136 CLONE_NEWPID | CLONE_NEWNET))) 134 CLONE_NEWPID | CLONE_NEWNET)))
137 return 0; 135 return 0;
138 136
139 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { 137 if (!capable(CAP_SYS_ADMIN)) {
140 err = -EPERM; 138 err = -EPERM;
141 goto out; 139 goto out;
142 } 140 }
@@ -153,8 +151,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
153 goto out; 151 goto out;
154 } 152 }
155 153
156 new_ns = create_new_namespaces(flags, tsk, 154 new_ns = create_new_namespaces(flags, tsk, tsk->fs);
157 task_cred_xxx(tsk, user_ns), tsk->fs);
158 if (IS_ERR(new_ns)) { 155 if (IS_ERR(new_ns)) {
159 err = PTR_ERR(new_ns); 156 err = PTR_ERR(new_ns);
160 goto out; 157 goto out;
@@ -186,21 +183,19 @@ void free_nsproxy(struct nsproxy *ns)
186 * On success, returns the new nsproxy. 183 * On success, returns the new nsproxy.
187 */ 184 */
188int unshare_nsproxy_namespaces(unsigned long unshare_flags, 185int unshare_nsproxy_namespaces(unsigned long unshare_flags,
189 struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) 186 struct nsproxy **new_nsp, struct fs_struct *new_fs)
190{ 187{
191 struct user_namespace *user_ns;
192 int err = 0; 188 int err = 0;
193 189
194 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 190 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
195 CLONE_NEWNET | CLONE_NEWPID))) 191 CLONE_NEWNET)))
196 return 0; 192 return 0;
197 193
198 user_ns = new_cred ? new_cred->user_ns : current_user_ns(); 194 if (!capable(CAP_SYS_ADMIN))
199 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
200 return -EPERM; 195 return -EPERM;
201 196
202 *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, 197 *new_nsp = create_new_namespaces(unshare_flags, current,
203 new_fs ? new_fs : current->fs); 198 new_fs ? new_fs : current->fs);
204 if (IS_ERR(*new_nsp)) { 199 if (IS_ERR(*new_nsp)) {
205 err = PTR_ERR(*new_nsp); 200 err = PTR_ERR(*new_nsp);
206 goto out; 201 goto out;
@@ -246,6 +241,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
246 struct file *file; 241 struct file *file;
247 int err; 242 int err;
248 243
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
249 file = proc_ns_fget(fd); 247 file = proc_ns_fget(fd);
250 if (IS_ERR(file)) 248 if (IS_ERR(file))
251 return PTR_ERR(file); 249 return PTR_ERR(file);
@@ -256,7 +254,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
256 if (nstype && (ops->type != nstype)) 254 if (nstype && (ops->type != nstype))
257 goto out; 255 goto out;
258 256
259 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); 257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
260 if (IS_ERR(new_nsproxy)) { 258 if (IS_ERR(new_nsproxy)) {
261 err = PTR_ERR(new_nsproxy); 259 err = PTR_ERR(new_nsproxy);
262 goto out; 260 goto out;
diff --git a/kernel/padata.c b/kernel/padata.c
index 072f4ee4eb8..b91941df5e6 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1,8 +1,6 @@
1/* 1/*
2 * padata.c - generic interface to process data streams in parallel 2 * padata.c - generic interface to process data streams in parallel
3 * 3 *
4 * See Documentation/padata.txt for an api documentation.
5 *
6 * Copyright (C) 2008, 2009 secunet Security Networks AG 4 * Copyright (C) 2008, 2009 secunet Security Networks AG
7 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> 5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
8 * 6 *
@@ -20,7 +18,7 @@
20 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
21 */ 19 */
22 20
23#include <linux/export.h> 21#include <linux/module.h>
24#include <linux/cpumask.h> 22#include <linux/cpumask.h>
25#include <linux/err.h> 23#include <linux/err.h>
26#include <linux/cpu.h> 24#include <linux/cpu.h>
@@ -31,6 +29,7 @@
31#include <linux/sysfs.h> 29#include <linux/sysfs.h>
32#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
33 31
32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
34#define MAX_OBJ_NUM 1000 33#define MAX_OBJ_NUM 1000
35 34
36static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
@@ -44,19 +43,18 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
44 return target_cpu; 43 return target_cpu;
45} 44}
46 45
47static int padata_cpu_hash(struct parallel_data *pd) 46static int padata_cpu_hash(struct padata_priv *padata)
48{ 47{
49 int cpu_index; 48 int cpu_index;
49 struct parallel_data *pd;
50
51 pd = padata->pd;
50 52
51 /* 53 /*
52 * Hash the sequence numbers to the cpus by taking 54 * Hash the sequence numbers to the cpus by taking
53 * seq_nr mod. number of cpus in use. 55 * seq_nr mod. number of cpus in use.
54 */ 56 */
55 57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
56 spin_lock(&pd->seq_lock);
57 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
58 pd->seq_nr++;
59 spin_unlock(&pd->seq_lock);
60 58
61 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
62} 60}
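padata_cpu_hash() above spreads objects over the parallel-worker CPUs by taking the sequence number modulo the number of CPUs in the pcpu mask and mapping that index back to a real CPU id. A standalone sketch follows, with the cpumask replaced by a plain array and the CPU ids made up for the demo.

/* Userspace sketch of padata's seq_nr -> CPU hashing. */
#include <stdio.h>

static int index_to_cpu(const int *pcpu_mask, int cpu_index)
{
    return pcpu_mask[cpu_index];    /* nth set bit in the real cpumask */
}

static int cpu_hash(unsigned int seq_nr, const int *pcpu_mask, int num_cpus)
{
    return index_to_cpu(pcpu_mask, seq_nr % num_cpus);
}

int main(void)
{
    const int pcpu_mask[] = { 1, 2, 5, 7 };  /* hypothetical worker CPUs */
    unsigned int seq;

    for (seq = 0; seq < 8; seq++)
        printf("seq %u -> cpu %d\n", seq, cpu_hash(seq, pcpu_mask, 4));
    return 0;
}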
@@ -134,7 +132,12 @@ int padata_do_parallel(struct padata_instance *pinst,
134 padata->pd = pd; 132 padata->pd = pd;
135 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
136 134
137 target_cpu = padata_cpu_hash(pd); 135 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
136 atomic_set(&pd->seq_nr, -1);
137
138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
139
140 target_cpu = padata_cpu_hash(padata);
138 queue = per_cpu_ptr(pd->pqueue, target_cpu); 141 queue = per_cpu_ptr(pd->pqueue, target_cpu);
139 142
140 spin_lock(&queue->parallel.lock); 143 spin_lock(&queue->parallel.lock);
@@ -170,8 +173,8 @@ EXPORT_SYMBOL(padata_do_parallel);
170static struct padata_priv *padata_get_next(struct parallel_data *pd) 173static struct padata_priv *padata_get_next(struct parallel_data *pd)
171{ 174{
172 int cpu, num_cpus; 175 int cpu, num_cpus;
173 unsigned int next_nr, next_index; 176 int next_nr, next_index;
174 struct padata_parallel_queue *next_queue; 177 struct padata_parallel_queue *queue, *next_queue;
175 struct padata_priv *padata; 178 struct padata_priv *padata;
176 struct padata_list *reorder; 179 struct padata_list *reorder;
177 180
@@ -186,6 +189,14 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
186 cpu = padata_index_to_cpu(pd, next_index); 189 cpu = padata_index_to_cpu(pd, next_index);
187 next_queue = per_cpu_ptr(pd->pqueue, cpu); 190 next_queue = per_cpu_ptr(pd->pqueue, cpu);
188 191
192 if (unlikely(next_nr > pd->max_seq_nr)) {
193 next_nr = next_nr - pd->max_seq_nr - 1;
194 next_index = next_nr % num_cpus;
195 cpu = padata_index_to_cpu(pd, next_index);
196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
197 pd->processed = 0;
198 }
199
189 padata = NULL; 200 padata = NULL;
190 201
191 reorder = &next_queue->reorder; 202 reorder = &next_queue->reorder;
@@ -194,6 +205,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
194 padata = list_entry(reorder->list.next, 205 padata = list_entry(reorder->list.next,
195 struct padata_priv, list); 206 struct padata_priv, list);
196 207
208 BUG_ON(next_nr != padata->seq_nr);
209
197 spin_lock(&reorder->lock); 210 spin_lock(&reorder->lock);
198 list_del_init(&padata->list); 211 list_del_init(&padata->list);
199 atomic_dec(&pd->reorder_objects); 212 atomic_dec(&pd->reorder_objects);
@@ -204,7 +217,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
204 goto out; 217 goto out;
205 } 218 }
206 219
207 if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { 220 queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
221 if (queue->cpu_index == next_queue->cpu_index) {
208 padata = ERR_PTR(-ENODATA); 222 padata = ERR_PTR(-ENODATA);
209 goto out; 223 goto out;
210 } 224 }
@@ -216,7 +230,6 @@ out:
216 230
217static void padata_reorder(struct parallel_data *pd) 231static void padata_reorder(struct parallel_data *pd)
218{ 232{
219 int cb_cpu;
220 struct padata_priv *padata; 233 struct padata_priv *padata;
221 struct padata_serial_queue *squeue; 234 struct padata_serial_queue *squeue;
222 struct padata_instance *pinst = pd->pinst; 235 struct padata_instance *pinst = pd->pinst;
@@ -257,14 +270,13 @@ static void padata_reorder(struct parallel_data *pd)
257 return; 270 return;
258 } 271 }
259 272
260 cb_cpu = padata->cb_cpu; 273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
261 squeue = per_cpu_ptr(pd->squeue, cb_cpu);
262 274
263 spin_lock(&squeue->serial.lock); 275 spin_lock(&squeue->serial.lock);
264 list_add_tail(&padata->list, &squeue->serial.list); 276 list_add_tail(&padata->list, &squeue->serial.list);
265 spin_unlock(&squeue->serial.lock); 277 spin_unlock(&squeue->serial.lock);
266 278
267 queue_work_on(cb_cpu, pinst->wq, &squeue->work); 279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
268 } 280 }
269 281
270 spin_unlock_bh(&pd->lock); 282 spin_unlock_bh(&pd->lock);
@@ -355,13 +367,13 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
355 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) 367 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
356 return -ENOMEM; 368 return -ENOMEM;
357 369
358 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); 370 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
359 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { 371 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
360 free_cpumask_var(pd->cpumask.cbcpu); 372 free_cpumask_var(pd->cpumask.cbcpu);
361 return -ENOMEM; 373 return -ENOMEM;
362 } 374 }
363 375
364 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask); 376 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
365 return 0; 377 return 0;
366} 378}
367 379
@@ -388,7 +400,7 @@ static void padata_init_squeues(struct parallel_data *pd)
388/* Initialize all percpu queues used by parallel workers */ 400/* Initialize all percpu queues used by parallel workers */
389static void padata_init_pqueues(struct parallel_data *pd) 401static void padata_init_pqueues(struct parallel_data *pd)
390{ 402{
391 int cpu_index, cpu; 403 int cpu_index, num_cpus, cpu;
392 struct padata_parallel_queue *pqueue; 404 struct padata_parallel_queue *pqueue;
393 405
394 cpu_index = 0; 406 cpu_index = 0;
@@ -403,6 +415,9 @@ static void padata_init_pqueues(struct parallel_data *pd)
403 INIT_WORK(&pqueue->work, padata_parallel_worker); 415 INIT_WORK(&pqueue->work, padata_parallel_worker);
404 atomic_set(&pqueue->num_obj, 0); 416 atomic_set(&pqueue->num_obj, 0);
405 } 417 }
418
419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
406} 421}
407 422
 408/* Allocate and initialize the internal cpumask dependent resources. */ 423
@@ -429,7 +444,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
429 padata_init_pqueues(pd); 444 padata_init_pqueues(pd);
430 padata_init_squeues(pd); 445 padata_init_squeues(pd);
431 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
432 pd->seq_nr = 0; 447 atomic_set(&pd->seq_nr, -1);
433 atomic_set(&pd->reorder_objects, 0); 448 atomic_set(&pd->reorder_objects, 0);
434 atomic_set(&pd->refcnt, 0); 449 atomic_set(&pd->refcnt, 0);
435 pd->pinst = pinst; 450 pd->pinst = pinst;
@@ -565,7 +580,7 @@ EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
565static bool padata_validate_cpumask(struct padata_instance *pinst, 580static bool padata_validate_cpumask(struct padata_instance *pinst,
566 const struct cpumask *cpumask) 581 const struct cpumask *cpumask)
567{ 582{
568 if (!cpumask_intersects(cpumask, cpu_online_mask)) { 583 if (!cpumask_intersects(cpumask, cpu_active_mask)) {
569 pinst->flags |= PADATA_INVALID; 584 pinst->flags |= PADATA_INVALID;
570 return false; 585 return false;
571 } 586 }
@@ -679,7 +694,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
679{ 694{
680 struct parallel_data *pd; 695 struct parallel_data *pd;
681 696
682 if (cpumask_test_cpu(cpu, cpu_online_mask)) { 697 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
683 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, 698 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
684 pinst->cpumask.cbcpu); 699 pinst->cpumask.cbcpu);
685 if (!pd) 700 if (!pd)
@@ -747,9 +762,6 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
747 return -ENOMEM; 762 return -ENOMEM;
748 763
749 padata_replace(pinst, pd); 764 padata_replace(pinst, pd);
750
751 cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
752 cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
753 } 765 }
754 766
755 return 0; 767 return 0;
diff --git a/kernel/panic.c b/kernel/panic.c
index e1b2822fff9..41fc78ea3db 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,13 +27,19 @@
27#define PANIC_TIMER_STEP 100 27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 28#define PANIC_BLINK_SPD 18
29 29
30int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; 30/* Machine specific panic information string */
31char *mach_panic_string;
32
33int panic_on_oops;
31static unsigned long tainted_mask; 34static unsigned long tainted_mask;
32static int pause_on_oops; 35static int pause_on_oops;
33static int pause_on_oops_flag; 36static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 37static DEFINE_SPINLOCK(pause_on_oops_lock);
35 38
36int panic_timeout; 39#ifndef CONFIG_PANIC_TIMEOUT
40#define CONFIG_PANIC_TIMEOUT 0
41#endif
42int panic_timeout = CONFIG_PANIC_TIMEOUT;
37EXPORT_SYMBOL_GPL(panic_timeout); 43EXPORT_SYMBOL_GPL(panic_timeout);
38 44
39ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 45ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -49,15 +55,6 @@ static long no_blink(int state)
49long (*panic_blink)(int state); 55long (*panic_blink)(int state);
50EXPORT_SYMBOL(panic_blink); 56EXPORT_SYMBOL(panic_blink);
51 57
52/*
53 * Stop ourself in panic -- architecture code may override this
54 */
55void __weak panic_smp_self_stop(void)
56{
57 while (1)
58 cpu_relax();
59}
60
61/** 58/**
62 * panic - halt the system 59 * panic - halt the system
63 * @fmt: The text string to print 60 * @fmt: The text string to print
@@ -66,34 +63,19 @@ void __weak panic_smp_self_stop(void)
66 * 63 *
67 * This function never returns. 64 * This function never returns.
68 */ 65 */
69void panic(const char *fmt, ...) 66NORET_TYPE void panic(const char * fmt, ...)
70{ 67{
71 static DEFINE_SPINLOCK(panic_lock);
72 static char buf[1024]; 68 static char buf[1024];
73 va_list args; 69 va_list args;
74 long i, i_next = 0; 70 long i, i_next = 0;
75 int state = 0; 71 int state = 0;
76 72
77 /* 73 /*
78 * Disable local interrupts. This will prevent panic_smp_self_stop
79 * from deadlocking the first cpu that invokes the panic, since
80 * there is nothing to prevent an interrupt handler (that runs
81 * after the panic_lock is acquired) from invoking panic again.
82 */
83 local_irq_disable();
84
85 /*
86 * It's possible to come here directly from a panic-assertion and 74 * It's possible to come here directly from a panic-assertion and
87 * not have preempt disabled. Some functions called from here want 75 * not have preempt disabled. Some functions called from here want
88 * preempt to be disabled. No point enabling it later though... 76 * preempt to be disabled. No point enabling it later though...
89 *
90 * Only one CPU is allowed to execute the panic code from here. For
91 * multiple parallel invocations of panic, all other CPUs either
92 * stop themself or will wait until they are stopped by the 1st CPU
93 * with smp_send_stop().
94 */ 77 */
95 if (!spin_trylock(&panic_lock)) 78 preempt_disable();
96 panic_smp_self_stop();
97 79
98 console_verbose(); 80 console_verbose();
99 bust_spinlocks(1); 81 bust_spinlocks(1);
@@ -102,11 +84,7 @@ void panic(const char *fmt, ...)
102 va_end(args); 84 va_end(args);
103 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 85 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
104#ifdef CONFIG_DEBUG_BUGVERBOSE 86#ifdef CONFIG_DEBUG_BUGVERBOSE
105 /* 87 dump_stack();
106 * Avoid nested stack-dumping if a panic occurs during oops processing
107 */
108 if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
109 dump_stack();
110#endif 88#endif
111 89
112 /* 90 /*
@@ -116,6 +94,8 @@ void panic(const char *fmt, ...)
116 */ 94 */
117 crash_kexec(NULL); 95 crash_kexec(NULL);
118 96
97 kmsg_dump(KMSG_DUMP_PANIC);
98
119 /* 99 /*
120 * Note smp_send_stop is the usual smp shutdown function, which 100 * Note smp_send_stop is the usual smp shutdown function, which
121 * unfortunately means it may not be hardened to work in a panic 101 * unfortunately means it may not be hardened to work in a panic
@@ -123,8 +103,6 @@ void panic(const char *fmt, ...)
123 */ 103 */
124 smp_send_stop(); 104 smp_send_stop();
125 105
126 kmsg_dump(KMSG_DUMP_PANIC);
127
128 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 106 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
129 107
130 bust_spinlocks(0); 108 bust_spinlocks(0);
@@ -205,7 +183,6 @@ static const struct tnt tnts[] = {
205 { TAINT_WARN, 'W', ' ' }, 183 { TAINT_WARN, 'W', ' ' },
206 { TAINT_CRAP, 'C', ' ' }, 184 { TAINT_CRAP, 'C', ' ' },
207 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 185 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
208 { TAINT_OOT_MODULE, 'O', ' ' },
209}; 186};
210 187
211/** 188/**
@@ -223,7 +200,6 @@ static const struct tnt tnts[] = {
223 * 'W' - Taint on warning. 200 * 'W' - Taint on warning.
224 * 'C' - modules from drivers/staging are loaded. 201 * 'C' - modules from drivers/staging are loaded.
225 * 'I' - Working around severe firmware bug. 202 * 'I' - Working around severe firmware bug.
226 * 'O' - Out-of-tree module has been loaded.
227 * 203 *
228 * The string is overwritten by the next call to print_tainted(). 204 * The string is overwritten by the next call to print_tainted().
229 */ 205 */
@@ -265,20 +241,11 @@ void add_taint(unsigned flag)
265 * Can't trust the integrity of the kernel anymore. 241 * Can't trust the integrity of the kernel anymore.
266 * We don't call directly debug_locks_off() because the issue 242 * We don't call directly debug_locks_off() because the issue
267 * is not necessarily serious enough to set oops_in_progress to 1 243 * is not necessarily serious enough to set oops_in_progress to 1
268 * Also we want to keep up lockdep for staging/out-of-tree 244 * Also we want to keep up lockdep for staging development and
269 * development and post-warning case. 245 * post-warning case.
270 */ 246 */
271 switch (flag) { 247 if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
272 case TAINT_CRAP: 248 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
273 case TAINT_OOT_MODULE:
274 case TAINT_WARN:
275 case TAINT_FIRMWARE_WORKAROUND:
276 break;
277
278 default:
279 if (__debug_locks_off())
280 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
281 }
282 249
283 set_bit(flag, &tainted_mask); 250 set_bit(flag, &tainted_mask);
284} 251}
@@ -383,6 +350,11 @@ late_initcall(init_oops_id);
383void print_oops_end_marker(void) 350void print_oops_end_marker(void)
384{ 351{
385 init_oops_id(); 352 init_oops_id();
353
354 if (mach_panic_string)
355 printk(KERN_WARNING "Board Information: %s\n",
356 mach_panic_string);
357
386 printk(KERN_WARNING "---[ end trace %016llx ]---\n", 358 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
387 (unsigned long long)oops_id); 359 (unsigned long long)oops_id);
388} 360}
diff --git a/kernel/params.c b/kernel/params.c
index ed35345be53..22df3e0d142 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,6 +15,7 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/moduleparam.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/string.h> 20#include <linux/string.h>
20#include <linux/errno.h> 21#include <linux/errno.h>
@@ -24,6 +25,12 @@
24#include <linux/slab.h> 25#include <linux/slab.h>
25#include <linux/ctype.h> 26#include <linux/ctype.h>
26 27
28#if 0
29#define DEBUGP printk
30#else
31#define DEBUGP(fmt, a...)
32#endif
33
27/* Protects all parameters, and incidentally kmalloced_param list. */ 34/* Protects all parameters, and incidentally kmalloced_param list. */
28static DEFINE_MUTEX(param_lock); 35static DEFINE_MUTEX(param_lock);
29 36
@@ -60,38 +67,27 @@ static void maybe_kfree_parameter(void *param)
60 } 67 }
61} 68}
62 69
63static char dash2underscore(char c) 70static inline char dash2underscore(char c)
64{ 71{
65 if (c == '-') 72 if (c == '-')
66 return '_'; 73 return '_';
67 return c; 74 return c;
68} 75}
69 76
70bool parameqn(const char *a, const char *b, size_t n) 77static inline int parameq(const char *input, const char *paramname)
71{ 78{
72 size_t i; 79 unsigned int i;
73 80 for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
74 for (i = 0; i < n; i++) { 81 if (input[i] == '\0')
75 if (dash2underscore(a[i]) != dash2underscore(b[i])) 82 return 1;
76 return false; 83 return 0;
77 }
78 return true;
79}
80
81bool parameq(const char *a, const char *b)
82{
83 return parameqn(a, b, strlen(a)+1);
84} 84}
85 85
86static int parse_one(char *param, 86static int parse_one(char *param,
87 char *val, 87 char *val,
88 const char *doing,
89 const struct kernel_param *params, 88 const struct kernel_param *params,
90 unsigned num_params, 89 unsigned num_params,
91 s16 min_level, 90 int (*handle_unknown)(char *param, char *val))
92 s16 max_level,
93 int (*handle_unknown)(char *param, char *val,
94 const char *doing))
95{ 91{
96 unsigned int i; 92 unsigned int i;
97 int err; 93 int err;
@@ -99,15 +95,11 @@ static int parse_one(char *param,
99 /* Find parameter */ 95 /* Find parameter */
100 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
101 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
102 if (params[i].level < min_level
103 || params[i].level > max_level)
104 return 0;
105 /* No one handled NULL, so do it here. */ 98 /* No one handled NULL, so do it here. */
106 if (!val && params[i].ops->set != param_set_bool 99 if (!val && params[i].ops->set != param_set_bool)
107 && params[i].ops->set != param_set_bint)
108 return -EINVAL; 100 return -EINVAL;
109 pr_debug("handling %s with %p\n", param, 101 DEBUGP("They are equal! Calling %p\n",
110 params[i].ops->set); 102 params[i].ops->set);
111 mutex_lock(&param_lock); 103 mutex_lock(&param_lock);
112 err = params[i].ops->set(val, &params[i]); 104 err = params[i].ops->set(val, &params[i]);
113 mutex_unlock(&param_lock); 105 mutex_unlock(&param_lock);
@@ -116,11 +108,11 @@ static int parse_one(char *param,
116 } 108 }
117 109
118 if (handle_unknown) { 110 if (handle_unknown) {
119 pr_debug("doing %s: %s='%s'\n", doing, param, val); 111 DEBUGP("Unknown argument: calling %p\n", handle_unknown);
120 return handle_unknown(param, val, doing); 112 return handle_unknown(param, val);
121 } 113 }
122 114
123 pr_debug("Unknown argument '%s'\n", param); 115 DEBUGP("Unknown argument `%s'\n", param);
124 return -ENOENT; 116 return -ENOENT;
125} 117}
126 118
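
The parameq()/dash2underscore() pair restored earlier in this file's diff decides whether a user-supplied option name matches a registered parameter, folding '-' in the input to '_' so "my-param=1" matches a parameter declared as my_param. A self-contained userspace sketch of that comparison (re-implementation for illustration only):

#include <stdio.h>

static char dash2underscore(char c)
{
	return c == '-' ? '_' : c;
}

static int parameq(const char *input, const char *paramname)
{
	unsigned int i;

	for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
		if (input[i] == '\0')
			return 1;
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       parameq("my-param", "my_param"),   /* 1: dash folded to underscore */
	       parameq("my_param", "my_param"),   /* 1: exact match               */
	       parameq("my_param", "my-param"));  /* 0: only the input is folded  */
	return 0;
}
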
@@ -177,47 +169,46 @@ static char *next_arg(char *args, char **param, char **val)
177} 169}
178 170
179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 171/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
180int parse_args(const char *doing, 172int parse_args(const char *name,
181 char *args, 173 char *args,
182 const struct kernel_param *params, 174 const struct kernel_param *params,
183 unsigned num, 175 unsigned num,
184 s16 min_level, 176 int (*unknown)(char *param, char *val))
185 s16 max_level,
186 int (*unknown)(char *param, char *val, const char *doing))
187{ 177{
188 char *param, *val; 178 char *param, *val;
189 179
180 DEBUGP("Parsing ARGS: %s\n", args);
181
190 /* Chew leading spaces */ 182 /* Chew leading spaces */
191 args = skip_spaces(args); 183 args = skip_spaces(args);
192 184
193 if (*args)
194 pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args);
195
196 while (*args) { 185 while (*args) {
197 int ret; 186 int ret;
198 int irq_was_disabled; 187 int irq_was_disabled;
199 188
200 args = next_arg(args, &param, &val); 189 args = next_arg(args, &param, &val);
201 irq_was_disabled = irqs_disabled(); 190 irq_was_disabled = irqs_disabled();
202 ret = parse_one(param, val, doing, params, num, 191 ret = parse_one(param, val, params, num, unknown);
203 min_level, max_level, unknown); 192 if (irq_was_disabled && !irqs_disabled()) {
204 if (irq_was_disabled && !irqs_disabled()) 193 printk(KERN_WARNING "parse_args(): option '%s' enabled "
205 pr_warn("%s: option '%s' enabled irq's!\n", 194 "irq's!\n", param);
206 doing, param); 195 }
207
208 switch (ret) { 196 switch (ret) {
209 case -ENOENT: 197 case -ENOENT:
210 pr_err("%s: Unknown parameter `%s'\n", doing, param); 198 printk(KERN_ERR "%s: Unknown parameter `%s'\n",
199 name, param);
211 return ret; 200 return ret;
212 case -ENOSPC: 201 case -ENOSPC:
213 pr_err("%s: `%s' too large for parameter `%s'\n", 202 printk(KERN_ERR
214 doing, val ?: "", param); 203 "%s: `%s' too large for parameter `%s'\n",
204 name, val ?: "", param);
215 return ret; 205 return ret;
216 case 0: 206 case 0:
217 break; 207 break;
218 default: 208 default:
219 pr_err("%s: `%s' invalid for parameter `%s'\n", 209 printk(KERN_ERR
220 doing, val ?: "", param); 210 "%s: `%s' invalid for parameter `%s'\n",
211 name, val ?: "", param);
221 return ret; 212 return ret;
222 } 213 }
223 } 214 }
@@ -263,7 +254,8 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
263int param_set_charp(const char *val, const struct kernel_param *kp) 254int param_set_charp(const char *val, const struct kernel_param *kp)
264{ 255{
265 if (strlen(val) > 1024) { 256 if (strlen(val) > 1024) {
266 pr_err("%s: string parameter too long\n", kp->name); 257 printk(KERN_ERR "%s: string parameter too long\n",
258 kp->name);
267 return -ENOSPC; 259 return -ENOSPC;
268 } 260 }
269 261
@@ -304,18 +296,35 @@ EXPORT_SYMBOL(param_ops_charp);
304/* Actually could be a bool or an int, for historical reasons. */ 296/* Actually could be a bool or an int, for historical reasons. */
305int param_set_bool(const char *val, const struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
306{ 298{
299 bool v;
300 int ret;
301
307 /* No equals means "set"... */ 302 /* No equals means "set"... */
308 if (!val) val = "1"; 303 if (!val) val = "1";
309 304
310 /* One of =[yYnN01] */ 305 /* One of =[yYnN01] */
311 return strtobool(val, kp->arg); 306 ret = strtobool(val, &v);
307 if (ret)
308 return ret;
309
310 if (kp->flags & KPARAM_ISBOOL)
311 *(bool *)kp->arg = v;
312 else
313 *(int *)kp->arg = v;
314 return 0;
312} 315}
313EXPORT_SYMBOL(param_set_bool); 316EXPORT_SYMBOL(param_set_bool);
314 317
315int param_get_bool(char *buffer, const struct kernel_param *kp) 318int param_get_bool(char *buffer, const struct kernel_param *kp)
316{ 319{
320 bool val;
321 if (kp->flags & KPARAM_ISBOOL)
322 val = *(bool *)kp->arg;
323 else
324 val = *(int *)kp->arg;
325
317 /* Y and N chosen as being relatively non-coder friendly */ 326 /* Y and N chosen as being relatively non-coder friendly */
318 return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); 327 return sprintf(buffer, "%c", val ? 'Y' : 'N');
319} 328}
320EXPORT_SYMBOL(param_get_bool); 329EXPORT_SYMBOL(param_get_bool);
321 330
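
The param_set_bool()/param_get_bool() hunk above returns to the flag-driven storage model: KPARAM_ISBOOL decides whether kp->arg points at a real bool or at a historical int-backed bool. The sketch below is a hedged userspace model of that idea; struct kparam, the KPARAM_ISBOOL value and strtobool_sketch() are simplified stand-ins, not the kernel's types.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define KPARAM_ISBOOL 2          /* illustrative flag value */

struct kparam { unsigned flags; void *arg; };

static int strtobool_sketch(const char *s, bool *res)
{
	if (!strcmp(s, "1") || !strcmp(s, "y") || !strcmp(s, "Y"))
		*res = true;
	else if (!strcmp(s, "0") || !strcmp(s, "n") || !strcmp(s, "N"))
		*res = false;
	else
		return -1;
	return 0;
}

static int set_bool(const char *val, struct kparam *kp)
{
	bool v;

	if (!val)
		val = "1";               /* "no equals" means "set" */
	if (strtobool_sketch(val, &v))
		return -1;
	if (kp->flags & KPARAM_ISBOOL)
		*(bool *)kp->arg = v;    /* genuine bool parameter   */
	else
		*(int *)kp->arg = v;     /* historical int-backed bool */
	return 0;
}

int main(void)
{
	bool b = false;
	int i = 0;
	struct kparam kb = { KPARAM_ISBOOL, &b }, ki = { 0, &i };

	set_bool("y", &kb);
	set_bool(NULL, &ki);
	printf("b=%d i=%d\n", b, i);     /* prints "b=1 i=1" */
	return 0;
}
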
@@ -333,6 +342,7 @@ int param_set_invbool(const char *val, const struct kernel_param *kp)
333 struct kernel_param dummy; 342 struct kernel_param dummy;
334 343
335 dummy.arg = &boolval; 344 dummy.arg = &boolval;
345 dummy.flags = KPARAM_ISBOOL;
336 ret = param_set_bool(val, &dummy); 346 ret = param_set_bool(val, &dummy);
337 if (ret == 0) 347 if (ret == 0)
338 *(bool *)kp->arg = !boolval; 348 *(bool *)kp->arg = !boolval;
@@ -352,36 +362,13 @@ struct kernel_param_ops param_ops_invbool = {
352}; 362};
353EXPORT_SYMBOL(param_ops_invbool); 363EXPORT_SYMBOL(param_ops_invbool);
354 364
355int param_set_bint(const char *val, const struct kernel_param *kp)
356{
357 struct kernel_param boolkp;
358 bool v;
359 int ret;
360
361 /* Match bool exactly, by re-using it. */
362 boolkp = *kp;
363 boolkp.arg = &v;
364
365 ret = param_set_bool(val, &boolkp);
366 if (ret == 0)
367 *(int *)kp->arg = v;
368 return ret;
369}
370EXPORT_SYMBOL(param_set_bint);
371
372struct kernel_param_ops param_ops_bint = {
373 .set = param_set_bint,
374 .get = param_get_int,
375};
376EXPORT_SYMBOL(param_ops_bint);
377
378/* We break the rule and mangle the string. */ 365/* We break the rule and mangle the string. */
379static int param_array(const char *name, 366static int param_array(const char *name,
380 const char *val, 367 const char *val,
381 unsigned int min, unsigned int max, 368 unsigned int min, unsigned int max,
382 void *elem, int elemsize, 369 void *elem, int elemsize,
383 int (*set)(const char *, const struct kernel_param *kp), 370 int (*set)(const char *, const struct kernel_param *kp),
384 s16 level, 371 u16 flags,
385 unsigned int *num) 372 unsigned int *num)
386{ 373{
387 int ret; 374 int ret;
@@ -391,7 +378,7 @@ static int param_array(const char *name,
391 /* Get the name right for errors. */ 378 /* Get the name right for errors. */
392 kp.name = name; 379 kp.name = name;
393 kp.arg = elem; 380 kp.arg = elem;
394 kp.level = level; 381 kp.flags = flags;
395 382
396 *num = 0; 383 *num = 0;
397 /* We expect a comma-separated list of values. */ 384 /* We expect a comma-separated list of values. */
@@ -399,7 +386,8 @@ static int param_array(const char *name,
399 int len; 386 int len;
400 387
401 if (*num == max) { 388 if (*num == max) {
402 pr_err("%s: can only take %i arguments\n", name, max); 389 printk(KERN_ERR "%s: can only take %i arguments\n",
390 name, max);
403 return -EINVAL; 391 return -EINVAL;
404 } 392 }
405 len = strcspn(val, ","); 393 len = strcspn(val, ",");
@@ -418,7 +406,8 @@ static int param_array(const char *name,
418 } while (save == ','); 406 } while (save == ',');
419 407
420 if (*num < min) { 408 if (*num < min) {
421 pr_err("%s: needs at least %i arguments\n", name, min); 409 printk(KERN_ERR "%s: needs at least %i arguments\n",
410 name, min);
422 return -EINVAL; 411 return -EINVAL;
423 } 412 }
424 return 0; 413 return 0;
@@ -430,7 +419,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
430 unsigned int temp_num; 419 unsigned int temp_num;
431 420
432 return param_array(kp->name, val, 1, arr->max, arr->elem, 421 return param_array(kp->name, val, 1, arr->max, arr->elem,
433 arr->elemsize, arr->ops->set, kp->level, 422 arr->elemsize, arr->ops->set, kp->flags,
434 arr->num ?: &temp_num); 423 arr->num ?: &temp_num);
435} 424}
436 425
@@ -477,7 +466,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
477 const struct kparam_string *kps = kp->str; 466 const struct kparam_string *kps = kp->str;
478 467
479 if (strlen(val)+1 > kps->maxlen) { 468 if (strlen(val)+1 > kps->maxlen) {
480 pr_err("%s: string doesn't fit in %u chars.\n", 469 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
481 kp->name, kps->maxlen-1); 470 kp->name, kps->maxlen-1);
482 return -ENOSPC; 471 return -ENOSPC;
483 } 472 }
@@ -747,8 +736,11 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
747#endif 736#endif
748 if (err) { 737 if (err) {
749 kobject_put(&mk->kobj); 738 kobject_put(&mk->kobj);
750 pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", 739 printk(KERN_ERR
740 "Module '%s' failed add to sysfs, error number %d\n",
751 name, err); 741 name, err);
742 printk(KERN_ERR
743 "The system will be unstable now.\n");
752 return NULL; 744 return NULL;
753 } 745 }
754 746
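
As a reading aid for the param_array() hunks in this file: the comma-separated value string is walked with strcspn() and each element is handed to the per-type setter, with too-long and too-short lists rejected. A rough standalone sketch follows; the real code mangles the string in place and reports errors with printk, and set_elem() here is a placeholder for ops->set().

#include <stdio.h>
#include <string.h>

static int set_elem(const char *val, size_t len)
{
	printf("element: %.*s\n", (int)len, val);
	return 0;
}

static int parse_array(const char *val, unsigned min, unsigned max)
{
	unsigned num = 0;
	char save;

	do {
		size_t len = strcspn(val, ",");

		if (num == max)
			return -1;       /* "can only take %i arguments" */
		save = val[len];
		if (set_elem(val, len))
			return -1;
		num++;
		val += len + 1;
	} while (save == ',');

	if (num < min)
		return -1;               /* "needs at least %i arguments" */
	return 0;
}

int main(void)
{
	return parse_array("1,2,3", 1, 4);   /* prints three elements */
}
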
diff --git a/kernel/pid.c b/kernel/pid.c
index de9af600006..e432057f3b2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Generic pidhash and scalable, time-bounded PID allocator 2 * Generic pidhash and scalable, time-bounded PID allocator
3 * 3 *
4 * (C) 2002-2003 Nadia Yvette Chambers, IBM 4 * (C) 2002-2003 William Irwin, IBM
5 * (C) 2004 Nadia Yvette Chambers, Oracle 5 * (C) 2004 William Irwin, Oracle
6 * (C) 2002-2004 Ingo Molnar, Red Hat 6 * (C) 2002-2004 Ingo Molnar, Red Hat
7 * 7 *
8 * pid-structures are backing objects for tasks sharing a given ID to chain 8 * pid-structures are backing objects for tasks sharing a given ID to chain
@@ -27,7 +27,7 @@
27 */ 27 */
28 28
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/export.h> 30#include <linux/module.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/rculist.h> 33#include <linux/rculist.h>
@@ -36,7 +36,6 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_fs.h>
40 39
41#define pid_hashfn(nr, ns) \ 40#define pid_hashfn(nr, ns) \
42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -79,11 +78,24 @@ struct pid_namespace init_pid_ns = {
79 .last_pid = 0, 78 .last_pid = 0,
80 .level = 0, 79 .level = 0,
81 .child_reaper = &init_task, 80 .child_reaper = &init_task,
82 .user_ns = &init_user_ns,
83 .proc_inum = PROC_PID_INIT_INO,
84}; 81};
85EXPORT_SYMBOL_GPL(init_pid_ns); 82EXPORT_SYMBOL_GPL(init_pid_ns);
86 83
84int is_container_init(struct task_struct *tsk)
85{
86 int ret = 0;
87 struct pid *pid;
88
89 rcu_read_lock();
90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1;
93 rcu_read_unlock();
94
95 return ret;
96}
97EXPORT_SYMBOL(is_container_init);
98
87/* 99/*
88 * Note: disable interrupts while the pidmap_lock is held as an 100 * Note: disable interrupts while the pidmap_lock is held as an
89 * interrupt might come in and do read_lock(&tasklist_lock). 101 * interrupt might come in and do read_lock(&tasklist_lock).
@@ -125,9 +137,7 @@ static int pid_before(int base, int a, int b)
125} 137}
126 138
127/* 139/*
128 * We might be racing with someone else trying to set pid_ns->last_pid 140 * We might be racing with someone else trying to set pid_ns->last_pid.
129 * at the pid allocation time (there's also a sysctl for this, but racing
130 * with this one is OK, see comment in kernel/pid_namespace.c about it).
131 * We want the winner to have the "later" value, because if the 141 * We want the winner to have the "later" value, because if the
132 * "earlier" value prevails, then a pid may get reused immediately. 142 * "earlier" value prevails, then a pid may get reused immediately.
133 * 143 *
@@ -257,23 +267,8 @@ void free_pid(struct pid *pid)
257 unsigned long flags; 267 unsigned long flags;
258 268
259 spin_lock_irqsave(&pidmap_lock, flags); 269 spin_lock_irqsave(&pidmap_lock, flags);
260 for (i = 0; i <= pid->level; i++) { 270 for (i = 0; i <= pid->level; i++)
261 struct upid *upid = pid->numbers + i; 271 hlist_del_rcu(&pid->numbers[i].pid_chain);
262 struct pid_namespace *ns = upid->ns;
263 hlist_del_rcu(&upid->pid_chain);
264 switch(--ns->nr_hashed) {
265 case 1:
266 /* When all that is left in the pid namespace
267 * is the reaper wake up the reaper. The reaper
268 * may be sleeping in zap_pid_ns_processes().
269 */
270 wake_up_process(ns->child_reaper);
271 break;
272 case 0:
273 schedule_work(&ns->proc_work);
274 break;
275 }
276 }
277 spin_unlock_irqrestore(&pidmap_lock, flags); 272 spin_unlock_irqrestore(&pidmap_lock, flags);
278 273
279 for (i = 0; i <= pid->level; i++) 274 for (i = 0; i <= pid->level; i++)
@@ -295,7 +290,6 @@ struct pid *alloc_pid(struct pid_namespace *ns)
295 goto out; 290 goto out;
296 291
297 tmp = ns; 292 tmp = ns;
298 pid->level = ns->level;
299 for (i = ns->level; i >= 0; i--) { 293 for (i = ns->level; i >= 0; i--) {
300 nr = alloc_pidmap(tmp); 294 nr = alloc_pidmap(tmp);
301 if (nr < 0) 295 if (nr < 0)
@@ -306,32 +300,22 @@ struct pid *alloc_pid(struct pid_namespace *ns)
306 tmp = tmp->parent; 300 tmp = tmp->parent;
307 } 301 }
308 302
309 if (unlikely(is_child_reaper(pid))) {
310 if (pid_ns_prepare_proc(ns))
311 goto out_free;
312 }
313
314 get_pid_ns(ns); 303 get_pid_ns(ns);
304 pid->level = ns->level;
315 atomic_set(&pid->count, 1); 305 atomic_set(&pid->count, 1);
316 for (type = 0; type < PIDTYPE_MAX; ++type) 306 for (type = 0; type < PIDTYPE_MAX; ++type)
317 INIT_HLIST_HEAD(&pid->tasks[type]); 307 INIT_HLIST_HEAD(&pid->tasks[type]);
318 308
319 upid = pid->numbers + ns->level; 309 upid = pid->numbers + ns->level;
320 spin_lock_irq(&pidmap_lock); 310 spin_lock_irq(&pidmap_lock);
321 if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) 311 for ( ; upid >= pid->numbers; --upid)
322 goto out_unlock;
323 for ( ; upid >= pid->numbers; --upid) {
324 hlist_add_head_rcu(&upid->pid_chain, 312 hlist_add_head_rcu(&upid->pid_chain,
325 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 313 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
326 upid->ns->nr_hashed++;
327 }
328 spin_unlock_irq(&pidmap_lock); 314 spin_unlock_irq(&pidmap_lock);
329 315
330out: 316out:
331 return pid; 317 return pid;
332 318
333out_unlock:
334 spin_unlock(&pidmap_lock);
335out_free: 319out_free:
336 while (++i <= ns->level) 320 while (++i <= ns->level)
337 free_pidmap(pid->numbers + i); 321 free_pidmap(pid->numbers + i);
@@ -341,13 +325,6 @@ out_free:
341 goto out; 325 goto out;
342} 326}
343 327
344void disable_pid_allocation(struct pid_namespace *ns)
345{
346 spin_lock_irq(&pidmap_lock);
347 ns->nr_hashed &= ~PIDNS_HASH_ADDING;
348 spin_unlock_irq(&pidmap_lock);
349}
350
351struct pid *find_pid_ns(int nr, struct pid_namespace *ns) 328struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
352{ 329{
353 struct hlist_node *elem; 330 struct hlist_node *elem;
@@ -365,7 +342,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
365 342
366struct pid *find_vpid(int nr) 343struct pid *find_vpid(int nr)
367{ 344{
368 return find_pid_ns(nr, task_active_pid_ns(current)); 345 return find_pid_ns(nr, current->nsproxy->pid_ns);
369} 346}
370EXPORT_SYMBOL_GPL(find_vpid); 347EXPORT_SYMBOL_GPL(find_vpid);
371 348
@@ -441,15 +418,13 @@ EXPORT_SYMBOL(pid_task);
441 */ 418 */
442struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 419struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
443{ 420{
444 rcu_lockdep_assert(rcu_read_lock_held(), 421 rcu_lockdep_assert(rcu_read_lock_held());
445 "find_task_by_pid_ns() needs rcu_read_lock()"
446 " protection");
447 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 422 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
448} 423}
449 424
450struct task_struct *find_task_by_vpid(pid_t vnr) 425struct task_struct *find_task_by_vpid(pid_t vnr)
451{ 426{
452 return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); 427 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
453} 428}
454 429
455struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 430struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -500,11 +475,10 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
500 } 475 }
501 return nr; 476 return nr;
502} 477}
503EXPORT_SYMBOL_GPL(pid_nr_ns);
504 478
505pid_t pid_vnr(struct pid *pid) 479pid_t pid_vnr(struct pid *pid)
506{ 480{
507 return pid_nr_ns(pid, task_active_pid_ns(current)); 481 return pid_nr_ns(pid, current->nsproxy->pid_ns);
508} 482}
509EXPORT_SYMBOL_GPL(pid_vnr); 483EXPORT_SYMBOL_GPL(pid_vnr);
510 484
@@ -515,7 +489,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
515 489
516 rcu_read_lock(); 490 rcu_read_lock();
517 if (!ns) 491 if (!ns)
518 ns = task_active_pid_ns(current); 492 ns = current->nsproxy->pid_ns;
519 if (likely(pid_alive(task))) { 493 if (likely(pid_alive(task))) {
520 if (type != PIDTYPE_PID) 494 if (type != PIDTYPE_PID)
521 task = task->group_leader; 495 task = task->group_leader;
@@ -565,13 +539,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
565 */ 539 */
566void __init pidhash_init(void) 540void __init pidhash_init(void)
567{ 541{
568 unsigned int i, pidhash_size; 542 int i, pidhash_size;
569 543
570 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 544 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
571 HASH_EARLY | HASH_SMALL, 545 HASH_EARLY | HASH_SMALL,
572 &pidhash_shift, NULL, 546 &pidhash_shift, NULL, 4096);
573 0, 4096); 547 pidhash_size = 1 << pidhash_shift;
574 pidhash_size = 1U << pidhash_shift;
575 548
576 for (i = 0; i < pidhash_size; i++) 549 for (i = 0; i < pidhash_size; i++)
577 INIT_HLIST_HEAD(&pid_hash[i]); 550 INIT_HLIST_HEAD(&pid_hash[i]);
@@ -579,9 +552,6 @@ void __init pidhash_init(void)
579 552
580void __init pidmap_init(void) 553void __init pidmap_init(void)
581{ 554{
582 /* Veryify no one has done anything silly */
583 BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
584
585 /* bump default and minimum pid_max based on number of cpus */ 555 /* bump default and minimum pid_max based on number of cpus */
586 pid_max = min(pid_max_max, max_t(int, pid_max, 556 pid_max = min(pid_max_max, max_t(int, pid_max,
587 PIDS_PER_CPU_DEFAULT * num_possible_cpus())); 557 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
@@ -593,7 +563,6 @@ void __init pidmap_init(void)
593 /* Reserve PID 0. We never call free_pidmap(0) */ 563 /* Reserve PID 0. We never call free_pidmap(0) */
594 set_bit(0, init_pid_ns.pidmap[0].page); 564 set_bit(0, init_pid_ns.pidmap[0].page);
595 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 565 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
596 init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
597 566
598 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 567 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
599 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 568 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
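
For orientation while reading the pid.c hunks: pid_hashfn() (unchanged by this diff) buckets a (pid number, namespace pointer) pair into the table sized in pidhash_init(). The sketch below models that with a generic Fibonacci hash; hash_sketch() and its constant are illustrative and are not the kernel's hash_long() implementation.

#include <stdint.h>
#include <stdio.h>

static unsigned long long hash_sketch(unsigned long long val, unsigned bits)
{
	/* Fibonacci hashing: multiply by ~2^64/phi, keep the top 'bits' bits. */
	return (val * 0x9E3779B97F4A7C15ULL) >> (64 - bits);
}

static unsigned long long pid_hashfn_sketch(unsigned long nr, const void *ns,
					    unsigned pidhash_shift)
{
	return hash_sketch(nr + (uintptr_t)ns, pidhash_shift);
}

int main(void)
{
	static int fake_ns;              /* stand-in for &init_pid_ns       */
	unsigned pidhash_shift = 4;      /* 1 << 4 = 16 buckets for the demo */

	for (unsigned long nr = 1; nr <= 4; nr++)
		printf("pid %lu -> bucket %llu\n", nr,
		       pid_hashfn_sketch(nr, &fake_ns, pidhash_shift));
	return 0;
}
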
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index c1c3dc1c602..e9c9adc84ca 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,14 +10,11 @@
10 10
11#include <linux/pid.h> 11#include <linux/pid.h>
12#include <linux/pid_namespace.h> 12#include <linux/pid_namespace.h>
13#include <linux/user_namespace.h>
14#include <linux/syscalls.h> 13#include <linux/syscalls.h>
15#include <linux/err.h> 14#include <linux/err.h>
16#include <linux/acct.h> 15#include <linux/acct.h>
17#include <linux/slab.h> 16#include <linux/slab.h>
18#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
19#include <linux/reboot.h>
20#include <linux/export.h>
21 18
22#define BITS_PER_PAGE (PAGE_SIZE*8) 19#define BITS_PER_PAGE (PAGE_SIZE*8)
23 20
@@ -72,29 +69,12 @@ err_alloc:
72 return NULL; 69 return NULL;
73} 70}
74 71
75static void proc_cleanup_work(struct work_struct *work) 72static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
76{
77 struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
78 pid_ns_release_proc(ns);
79}
80
81/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
82#define MAX_PID_NS_LEVEL 32
83
84static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
85 struct pid_namespace *parent_pid_ns)
86{ 73{
87 struct pid_namespace *ns; 74 struct pid_namespace *ns;
88 unsigned int level = parent_pid_ns->level + 1; 75 unsigned int level = parent_pid_ns->level + 1;
89 int i; 76 int i, err = -ENOMEM;
90 int err;
91 77
92 if (level > MAX_PID_NS_LEVEL) {
93 err = -EINVAL;
94 goto out;
95 }
96
97 err = -ENOMEM;
98 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 78 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
99 if (ns == NULL) 79 if (ns == NULL)
100 goto out; 80 goto out;
@@ -107,16 +87,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
107 if (ns->pid_cachep == NULL) 87 if (ns->pid_cachep == NULL)
108 goto out_free_map; 88 goto out_free_map;
109 89
110 err = proc_alloc_inum(&ns->proc_inum);
111 if (err)
112 goto out_free_map;
113
114 kref_init(&ns->kref); 90 kref_init(&ns->kref);
115 ns->level = level; 91 ns->level = level;
116 ns->parent = get_pid_ns(parent_pid_ns); 92 ns->parent = get_pid_ns(parent_pid_ns);
117 ns->user_ns = get_user_ns(user_ns);
118 ns->nr_hashed = PIDNS_HASH_ADDING;
119 INIT_WORK(&ns->proc_work, proc_cleanup_work);
120 93
121 set_bit(0, ns->pidmap[0].page); 94 set_bit(0, ns->pidmap[0].page);
122 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 95 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -124,8 +97,14 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
124 for (i = 1; i < PIDMAP_ENTRIES; i++) 97 for (i = 1; i < PIDMAP_ENTRIES; i++)
125 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 98 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
126 99
100 err = pid_ns_prepare_proc(ns);
101 if (err)
102 goto out_put_parent_pid_ns;
103
127 return ns; 104 return ns;
128 105
106out_put_parent_pid_ns:
107 put_pid_ns(parent_pid_ns);
129out_free_map: 108out_free_map:
130 kfree(ns->pidmap[0].page); 109 kfree(ns->pidmap[0].page);
131out_free: 110out_free:
@@ -138,57 +117,38 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
138{ 117{
139 int i; 118 int i;
140 119
141 proc_free_inum(ns->proc_inum);
142 for (i = 0; i < PIDMAP_ENTRIES; i++) 120 for (i = 0; i < PIDMAP_ENTRIES; i++)
143 kfree(ns->pidmap[i].page); 121 kfree(ns->pidmap[i].page);
144 put_user_ns(ns->user_ns);
145 kmem_cache_free(pid_ns_cachep, ns); 122 kmem_cache_free(pid_ns_cachep, ns);
146} 123}
147 124
148struct pid_namespace *copy_pid_ns(unsigned long flags, 125struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
149 struct user_namespace *user_ns, struct pid_namespace *old_ns)
150{ 126{
151 if (!(flags & CLONE_NEWPID)) 127 if (!(flags & CLONE_NEWPID))
152 return get_pid_ns(old_ns); 128 return get_pid_ns(old_ns);
153 if (task_active_pid_ns(current) != old_ns) 129 if (flags & (CLONE_THREAD|CLONE_PARENT))
154 return ERR_PTR(-EINVAL); 130 return ERR_PTR(-EINVAL);
155 return create_pid_namespace(user_ns, old_ns); 131 return create_pid_namespace(old_ns);
156} 132}
157 133
158static void free_pid_ns(struct kref *kref) 134void free_pid_ns(struct kref *kref)
159{ 135{
160 struct pid_namespace *ns; 136 struct pid_namespace *ns, *parent;
161 137
162 ns = container_of(kref, struct pid_namespace, kref); 138 ns = container_of(kref, struct pid_namespace, kref);
163 destroy_pid_namespace(ns);
164}
165 139
166void put_pid_ns(struct pid_namespace *ns) 140 parent = ns->parent;
167{ 141 destroy_pid_namespace(ns);
168 struct pid_namespace *parent;
169 142
170 while (ns != &init_pid_ns) { 143 if (parent != NULL)
171 parent = ns->parent; 144 put_pid_ns(parent);
172 if (!kref_put(&ns->kref, free_pid_ns))
173 break;
174 ns = parent;
175 }
176} 145}
177EXPORT_SYMBOL_GPL(put_pid_ns);
178 146
179void zap_pid_ns_processes(struct pid_namespace *pid_ns) 147void zap_pid_ns_processes(struct pid_namespace *pid_ns)
180{ 148{
181 int nr; 149 int nr;
182 int rc; 150 int rc;
183 struct task_struct *task, *me = current; 151 struct task_struct *task;
184
185 /* Don't allow any more processes into the pid namespace */
186 disable_pid_allocation(pid_ns);
187
188 /* Ignore SIGCHLD causing any terminated children to autoreap */
189 spin_lock_irq(&me->sighand->siglock);
190 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
191 spin_unlock_irq(&me->sighand->siglock);
192 152
193 /* 153 /*
194 * The last thread in the cgroup-init thread group is terminating. 154 * The last thread in the cgroup-init thread group is terminating.
@@ -208,9 +168,13 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
208 while (nr > 0) { 168 while (nr > 0) {
209 rcu_read_lock(); 169 rcu_read_lock();
210 170
171 /*
172 * Any nested-container's init processes won't ignore the
173 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
174 */
211 task = pid_task(find_vpid(nr), PIDTYPE_PID); 175 task = pid_task(find_vpid(nr), PIDTYPE_PID);
212 if (task && !__fatal_signal_pending(task)) 176 if (task)
213 send_sig_info(SIGKILL, SEND_SIG_FORCED, task); 177 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
214 178
215 rcu_read_unlock(); 179 rcu_read_unlock();
216 180
@@ -218,165 +182,18 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
218 } 182 }
219 read_unlock(&tasklist_lock); 183 read_unlock(&tasklist_lock);
220 184
221 /* Firstly reap the EXIT_ZOMBIE children we may have. */
222 do { 185 do {
223 clear_thread_flag(TIF_SIGPENDING); 186 clear_thread_flag(TIF_SIGPENDING);
224 rc = sys_wait4(-1, NULL, __WALL, NULL); 187 rc = sys_wait4(-1, NULL, __WALL, NULL);
225 } while (rc != -ECHILD); 188 } while (rc != -ECHILD);
226 189
227 /*
228 * sys_wait4() above can't reap the TASK_DEAD children.
229 * Make sure they all go away, see free_pid().
230 */
231 for (;;) {
232 set_current_state(TASK_UNINTERRUPTIBLE);
233 if (pid_ns->nr_hashed == 1)
234 break;
235 schedule();
236 }
237 __set_current_state(TASK_RUNNING);
238
239 if (pid_ns->reboot)
240 current->signal->group_exit_code = pid_ns->reboot;
241
242 acct_exit_ns(pid_ns); 190 acct_exit_ns(pid_ns);
243 return; 191 return;
244} 192}
245 193
246#ifdef CONFIG_CHECKPOINT_RESTORE
247static int pid_ns_ctl_handler(struct ctl_table *table, int write,
248 void __user *buffer, size_t *lenp, loff_t *ppos)
249{
250 struct pid_namespace *pid_ns = task_active_pid_ns(current);
251 struct ctl_table tmp = *table;
252
253 if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
254 return -EPERM;
255
256 /*
257 * Writing directly to ns' last_pid field is OK, since this field
258 * is volatile in a living namespace anyway and a code writing to
259 * it should synchronize its usage with external means.
260 */
261
262 tmp.data = &pid_ns->last_pid;
263 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
264}
265
266extern int pid_max;
267static int zero = 0;
268static struct ctl_table pid_ns_ctl_table[] = {
269 {
270 .procname = "ns_last_pid",
271 .maxlen = sizeof(int),
272 .mode = 0666, /* permissions are checked in the handler */
273 .proc_handler = pid_ns_ctl_handler,
274 .extra1 = &zero,
275 .extra2 = &pid_max,
276 },
277 { }
278};
279static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
280#endif /* CONFIG_CHECKPOINT_RESTORE */
281
282int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
283{
284 if (pid_ns == &init_pid_ns)
285 return 0;
286
287 switch (cmd) {
288 case LINUX_REBOOT_CMD_RESTART2:
289 case LINUX_REBOOT_CMD_RESTART:
290 pid_ns->reboot = SIGHUP;
291 break;
292
293 case LINUX_REBOOT_CMD_POWER_OFF:
294 case LINUX_REBOOT_CMD_HALT:
295 pid_ns->reboot = SIGINT;
296 break;
297 default:
298 return -EINVAL;
299 }
300
301 read_lock(&tasklist_lock);
302 force_sig(SIGKILL, pid_ns->child_reaper);
303 read_unlock(&tasklist_lock);
304
305 do_exit(0);
306
307 /* Not reached */
308 return 0;
309}
310
311static void *pidns_get(struct task_struct *task)
312{
313 struct pid_namespace *ns;
314
315 rcu_read_lock();
316 ns = get_pid_ns(task_active_pid_ns(task));
317 rcu_read_unlock();
318
319 return ns;
320}
321
322static void pidns_put(void *ns)
323{
324 put_pid_ns(ns);
325}
326
327static int pidns_install(struct nsproxy *nsproxy, void *ns)
328{
329 struct pid_namespace *active = task_active_pid_ns(current);
330 struct pid_namespace *ancestor, *new = ns;
331
332 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
333 !nsown_capable(CAP_SYS_ADMIN))
334 return -EPERM;
335
336 /*
337 * Only allow entering the current active pid namespace
338 * or a child of the current active pid namespace.
339 *
340 * This is required for fork to return a usable pid value and
341 * this maintains the property that processes and their
342 * children can not escape their current pid namespace.
343 */
344 if (new->level < active->level)
345 return -EINVAL;
346
347 ancestor = new;
348 while (ancestor->level > active->level)
349 ancestor = ancestor->parent;
350 if (ancestor != active)
351 return -EINVAL;
352
353 put_pid_ns(nsproxy->pid_ns);
354 nsproxy->pid_ns = get_pid_ns(new);
355 return 0;
356}
357
358static unsigned int pidns_inum(void *ns)
359{
360 struct pid_namespace *pid_ns = ns;
361 return pid_ns->proc_inum;
362}
363
364const struct proc_ns_operations pidns_operations = {
365 .name = "pid",
366 .type = CLONE_NEWPID,
367 .get = pidns_get,
368 .put = pidns_put,
369 .install = pidns_install,
370 .inum = pidns_inum,
371};
372
373static __init int pid_namespaces_init(void) 194static __init int pid_namespaces_init(void)
374{ 195{
375 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 196 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
376
377#ifdef CONFIG_CHECKPOINT_RESTORE
378 register_sysctl_paths(kern_path, pid_ns_ctl_table);
379#endif
380 return 0; 197 return 0;
381} 198}
382 199
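
Both sides of the free_pid_ns()/put_pid_ns() hunk above implement the same ownership rule: dropping the last reference to a pid namespace also drops the reference it holds on its parent, so destruction can ripple up the chain. Below is a simplified userspace model of that pattern, with plain integer refcounts and a made-up struct ns instead of the kernel's kref API.

#include <stdio.h>
#include <stdlib.h>

struct ns {
	int refcount;
	int level;
	struct ns *parent;
};

static void put_ns(struct ns *ns)
{
	while (ns && --ns->refcount == 0) {
		struct ns *parent = ns->parent;

		printf("destroying namespace at level %d\n", ns->level);
		free(ns);
		ns = parent;            /* drop the reference the child held */
	}
}

static struct ns *make_child(struct ns *parent)
{
	struct ns *ns = malloc(sizeof(*ns));

	ns->refcount = 1;
	ns->level = parent ? parent->level + 1 : 0;
	ns->parent = parent;
	if (parent)
		parent->refcount++;     /* a child pins its parent */
	return ns;
}

int main(void)
{
	struct ns *root = make_child(NULL);
	struct ns *child = make_child(root);
	struct ns *grandchild = make_child(child);

	put_ns(grandchild);   /* frees level 2, drops child to refcount 1 */
	put_ns(child);        /* frees level 1, drops root to refcount 1  */
	put_ns(root);         /* frees level 0                            */
	return 0;
}
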
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index a278cad1d5d..640ded8f5c4 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,7 +9,6 @@
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12#include <linux/random.h>
13 12
14/* 13/*
15 * Called after updating RLIMIT_CPU to run cpu timer and update 14 * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -79,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
79 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
80 return now.sched < then.sched; 79 return now.sched < then.sched;
81 } else { 80 } else {
82 return now.cpu < then.cpu; 81 return cputime_lt(now.cpu, then.cpu);
83 } 82 }
84} 83}
85static inline void cpu_time_add(const clockid_t which_clock, 84static inline void cpu_time_add(const clockid_t which_clock,
@@ -89,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
89 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
90 acc->sched += val.sched; 89 acc->sched += val.sched;
91 } else { 90 } else {
92 acc->cpu += val.cpu; 91 acc->cpu = cputime_add(acc->cpu, val.cpu);
93 } 92 }
94} 93}
95static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, 94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -99,12 +98,25 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
99 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
100 a.sched -= b.sched; 99 a.sched -= b.sched;
101 } else { 100 } else {
102 a.cpu -= b.cpu; 101 a.cpu = cputime_sub(a.cpu, b.cpu);
103 } 102 }
104 return a; 103 return a;
105} 104}
106 105
107/* 106/*
107 * Divide and limit the result to res >= 1
108 *
109 * This is necessary to prevent signal delivery starvation, when the result of
110 * the division would be rounded down to 0.
111 */
112static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
113{
114 cputime_t res = cputime_div(time, div);
115
116 return max_t(cputime_t, res, 1);
117}
118
119/*
108 * Update expiry time from increment, and increase overrun count, 120 * Update expiry time from increment, and increase overrun count,
109 * given the current clock sample. 121 * given the current clock sample.
110 */ 122 */
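
cputime_div_non_zero(), added in the hunk above, is simply an integer division clamped to a minimum of 1, so that repeatedly splitting a time budget can never round down to zero and starve signal delivery. A tiny sketch with plain integers standing in for cputime_t:

#include <stdio.h>

static unsigned long div_non_zero(unsigned long time, unsigned long div)
{
	unsigned long res = time / div;

	return res ? res : 1;           /* never return 0 */
}

int main(void)
{
	printf("%lu %lu\n", div_non_zero(100, 7), div_non_zero(3, 7));
	/* prints "14 1": the second division would otherwise round to 0 */
	return 0;
}
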
@@ -136,26 +148,28 @@ static void bump_cpu_timer(struct k_itimer *timer,
136 } else { 148 } else {
137 cputime_t delta, incr; 149 cputime_t delta, incr;
138 150
139 if (now.cpu < timer->it.cpu.expires.cpu) 151 if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu))
140 return; 152 return;
141 incr = timer->it.cpu.incr.cpu; 153 incr = timer->it.cpu.incr.cpu;
142 delta = now.cpu + incr - timer->it.cpu.expires.cpu; 154 delta = cputime_sub(cputime_add(now.cpu, incr),
155 timer->it.cpu.expires.cpu);
143 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 156 /* Don't use (incr*2 < delta), incr*2 might overflow. */
144 for (i = 0; incr < delta - incr; i++) 157 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
145 incr += incr; 158 incr = cputime_add(incr, incr);
146 for (; i >= 0; incr = incr >> 1, i--) { 159 for (; i >= 0; incr = cputime_halve(incr), i--) {
147 if (delta < incr) 160 if (cputime_lt(delta, incr))
148 continue; 161 continue;
149 timer->it.cpu.expires.cpu += incr; 162 timer->it.cpu.expires.cpu =
163 cputime_add(timer->it.cpu.expires.cpu, incr);
150 timer->it_overrun += 1 << i; 164 timer->it_overrun += 1 << i;
151 delta -= incr; 165 delta = cputime_sub(delta, incr);
152 } 166 }
153 } 167 }
154} 168}
155 169
156static inline cputime_t prof_ticks(struct task_struct *p) 170static inline cputime_t prof_ticks(struct task_struct *p)
157{ 171{
158 return p->utime + p->stime; 172 return cputime_add(p->utime, p->stime);
159} 173}
160static inline cputime_t virt_ticks(struct task_struct *p) 174static inline cputime_t virt_ticks(struct task_struct *p)
161{ 175{
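
The heart of the bump_cpu_timer() hunk above is the overrun arithmetic: advance a periodic timer's expiry past "now" in power-of-two multiples of the increment, accumulating the overrun count, without a multiplication that could overflow (hence the "Don't use (incr*2 < delta)" comment). A standalone sketch of that algorithm using plain integers instead of the cputime_t helpers:

#include <stdio.h>

static void bump_timer(unsigned long long *expires, unsigned long long incr,
		       unsigned long long now, int *overrun)
{
	unsigned long long delta;
	int i;

	if (now < *expires)
		return;                          /* not expired yet */

	delta = now + incr - *expires;
	/* Find the largest power-of-two multiple of incr not exceeding delta. */
	for (i = 0; incr < delta - incr; i++)
		incr += incr;
	/* Walk back down, consuming delta in big chunks first. */
	for (; i >= 0; incr >>= 1, i--) {
		if (delta < incr)
			continue;
		*expires += incr;
		*overrun += 1 << i;
		delta -= incr;
	}
}

int main(void)
{
	unsigned long long expires = 10, incr = 3, now = 20;
	int overrun = 0;

	bump_timer(&expires, incr, now, &overrun);
	printf("expires=%llu overrun=%d\n", expires, overrun);
	/* prints "expires=22 overrun=4": firings at 10, 13, 16, 19 were missed */
	return 0;
}
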
@@ -218,12 +232,36 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
218 return 0; 232 return 0;
219} 233}
220 234
235void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
236{
237 struct signal_struct *sig = tsk->signal;
238 struct task_struct *t;
239
240 times->utime = sig->utime;
241 times->stime = sig->stime;
242 times->sum_exec_runtime = sig->sum_sched_runtime;
243
244 rcu_read_lock();
245 /* make sure we can trust tsk->thread_group list */
246 if (!likely(pid_alive(tsk)))
247 goto out;
248
249 t = tsk;
250 do {
251 times->utime = cputime_add(times->utime, t->utime);
252 times->stime = cputime_add(times->stime, t->stime);
253 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t);
255out:
256 rcu_read_unlock();
257}
258
221static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 259static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
222{ 260{
223 if (b->utime > a->utime) 261 if (cputime_gt(b->utime, a->utime))
224 a->utime = b->utime; 262 a->utime = b->utime;
225 263
226 if (b->stime > a->stime) 264 if (cputime_gt(b->stime, a->stime))
227 a->stime = b->stime; 265 a->stime = b->stime;
228 266
229 if (b->sum_exec_runtime > a->sum_exec_runtime) 267 if (b->sum_exec_runtime > a->sum_exec_runtime)
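
thread_group_cputime(), added back in the hunk above, starts from the totals already accumulated for exited threads in signal_struct and adds the live threads' counters. The sketch below is a simplified model: a plain array stands in for the RCU-protected while_each_thread() walk, and the field names are illustrative.

#include <stdio.h>

struct times { unsigned long long utime, stime, sum_exec_runtime; };

static void group_cputime(const struct times *dead_threads,
			  const struct times *live, int nr_live,
			  struct times *out)
{
	*out = *dead_threads;                /* sig->utime, sig->stime, ... */
	for (int i = 0; i < nr_live; i++) {
		out->utime += live[i].utime;
		out->stime += live[i].stime;
		out->sum_exec_runtime += live[i].sum_exec_runtime;
	}
}

int main(void)
{
	struct times dead = { 100, 40, 500 };
	struct times live[2] = { { 10, 5, 50 }, { 20, 8, 70 } };
	struct times total;

	group_cputime(&dead, live, 2, &total);
	printf("utime=%llu stime=%llu runtime=%llu\n",
	       total.utime, total.stime, total.sum_exec_runtime);
	/* prints "utime=130 stime=53 runtime=620" */
	return 0;
}
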
@@ -244,13 +282,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
244 * it. 282 * it.
245 */ 283 */
246 thread_group_cputime(tsk, &sum); 284 thread_group_cputime(tsk, &sum);
247 raw_spin_lock_irqsave(&cputimer->lock, flags); 285 spin_lock_irqsave(&cputimer->lock, flags);
248 cputimer->running = 1; 286 cputimer->running = 1;
249 update_gt_cputime(&cputimer->cputime, &sum); 287 update_gt_cputime(&cputimer->cputime, &sum);
250 } else 288 } else
251 raw_spin_lock_irqsave(&cputimer->lock, flags); 289 spin_lock_irqsave(&cputimer->lock, flags);
252 *times = cputimer->cputime; 290 *times = cputimer->cputime;
253 raw_spin_unlock_irqrestore(&cputimer->lock, flags); 291 spin_unlock_irqrestore(&cputimer->lock, flags);
254} 292}
255 293
256/* 294/*
@@ -268,7 +306,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
268 return -EINVAL; 306 return -EINVAL;
269 case CPUCLOCK_PROF: 307 case CPUCLOCK_PROF:
270 thread_group_cputime(p, &cputime); 308 thread_group_cputime(p, &cputime);
271 cpu->cpu = cputime.utime + cputime.stime; 309 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
272 break; 310 break;
273 case CPUCLOCK_VIRT: 311 case CPUCLOCK_VIRT:
274 thread_group_cputime(p, &cputime); 312 thread_group_cputime(p, &cputime);
@@ -432,24 +470,26 @@ static void cleanup_timers(struct list_head *head,
432 unsigned long long sum_exec_runtime) 470 unsigned long long sum_exec_runtime)
433{ 471{
434 struct cpu_timer_list *timer, *next; 472 struct cpu_timer_list *timer, *next;
435 cputime_t ptime = utime + stime; 473 cputime_t ptime = cputime_add(utime, stime);
436 474
437 list_for_each_entry_safe(timer, next, head, entry) { 475 list_for_each_entry_safe(timer, next, head, entry) {
438 list_del_init(&timer->entry); 476 list_del_init(&timer->entry);
439 if (timer->expires.cpu < ptime) { 477 if (cputime_lt(timer->expires.cpu, ptime)) {
440 timer->expires.cpu = 0; 478 timer->expires.cpu = cputime_zero;
441 } else { 479 } else {
442 timer->expires.cpu -= ptime; 480 timer->expires.cpu = cputime_sub(timer->expires.cpu,
481 ptime);
443 } 482 }
444 } 483 }
445 484
446 ++head; 485 ++head;
447 list_for_each_entry_safe(timer, next, head, entry) { 486 list_for_each_entry_safe(timer, next, head, entry) {
448 list_del_init(&timer->entry); 487 list_del_init(&timer->entry);
449 if (timer->expires.cpu < utime) { 488 if (cputime_lt(timer->expires.cpu, utime)) {
450 timer->expires.cpu = 0; 489 timer->expires.cpu = cputime_zero;
451 } else { 490 } else {
452 timer->expires.cpu -= utime; 491 timer->expires.cpu = cputime_sub(timer->expires.cpu,
492 utime);
453 } 493 }
454 } 494 }
455 495
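
The cleanup_timers() hunk above rebases each remaining timer by the CPU time already consumed, clamping at zero rather than letting the subtraction underflow. A one-function sketch of that clamp, with plain integers in place of cputime_t:

#include <stdio.h>

static unsigned long long rebase(unsigned long long expires,
				 unsigned long long consumed)
{
	return expires < consumed ? 0 : expires - consumed;
}

int main(void)
{
	printf("%llu %llu\n", rebase(100, 30), rebase(20, 30));  /* "70 0" */
	return 0;
}
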
@@ -471,8 +511,6 @@ static void cleanup_timers(struct list_head *head,
471 */ 511 */
472void posix_cpu_timers_exit(struct task_struct *tsk) 512void posix_cpu_timers_exit(struct task_struct *tsk)
473{ 513{
474 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
475 sizeof(unsigned long long));
476 cleanup_timers(tsk->cpu_timers, 514 cleanup_timers(tsk->cpu_timers,
477 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); 515 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
478 516
@@ -482,7 +520,8 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
482 struct signal_struct *const sig = tsk->signal; 520 struct signal_struct *const sig = tsk->signal;
483 521
484 cleanup_timers(tsk->signal->cpu_timers, 522 cleanup_timers(tsk->signal->cpu_timers,
485 tsk->utime + sig->utime, tsk->stime + sig->stime, 523 cputime_add(tsk->utime, sig->utime),
524 cputime_add(tsk->stime, sig->stime),
486 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 525 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
487} 526}
488 527
@@ -501,7 +540,8 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
501 540
502static inline int expires_gt(cputime_t expires, cputime_t new_exp) 541static inline int expires_gt(cputime_t expires, cputime_t new_exp)
503{ 542{
504 return expires == 0 || expires > new_exp; 543 return cputime_eq(expires, cputime_zero) ||
544 cputime_gt(expires, new_exp);
505} 545}
506 546
507/* 547/*
@@ -611,7 +651,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
611 default: 651 default:
612 return -EINVAL; 652 return -EINVAL;
613 case CPUCLOCK_PROF: 653 case CPUCLOCK_PROF:
614 cpu->cpu = cputime.utime + cputime.stime; 654 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
615 break; 655 break;
616 case CPUCLOCK_VIRT: 656 case CPUCLOCK_VIRT:
617 cpu->cpu = cputime.utime; 657 cpu->cpu = cputime.utime;
@@ -878,12 +918,12 @@ static void check_thread_timers(struct task_struct *tsk,
878 unsigned long soft; 918 unsigned long soft;
879 919
880 maxfire = 20; 920 maxfire = 20;
881 tsk->cputime_expires.prof_exp = 0; 921 tsk->cputime_expires.prof_exp = cputime_zero;
882 while (!list_empty(timers)) { 922 while (!list_empty(timers)) {
883 struct cpu_timer_list *t = list_first_entry(timers, 923 struct cpu_timer_list *t = list_first_entry(timers,
884 struct cpu_timer_list, 924 struct cpu_timer_list,
885 entry); 925 entry);
886 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { 926 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
887 tsk->cputime_expires.prof_exp = t->expires.cpu; 927 tsk->cputime_expires.prof_exp = t->expires.cpu;
888 break; 928 break;
889 } 929 }
@@ -893,12 +933,12 @@ static void check_thread_timers(struct task_struct *tsk,
893 933
894 ++timers; 934 ++timers;
895 maxfire = 20; 935 maxfire = 20;
896 tsk->cputime_expires.virt_exp = 0; 936 tsk->cputime_expires.virt_exp = cputime_zero;
897 while (!list_empty(timers)) { 937 while (!list_empty(timers)) {
898 struct cpu_timer_list *t = list_first_entry(timers, 938 struct cpu_timer_list *t = list_first_entry(timers,
899 struct cpu_timer_list, 939 struct cpu_timer_list,
900 entry); 940 entry);
901 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { 941 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
902 tsk->cputime_expires.virt_exp = t->expires.cpu; 942 tsk->cputime_expires.virt_exp = t->expires.cpu;
903 break; 943 break;
904 } 944 }
@@ -959,9 +999,9 @@ static void stop_process_timers(struct signal_struct *sig)
959 struct thread_group_cputimer *cputimer = &sig->cputimer; 999 struct thread_group_cputimer *cputimer = &sig->cputimer;
960 unsigned long flags; 1000 unsigned long flags;
961 1001
962 raw_spin_lock_irqsave(&cputimer->lock, flags); 1002 spin_lock_irqsave(&cputimer->lock, flags);
963 cputimer->running = 0; 1003 cputimer->running = 0;
964 raw_spin_unlock_irqrestore(&cputimer->lock, flags); 1004 spin_unlock_irqrestore(&cputimer->lock, flags);
965} 1005}
966 1006
967static u32 onecputick; 1007static u32 onecputick;
@@ -969,19 +1009,20 @@ static u32 onecputick;
969static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 1009static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
970 cputime_t *expires, cputime_t cur_time, int signo) 1010 cputime_t *expires, cputime_t cur_time, int signo)
971{ 1011{
972 if (!it->expires) 1012 if (cputime_eq(it->expires, cputime_zero))
973 return; 1013 return;
974 1014
975 if (cur_time >= it->expires) { 1015 if (cputime_ge(cur_time, it->expires)) {
976 if (it->incr) { 1016 if (!cputime_eq(it->incr, cputime_zero)) {
977 it->expires += it->incr; 1017 it->expires = cputime_add(it->expires, it->incr);
978 it->error += it->incr_error; 1018 it->error += it->incr_error;
979 if (it->error >= onecputick) { 1019 if (it->error >= onecputick) {
980 it->expires -= cputime_one_jiffy; 1020 it->expires = cputime_sub(it->expires,
1021 cputime_one_jiffy);
981 it->error -= onecputick; 1022 it->error -= onecputick;
982 } 1023 }
983 } else { 1024 } else {
984 it->expires = 0; 1025 it->expires = cputime_zero;
985 } 1026 }
986 1027
987 trace_itimer_expire(signo == SIGPROF ? 1028 trace_itimer_expire(signo == SIGPROF ?
@@ -990,7 +1031,9 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
990 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); 1031 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
991 } 1032 }
992 1033
993 if (it->expires && (!*expires || it->expires < *expires)) { 1034 if (!cputime_eq(it->expires, cputime_zero) &&
1035 (cputime_eq(*expires, cputime_zero) ||
1036 cputime_lt(it->expires, *expires))) {
994 *expires = it->expires; 1037 *expires = it->expires;
995 } 1038 }
996} 1039}
@@ -1005,7 +1048,9 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1005 */ 1048 */
1006static inline int task_cputime_zero(const struct task_cputime *cputime) 1049static inline int task_cputime_zero(const struct task_cputime *cputime)
1007{ 1050{
1008 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) 1051 if (cputime_eq(cputime->utime, cputime_zero) &&
1052 cputime_eq(cputime->stime, cputime_zero) &&
1053 cputime->sum_exec_runtime == 0)
1009 return 1; 1054 return 1;
1010 return 0; 1055 return 0;
1011} 1056}
@@ -1031,15 +1076,15 @@ static void check_process_timers(struct task_struct *tsk,
1031 */ 1076 */
1032 thread_group_cputimer(tsk, &cputime); 1077 thread_group_cputimer(tsk, &cputime);
1033 utime = cputime.utime; 1078 utime = cputime.utime;
1034 ptime = utime + cputime.stime; 1079 ptime = cputime_add(utime, cputime.stime);
1035 sum_sched_runtime = cputime.sum_exec_runtime; 1080 sum_sched_runtime = cputime.sum_exec_runtime;
1036 maxfire = 20; 1081 maxfire = 20;
1037 prof_expires = 0; 1082 prof_expires = cputime_zero;
1038 while (!list_empty(timers)) { 1083 while (!list_empty(timers)) {
1039 struct cpu_timer_list *tl = list_first_entry(timers, 1084 struct cpu_timer_list *tl = list_first_entry(timers,
1040 struct cpu_timer_list, 1085 struct cpu_timer_list,
1041 entry); 1086 entry);
1042 if (!--maxfire || ptime < tl->expires.cpu) { 1087 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) {
1043 prof_expires = tl->expires.cpu; 1088 prof_expires = tl->expires.cpu;
1044 break; 1089 break;
1045 } 1090 }
@@ -1049,12 +1094,12 @@ static void check_process_timers(struct task_struct *tsk,
1049 1094
1050 ++timers; 1095 ++timers;
1051 maxfire = 20; 1096 maxfire = 20;
1052 virt_expires = 0; 1097 virt_expires = cputime_zero;
1053 while (!list_empty(timers)) { 1098 while (!list_empty(timers)) {
1054 struct cpu_timer_list *tl = list_first_entry(timers, 1099 struct cpu_timer_list *tl = list_first_entry(timers,
1055 struct cpu_timer_list, 1100 struct cpu_timer_list,
1056 entry); 1101 entry);
1057 if (!--maxfire || utime < tl->expires.cpu) { 1102 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) {
1058 virt_expires = tl->expires.cpu; 1103 virt_expires = tl->expires.cpu;
1059 break; 1104 break;
1060 } 1105 }
@@ -1109,7 +1154,8 @@ static void check_process_timers(struct task_struct *tsk,
1109 } 1154 }
1110 } 1155 }
1111 x = secs_to_cputime(soft); 1156 x = secs_to_cputime(soft);
1112 if (!prof_expires || x < prof_expires) { 1157 if (cputime_eq(prof_expires, cputime_zero) ||
1158 cputime_lt(x, prof_expires)) {
1113 prof_expires = x; 1159 prof_expires = x;
1114 } 1160 }
1115 } 1161 }
@@ -1203,9 +1249,12 @@ out:
1203static inline int task_cputime_expired(const struct task_cputime *sample, 1249static inline int task_cputime_expired(const struct task_cputime *sample,
1204 const struct task_cputime *expires) 1250 const struct task_cputime *expires)
1205{ 1251{
1206 if (expires->utime && sample->utime >= expires->utime) 1252 if (!cputime_eq(expires->utime, cputime_zero) &&
1253 cputime_ge(sample->utime, expires->utime))
1207 return 1; 1254 return 1;
1208 if (expires->stime && sample->utime + sample->stime >= expires->stime) 1255 if (!cputime_eq(expires->stime, cputime_zero) &&
1256 cputime_ge(cputime_add(sample->utime, sample->stime),
1257 expires->stime))
1209 return 1; 1258 return 1;
1210 if (expires->sum_exec_runtime != 0 && 1259 if (expires->sum_exec_runtime != 0 &&
1211 sample->sum_exec_runtime >= expires->sum_exec_runtime) 1260 sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1242,9 +1291,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1242 if (sig->cputimer.running) { 1291 if (sig->cputimer.running) {
1243 struct task_cputime group_sample; 1292 struct task_cputime group_sample;
1244 1293
1245 raw_spin_lock(&sig->cputimer.lock); 1294 spin_lock(&sig->cputimer.lock);
1246 group_sample = sig->cputimer.cputime; 1295 group_sample = sig->cputimer.cputime;
1247 raw_spin_unlock(&sig->cputimer.lock); 1296 spin_unlock(&sig->cputimer.lock);
1248 1297
1249 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1298 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1250 return 1; 1299 return 1;
@@ -1340,18 +1389,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1340 * it to be relative, *newval argument is relative and we update 1389 * it to be relative, *newval argument is relative and we update
1341 * it to be absolute. 1390 * it to be absolute.
1342 */ 1391 */
1343 if (*oldval) { 1392 if (!cputime_eq(*oldval, cputime_zero)) {
1344 if (*oldval <= now.cpu) { 1393 if (cputime_le(*oldval, now.cpu)) {
1345 /* Just about to fire. */ 1394 /* Just about to fire. */
1346 *oldval = cputime_one_jiffy; 1395 *oldval = cputime_one_jiffy;
1347 } else { 1396 } else {
1348 *oldval -= now.cpu; 1397 *oldval = cputime_sub(*oldval, now.cpu);
1349 } 1398 }
1350 } 1399 }
1351 1400
1352 if (!*newval) 1401 if (cputime_eq(*newval, cputime_zero))
1353 return; 1402 return;
1354 *newval += now.cpu; 1403 *newval = cputime_add(*newval, now.cpu);
1355 } 1404 }
1356 1405
1357 /* 1406 /*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae6b70..4556182527f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -46,7 +46,7 @@
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/export.h> 49#include <linux/module.h>
50 50
51/* 51/*
52 * Management arrays for POSIX timers. Timers are kept in slab memory 52 * Management arrays for POSIX timers. Timers are kept in slab memory
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5dfdc9ea180..fcf5a834c4e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,6 +18,73 @@ config SUSPEND_FREEZER
18 18
19 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
20 20
21config HAS_WAKELOCK
22 bool
23
24config HAS_EARLYSUSPEND
25 bool
26
27config WAKELOCK
28 bool "Wake lock"
29 depends on PM && RTC_CLASS
30 default n
31 select HAS_WAKELOCK
32 ---help---
33 Enable wakelocks. When user space requests a sleep state, the
34 sleep request will be delayed until no wake locks are held.
35
36config WAKELOCK_STAT
37 bool "Wake lock stats"
38 depends on WAKELOCK
39 default y
40 ---help---
41 Report wake lock stats in /proc/wakelocks
42
43config USER_WAKELOCK
44 bool "Userspace wake locks"
45 depends on WAKELOCK
46 default y
47 ---help---
48 User-space wake lock API. Write "lockname" or "lockname timeout"
49 to /sys/power/wake_lock to take and, if needed, create a wake lock.
50 Write "lockname" to /sys/power/wake_unlock to release a user wake
51 lock.
52
53config EARLYSUSPEND
54 bool "Early suspend"
55 depends on WAKELOCK
56 default y
57 select HAS_EARLYSUSPEND
58 ---help---
59 Call early suspend handlers when the user requested sleep state
60 changes.
61
62choice
63 prompt "User-space screen access"
64 default FB_EARLYSUSPEND if !FRAMEBUFFER_CONSOLE
65 default CONSOLE_EARLYSUSPEND
66 depends on HAS_EARLYSUSPEND
67
68 config NO_USER_SPACE_SCREEN_ACCESS_CONTROL
69 bool "None"
70
71 config CONSOLE_EARLYSUSPEND
72 bool "Console switch on early-suspend"
73 depends on HAS_EARLYSUSPEND && VT
74 ---help---
75 Register an early suspend handler to perform a console switch
76 when user-space should stop drawing to the screen, and a switch
77 back when it should resume.
78
79 config FB_EARLYSUSPEND
80 bool "Sysfs interface"
81 depends on HAS_EARLYSUSPEND
82 ---help---
83 Register early suspend handler that notifies and waits for
84 user-space through sysfs when user-space should stop drawing
85 to the screen and notifies user-space when it should resume.
86endchoice
87
21config HIBERNATE_CALLBACKS 88config HIBERNATE_CALLBACKS
22 bool 89 bool
23 90
@@ -27,7 +94,6 @@ config HIBERNATION
27 select HIBERNATE_CALLBACKS 94 select HIBERNATE_CALLBACKS
28 select LZO_COMPRESS 95 select LZO_COMPRESS
29 select LZO_DECOMPRESS 96 select LZO_DECOMPRESS
30 select CRC32
31 ---help--- 97 ---help---
32 Enable the suspend to disk (STD) functionality, which is usually 98 Enable the suspend to disk (STD) functionality, which is usually
33 called "hibernation" in user interfaces. STD checkpoints the 99 called "hibernation" in user interfaces. STD checkpoints the
@@ -66,9 +132,6 @@ config HIBERNATION
66 132
67 For more information take a look at <file:Documentation/power/swsusp.txt>. 133 For more information take a look at <file:Documentation/power/swsusp.txt>.
68 134
69config ARCH_SAVE_PAGE_KEYS
70 bool
71
72config PM_STD_PARTITION 135config PM_STD_PARTITION
73 string "Default resume partition" 136 string "Default resume partition"
74 depends on HIBERNATION 137 depends on HIBERNATION
@@ -103,33 +166,6 @@ config PM_SLEEP_SMP
103 select HOTPLUG 166 select HOTPLUG
104 select HOTPLUG_CPU 167 select HOTPLUG_CPU
105 168
106config PM_AUTOSLEEP
107 bool "Opportunistic sleep"
108 depends on PM_SLEEP
109 default n
110 ---help---
111 Allow the kernel to trigger a system transition into a global sleep
112 state automatically whenever there are no active wakeup sources.
113
114config PM_WAKELOCKS
115 bool "User space wakeup sources interface"
116 depends on PM_SLEEP
117 default n
118 ---help---
119 Allow user space to create, activate and deactivate wakeup source
120 objects with the help of a sysfs-based interface.
121
122config PM_WAKELOCKS_LIMIT
123 int "Maximum number of user space wakeup sources (0 = no limit)"
124 range 0 100000
125 default 100
126 depends on PM_WAKELOCKS
127
128config PM_WAKELOCKS_GC
129 bool "Garbage collector for user space wakeup sources"
130 depends on PM_WAKELOCKS
131 default y
132
133config PM_RUNTIME 169config PM_RUNTIME
134 bool "Run-time PM core functionality" 170 bool "Run-time PM core functionality"
135 depends on !IA64_HP_SIM 171 depends on !IA64_HP_SIM
@@ -175,7 +211,7 @@ config PM_TEST_SUSPEND
175 You probably want to have your system's RTC driver statically 211 You probably want to have your system's RTC driver statically
176 linked, ensuring that it's available when this test runs. 212 linked, ensuring that it's available when this test runs.
177 213
178config PM_SLEEP_DEBUG 214config CAN_PM_TRACE
179 def_bool y 215 def_bool y
180 depends on PM_DEBUG && PM_SLEEP 216 depends on PM_DEBUG && PM_SLEEP
181 217
@@ -196,7 +232,7 @@ config PM_TRACE
196 232
197config PM_TRACE_RTC 233config PM_TRACE_RTC
198 bool "Suspend/resume event tracing" 234 bool "Suspend/resume event tracing"
199 depends on PM_SLEEP_DEBUG 235 depends on CAN_PM_TRACE
200 depends on X86 236 depends on X86
201 select PM_TRACE 237 select PM_TRACE
202 ---help--- 238 ---help---
@@ -263,14 +299,13 @@ config PM_GENERIC_DOMAINS
263 bool 299 bool
264 depends on PM 300 depends on PM
265 301
266config PM_GENERIC_DOMAINS_SLEEP
267 def_bool y
268 depends on PM_SLEEP && PM_GENERIC_DOMAINS
269
270config PM_GENERIC_DOMAINS_RUNTIME 302config PM_GENERIC_DOMAINS_RUNTIME
271 def_bool y 303 def_bool y
272 depends on PM_RUNTIME && PM_GENERIC_DOMAINS 304 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
273 305
274config CPU_PM 306config SUSPEND_TIME
275 bool 307 bool "Log time spent in suspend"
276 depends on SUSPEND || CPU_IDLE 308 ---help---
309 Prints the time spent in suspend in the kernel log, and
310 keeps statistics on the time spent in suspend in
311 /sys/kernel/debug/suspend_time
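
The USER_WAKELOCK help text near the top of this hunk describes a purely sysfs-driven interface: writing "lockname" (optionally followed by a timeout) to /sys/power/wake_lock takes the lock, and writing the same name to /sys/power/wake_unlock drops it. A minimal user-space sketch, not part of the patch; the nanosecond timeout unit is an assumption (Android userwakelock convention) and should be checked against the userwakelock.c added further down:

/* Illustrative sketch only -- not part of the patch.
 * Holds a user-space wake lock named "mylock" for up to 30 s, then drops it.
 * The nanosecond timeout unit is an assumption (Android userwakelock convention).
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int sysfs_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	sysfs_write("/sys/power/wake_lock", "mylock 30000000000");	/* name + timeout in ns */
	sleep(5);				/* work that must not race with suspend */
	sysfs_write("/sys/power/wake_unlock", "mylock");		/* release early */
	return 0;
}
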
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 29472bff11e..9b224e16b19 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,15 +1,18 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-y += qos.o
5obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o
6obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o 5obj-$(CONFIG_PM_SLEEP) += console.o
7obj-$(CONFIG_FREEZER) += process.o 6obj-$(CONFIG_FREEZER) += process.o
8obj-$(CONFIG_SUSPEND) += suspend.o 7obj-$(CONFIG_SUSPEND) += suspend.o
9obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
10obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 9obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
11 block_io.o 10 block_io.o
12obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o 11obj-$(CONFIG_WAKELOCK) += wakelock.o
13obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o 12obj-$(CONFIG_USER_WAKELOCK) += userwakelock.o
13obj-$(CONFIG_EARLYSUSPEND) += earlysuspend.o
14obj-$(CONFIG_CONSOLE_EARLYSUSPEND) += consoleearlysuspend.o
15obj-$(CONFIG_FB_EARLYSUSPEND) += fbearlysuspend.o
16obj-$(CONFIG_SUSPEND_TIME) += suspend_time.o
14 17
15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 18obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
deleted file mode 100644
index ca304046d9e..00000000000
--- a/kernel/power/autosleep.c
+++ /dev/null
@@ -1,127 +0,0 @@
1/*
2 * kernel/power/autosleep.c
3 *
4 * Opportunistic sleep support.
5 *
6 * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
7 */
8
9#include <linux/device.h>
10#include <linux/mutex.h>
11#include <linux/pm_wakeup.h>
12
13#include "power.h"
14
15static suspend_state_t autosleep_state;
16static struct workqueue_struct *autosleep_wq;
17/*
18 * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source
19 * is active, otherwise a deadlock with try_to_suspend() is possible.
20 * Alternatively mutex_lock_interruptible() can be used. This will then fail
21 * if an auto_sleep cycle tries to freeze processes.
22 */
23static DEFINE_MUTEX(autosleep_lock);
24static struct wakeup_source *autosleep_ws;
25
26static void try_to_suspend(struct work_struct *work)
27{
28 unsigned int initial_count, final_count;
29
30 if (!pm_get_wakeup_count(&initial_count, true))
31 goto out;
32
33 mutex_lock(&autosleep_lock);
34
35 if (!pm_save_wakeup_count(initial_count)) {
36 mutex_unlock(&autosleep_lock);
37 goto out;
38 }
39
40 if (autosleep_state == PM_SUSPEND_ON) {
41 mutex_unlock(&autosleep_lock);
42 return;
43 }
44 if (autosleep_state >= PM_SUSPEND_MAX)
45 hibernate();
46 else
47 pm_suspend(autosleep_state);
48
49 mutex_unlock(&autosleep_lock);
50
51 if (!pm_get_wakeup_count(&final_count, false))
52 goto out;
53
54 /*
55 * If the wakeup occurred for an unknown reason, wait to prevent the
56 * system from trying to suspend and waking up in a tight loop.
57 */
58 if (final_count == initial_count)
59 schedule_timeout_uninterruptible(HZ / 2);
60
61 out:
62 queue_up_suspend_work();
63}
64
65static DECLARE_WORK(suspend_work, try_to_suspend);
66
67void queue_up_suspend_work(void)
68{
69 if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON)
70 queue_work(autosleep_wq, &suspend_work);
71}
72
73suspend_state_t pm_autosleep_state(void)
74{
75 return autosleep_state;
76}
77
78int pm_autosleep_lock(void)
79{
80 return mutex_lock_interruptible(&autosleep_lock);
81}
82
83void pm_autosleep_unlock(void)
84{
85 mutex_unlock(&autosleep_lock);
86}
87
88int pm_autosleep_set_state(suspend_state_t state)
89{
90
91#ifndef CONFIG_HIBERNATION
92 if (state >= PM_SUSPEND_MAX)
93 return -EINVAL;
94#endif
95
96 __pm_stay_awake(autosleep_ws);
97
98 mutex_lock(&autosleep_lock);
99
100 autosleep_state = state;
101
102 __pm_relax(autosleep_ws);
103
104 if (state > PM_SUSPEND_ON) {
105 pm_wakep_autosleep_enabled(true);
106 queue_up_suspend_work();
107 } else {
108 pm_wakep_autosleep_enabled(false);
109 }
110
111 mutex_unlock(&autosleep_lock);
112 return 0;
113}
114
115int __init pm_autosleep_init(void)
116{
117 autosleep_ws = wakeup_source_register("autosleep");
118 if (!autosleep_ws)
119 return -ENOMEM;
120
121 autosleep_wq = alloc_ordered_workqueue("autosleep", 0);
122 if (autosleep_wq)
123 return 0;
124
125 wakeup_source_unregister(autosleep_ws);
126 return -ENOMEM;
127}
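
For context, the autosleep.c removed above implemented opportunistic sleep behind a single sysfs file: user space wrote a sleep state to /sys/power/autosleep (handled by the autosleep_store() also deleted from kernel/power/main.c further down), and try_to_suspend() then re-queued itself whenever no wakeup source was active. A rough sketch of how that interface was driven, assuming a kernel still built with CONFIG_PM_AUTOSLEEP:

/* Illustrative sketch only: drive the (removed) autosleep interface.
 * "mem" or "disk" arms opportunistic suspend/hibernate, "off" disarms it.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void autosleep_set(const char *state)
{
	int fd = open("/sys/power/autosleep", O_WRONLY);

	if (fd < 0) {
		perror("/sys/power/autosleep");
		return;
	}
	if (write(fd, state, strlen(state)) < 0)
		perror("/sys/power/autosleep");
	close(fd);
}

int main(void)
{
	autosleep_set("mem");	/* suspend whenever no wakeup source is active */
	/* ... */
	autosleep_set("off");	/* back to explicit suspend only */
	return 0;
}
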
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b1dc456474b..218e5af9015 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Functions for saving/restoring console. 2 * kernel/power/console.c - Functions for saving/restoring console.
3 * 3 *
4 * Originally from swsusp. 4 * Originally from swsusp.
5 */ 5 */
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
13#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
13#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
14 15
15static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
@@ -31,3 +32,4 @@ void pm_restore_console(void)
31 vt_kmsg_redirect(orig_kmsg); 32 vt_kmsg_redirect(orig_kmsg);
32 } 33 }
33} 34}
35#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b26f5f1e773..8f7b1db1ece 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -5,18 +5,16 @@
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
9 * 8 *
10 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
11 */ 10 */
12 11
13#include <linux/export.h>
14#include <linux/suspend.h> 12#include <linux/suspend.h>
15#include <linux/syscalls.h> 13#include <linux/syscalls.h>
16#include <linux/reboot.h> 14#include <linux/reboot.h>
17#include <linux/string.h> 15#include <linux/string.h>
18#include <linux/device.h> 16#include <linux/device.h>
19#include <linux/async.h> 17#include <linux/kmod.h>
20#include <linux/delay.h> 18#include <linux/delay.h>
21#include <linux/fs.h> 19#include <linux/fs.h>
22#include <linux/mount.h> 20#include <linux/mount.h>
@@ -26,29 +24,25 @@
26#include <linux/freezer.h> 24#include <linux/freezer.h>
27#include <linux/gfp.h> 25#include <linux/gfp.h>
28#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
29#include <linux/ctype.h> 27#include <scsi/scsi_scan.h>
30#include <linux/genhd.h>
31 28
32#include "power.h" 29#include "power.h"
33 30
34 31
35static int nocompress; 32static int nocompress = 0;
36static int noresume; 33static int noresume = 0;
37static int resume_wait;
38static int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 34static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 35dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 36sector_t swsusp_resume_block;
42int in_suspend __nosavedata; 37int in_suspend __nosavedata = 0;
43 38
44enum { 39enum {
45 HIBERNATION_INVALID, 40 HIBERNATION_INVALID,
46 HIBERNATION_PLATFORM, 41 HIBERNATION_PLATFORM,
42 HIBERNATION_TEST,
43 HIBERNATION_TESTPROC,
47 HIBERNATION_SHUTDOWN, 44 HIBERNATION_SHUTDOWN,
48 HIBERNATION_REBOOT, 45 HIBERNATION_REBOOT,
49#ifdef CONFIG_SUSPEND
50 HIBERNATION_SUSPEND,
51#endif
52 /* keep last */ 46 /* keep last */
53 __HIBERNATION_AFTER_LAST 47 __HIBERNATION_AFTER_LAST
54}; 48};
@@ -57,8 +51,6 @@ enum {
57 51
58static int hibernation_mode = HIBERNATION_SHUTDOWN; 52static int hibernation_mode = HIBERNATION_SHUTDOWN;
59 53
60bool freezer_test_done;
61
62static const struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
63 55
64/** 56/**
@@ -73,14 +65,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
73 WARN_ON(1); 65 WARN_ON(1);
74 return; 66 return;
75 } 67 }
76 lock_system_sleep(); 68 mutex_lock(&pm_mutex);
77 hibernation_ops = ops; 69 hibernation_ops = ops;
78 if (ops) 70 if (ops)
79 hibernation_mode = HIBERNATION_PLATFORM; 71 hibernation_mode = HIBERNATION_PLATFORM;
80 else if (hibernation_mode == HIBERNATION_PLATFORM) 72 else if (hibernation_mode == HIBERNATION_PLATFORM)
81 hibernation_mode = HIBERNATION_SHUTDOWN; 73 hibernation_mode = HIBERNATION_SHUTDOWN;
82 74
83 unlock_system_sleep(); 75 mutex_unlock(&pm_mutex);
84} 76}
85 77
86static bool entering_platform_hibernation; 78static bool entering_platform_hibernation;
@@ -98,6 +90,15 @@ static void hibernation_debug_sleep(void)
98 mdelay(5000); 90 mdelay(5000);
99} 91}
100 92
93static int hibernation_testmode(int mode)
94{
95 if (hibernation_mode == mode) {
96 hibernation_debug_sleep();
97 return 1;
98 }
99 return 0;
100}
101
101static int hibernation_test(int level) 102static int hibernation_test(int level)
102{ 103{
103 if (pm_test_level == level) { 104 if (pm_test_level == level) {
@@ -107,6 +108,7 @@ static int hibernation_test(int level)
107 return 0; 108 return 0;
108} 109}
109#else /* !CONFIG_PM_DEBUG */ 110#else /* !CONFIG_PM_DEBUG */
111static int hibernation_testmode(int mode) { return 0; }
110static int hibernation_test(int level) { return 0; } 112static int hibernation_test(int level) { return 0; }
111#endif /* !CONFIG_PM_DEBUG */ 113#endif /* !CONFIG_PM_DEBUG */
112 114
@@ -249,8 +251,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
249 * create_image - Create a hibernation image. 251 * create_image - Create a hibernation image.
250 * @platform_mode: Whether or not to use the platform driver. 252 * @platform_mode: Whether or not to use the platform driver.
251 * 253 *
252 * Execute device drivers' "late" and "noirq" freeze callbacks, create a 254 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
253 * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. 255 * and execute the drivers' .thaw_noirq() callbacks.
254 * 256 *
255 * Control reappears in this routine after the subsequent restore. 257 * Control reappears in this routine after the subsequent restore.
256 */ 258 */
@@ -258,7 +260,7 @@ static int create_image(int platform_mode)
258{ 260{
259 int error; 261 int error;
260 262
261 error = dpm_suspend_end(PMSG_FREEZE); 263 error = dpm_suspend_noirq(PMSG_FREEZE);
262 if (error) { 264 if (error) {
263 printk(KERN_ERR "PM: Some devices failed to power down, " 265 printk(KERN_ERR "PM: Some devices failed to power down, "
264 "aborting hibernation\n"); 266 "aborting hibernation\n");
@@ -270,7 +272,8 @@ static int create_image(int platform_mode)
270 goto Platform_finish; 272 goto Platform_finish;
271 273
272 error = disable_nonboot_cpus(); 274 error = disable_nonboot_cpus();
273 if (error || hibernation_test(TEST_CPUS)) 275 if (error || hibernation_test(TEST_CPUS)
276 || hibernation_testmode(HIBERNATION_TEST))
274 goto Enable_cpus; 277 goto Enable_cpus;
275 278
276 local_irq_disable(); 279 local_irq_disable();
@@ -310,7 +313,7 @@ static int create_image(int platform_mode)
310 Platform_finish: 313 Platform_finish:
311 platform_finish(platform_mode); 314 platform_finish(platform_mode);
312 315
313 dpm_resume_start(in_suspend ? 316 dpm_resume_noirq(in_suspend ?
314 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 317 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
315 318
316 return error; 319 return error;
@@ -324,55 +327,38 @@ static int create_image(int platform_mode)
324 */ 327 */
325int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
326{ 329{
327 pm_message_t msg; 330 pm_message_t msg = PMSG_RECOVER;
328 int error; 331 int error;
329 332
330 error = platform_begin(platform_mode); 333 error = platform_begin(platform_mode);
331 if (error) 334 if (error)
332 goto Close; 335 goto Close;
333 336
334 /* Preallocate image memory before shutting down devices. */ 337 error = dpm_prepare(PMSG_FREEZE);
335 error = hibernate_preallocate_memory();
336 if (error) 338 if (error)
337 goto Close; 339 goto Complete_devices;
338 340
339 error = freeze_kernel_threads(); 341 /* Preallocate image memory before shutting down devices. */
342 error = hibernate_preallocate_memory();
340 if (error) 343 if (error)
341 goto Cleanup; 344 goto Complete_devices;
342
343 if (hibernation_test(TEST_FREEZER)) {
344
345 /*
346 * Indicate to the caller that we are returning due to a
347 * successful freezer test.
348 */
349 freezer_test_done = true;
350 goto Thaw;
351 }
352
353 error = dpm_prepare(PMSG_FREEZE);
354 if (error) {
355 dpm_complete(PMSG_RECOVER);
356 goto Thaw;
357 }
358 345
359 suspend_console(); 346 suspend_console();
360 ftrace_stop();
361 pm_restrict_gfp_mask(); 347 pm_restrict_gfp_mask();
362
363 error = dpm_suspend(PMSG_FREEZE); 348 error = dpm_suspend(PMSG_FREEZE);
349 if (error)
350 goto Recover_platform;
364 351
365 if (error || hibernation_test(TEST_DEVICES)) 352 if (hibernation_test(TEST_DEVICES))
366 platform_recover(platform_mode); 353 goto Recover_platform;
367 else
368 error = create_image(platform_mode);
369 354
355 error = create_image(platform_mode);
370 /* 356 /*
371 * In the case that we call create_image() above, the control 357 * Control returns here (1) after the image has been created or the
372 * returns here (1) after the image has been created or the
373 * image creation has failed and (2) after a successful restore. 358 * image creation has failed and (2) after a successful restore.
374 */ 359 */
375 360
361 Resume_devices:
376 /* We may need to release the preallocated image pages here. */ 362 /* We may need to release the preallocated image pages here. */
377 if (error || !in_suspend) 363 if (error || !in_suspend)
378 swsusp_free(); 364 swsusp_free();
@@ -383,35 +369,34 @@ int hibernation_snapshot(int platform_mode)
383 if (error || !in_suspend) 369 if (error || !in_suspend)
384 pm_restore_gfp_mask(); 370 pm_restore_gfp_mask();
385 371
386 ftrace_start();
387 resume_console(); 372 resume_console();
373
374 Complete_devices:
388 dpm_complete(msg); 375 dpm_complete(msg);
389 376
390 Close: 377 Close:
391 platform_end(platform_mode); 378 platform_end(platform_mode);
392 return error; 379 return error;
393 380
394 Thaw: 381 Recover_platform:
395 thaw_kernel_threads(); 382 platform_recover(platform_mode);
396 Cleanup: 383 goto Resume_devices;
397 swsusp_free();
398 goto Close;
399} 384}
400 385
401/** 386/**
402 * resume_target_kernel - Restore system state from a hibernation image. 387 * resume_target_kernel - Restore system state from a hibernation image.
403 * @platform_mode: Whether or not to use the platform driver. 388 * @platform_mode: Whether or not to use the platform driver.
404 * 389 *
405 * Execute device drivers' "noirq" and "late" freeze callbacks, restore the 390 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
406 * contents of highmem that have not been restored yet from the image and run 391 * highmem that have not been restored yet from the image and run the low-level
407 * the low-level code that will restore the remaining contents of memory and 392 * code that will restore the remaining contents of memory and switch to the
408 * switch to the just restored target kernel. 393 * just restored target kernel.
409 */ 394 */
410static int resume_target_kernel(bool platform_mode) 395static int resume_target_kernel(bool platform_mode)
411{ 396{
412 int error; 397 int error;
413 398
414 error = dpm_suspend_end(PMSG_QUIESCE); 399 error = dpm_suspend_noirq(PMSG_QUIESCE);
415 if (error) { 400 if (error) {
416 printk(KERN_ERR "PM: Some devices failed to power down, " 401 printk(KERN_ERR "PM: Some devices failed to power down, "
417 "aborting resume\n"); 402 "aborting resume\n");
@@ -468,7 +453,7 @@ static int resume_target_kernel(bool platform_mode)
468 Cleanup: 453 Cleanup:
469 platform_restore_cleanup(platform_mode); 454 platform_restore_cleanup(platform_mode);
470 455
471 dpm_resume_start(PMSG_RECOVER); 456 dpm_resume_noirq(PMSG_RECOVER);
472 457
473 return error; 458 return error;
474} 459}
@@ -478,7 +463,7 @@ static int resume_target_kernel(bool platform_mode)
478 * @platform_mode: If set, use platform driver to prepare for the transition. 463 * @platform_mode: If set, use platform driver to prepare for the transition.
479 * 464 *
480 * This routine must be called with pm_mutex held. If it is successful, control 465 * This routine must be called with pm_mutex held. If it is successful, control
481 * reappears in the restored target kernel in hibernation_snapshot(). 466 * reappears in the restored target kernel in hibernation_snapshot().
482 */ 467 */
483int hibernation_restore(int platform_mode) 468int hibernation_restore(int platform_mode)
484{ 469{
@@ -486,7 +471,6 @@ int hibernation_restore(int platform_mode)
486 471
487 pm_prepare_console(); 472 pm_prepare_console();
488 suspend_console(); 473 suspend_console();
489 ftrace_stop();
490 pm_restrict_gfp_mask(); 474 pm_restrict_gfp_mask();
491 error = dpm_suspend_start(PMSG_QUIESCE); 475 error = dpm_suspend_start(PMSG_QUIESCE);
492 if (!error) { 476 if (!error) {
@@ -494,7 +478,6 @@ int hibernation_restore(int platform_mode)
494 dpm_resume_end(PMSG_RECOVER); 478 dpm_resume_end(PMSG_RECOVER);
495 } 479 }
496 pm_restore_gfp_mask(); 480 pm_restore_gfp_mask();
497 ftrace_start();
498 resume_console(); 481 resume_console();
499 pm_restore_console(); 482 pm_restore_console();
500 return error; 483 return error;
@@ -521,7 +504,6 @@ int hibernation_platform_enter(void)
521 504
522 entering_platform_hibernation = true; 505 entering_platform_hibernation = true;
523 suspend_console(); 506 suspend_console();
524 ftrace_stop();
525 error = dpm_suspend_start(PMSG_HIBERNATE); 507 error = dpm_suspend_start(PMSG_HIBERNATE);
526 if (error) { 508 if (error) {
527 if (hibernation_ops->recover) 509 if (hibernation_ops->recover)
@@ -529,7 +511,7 @@ int hibernation_platform_enter(void)
529 goto Resume_devices; 511 goto Resume_devices;
530 } 512 }
531 513
532 error = dpm_suspend_end(PMSG_HIBERNATE); 514 error = dpm_suspend_noirq(PMSG_HIBERNATE);
533 if (error) 515 if (error)
534 goto Resume_devices; 516 goto Resume_devices;
535 517
@@ -560,12 +542,11 @@ int hibernation_platform_enter(void)
560 Platform_finish: 542 Platform_finish:
561 hibernation_ops->finish(); 543 hibernation_ops->finish();
562 544
563 dpm_resume_start(PMSG_RESTORE); 545 dpm_resume_noirq(PMSG_RESTORE);
564 546
565 Resume_devices: 547 Resume_devices:
566 entering_platform_hibernation = false; 548 entering_platform_hibernation = false;
567 dpm_resume_end(PMSG_RESTORE); 549 dpm_resume_end(PMSG_RESTORE);
568 ftrace_start();
569 resume_console(); 550 resume_console();
570 551
571 Close: 552 Close:
@@ -583,11 +564,10 @@ int hibernation_platform_enter(void)
583 */ 564 */
584static void power_down(void) 565static void power_down(void)
585{ 566{
586#ifdef CONFIG_SUSPEND
587 int error;
588#endif
589
590 switch (hibernation_mode) { 567 switch (hibernation_mode) {
568 case HIBERNATION_TEST:
569 case HIBERNATION_TESTPROC:
570 break;
591 case HIBERNATION_REBOOT: 571 case HIBERNATION_REBOOT:
592 kernel_restart(NULL); 572 kernel_restart(NULL);
593 break; 573 break;
@@ -596,25 +576,6 @@ static void power_down(void)
596 case HIBERNATION_SHUTDOWN: 576 case HIBERNATION_SHUTDOWN:
597 kernel_power_off(); 577 kernel_power_off();
598 break; 578 break;
599#ifdef CONFIG_SUSPEND
600 case HIBERNATION_SUSPEND:
601 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
602 if (error) {
603 if (hibernation_ops)
604 hibernation_mode = HIBERNATION_PLATFORM;
605 else
606 hibernation_mode = HIBERNATION_SHUTDOWN;
607 power_down();
608 }
609 /*
610 * Restore swap signature.
611 */
612 error = swsusp_unmark();
613 if (error)
614 printk(KERN_ERR "PM: Swap will be unusable! "
615 "Try swapon -a.\n");
616 return;
617#endif
618 } 579 }
619 kernel_halt(); 580 kernel_halt();
620 /* 581 /*
@@ -625,6 +586,17 @@ static void power_down(void)
625 while(1); 586 while(1);
626} 587}
627 588
589static int prepare_processes(void)
590{
591 int error = 0;
592
593 if (freeze_processes()) {
594 error = -EBUSY;
595 thaw_processes();
596 }
597 return error;
598}
599
628/** 600/**
629 * hibernate - Carry out system hibernation, including saving the image. 601 * hibernate - Carry out system hibernation, including saving the image.
630 */ 602 */
@@ -632,7 +604,7 @@ int hibernate(void)
632{ 604{
633 int error; 605 int error;
634 606
635 lock_system_sleep(); 607 mutex_lock(&pm_mutex);
636 /* The snapshot device should not be opened while we're running */ 608 /* The snapshot device should not be opened while we're running */
637 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 609 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
638 error = -EBUSY; 610 error = -EBUSY;
@@ -644,6 +616,10 @@ int hibernate(void)
644 if (error) 616 if (error)
645 goto Exit; 617 goto Exit;
646 618
619 error = usermodehelper_disable();
620 if (error)
621 goto Exit;
622
647 /* Allocate memory management structures */ 623 /* Allocate memory management structures */
648 error = create_basic_memory_bitmaps(); 624 error = create_basic_memory_bitmaps();
649 if (error) 625 if (error)
@@ -653,12 +629,18 @@ int hibernate(void)
653 sys_sync(); 629 sys_sync();
654 printk("done.\n"); 630 printk("done.\n");
655 631
656 error = freeze_processes(); 632 error = prepare_processes();
657 if (error) 633 if (error)
658 goto Free_bitmaps; 634 goto Finish;
635
636 if (hibernation_test(TEST_FREEZER))
637 goto Thaw;
638
639 if (hibernation_testmode(HIBERNATION_TESTPROC))
640 goto Thaw;
659 641
660 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 642 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
661 if (error || freezer_test_done) 643 if (error)
662 goto Thaw; 644 goto Thaw;
663 645
664 if (in_suspend) { 646 if (in_suspend) {
@@ -668,9 +650,6 @@ int hibernate(void)
668 flags |= SF_PLATFORM_MODE; 650 flags |= SF_PLATFORM_MODE;
669 if (nocompress) 651 if (nocompress)
670 flags |= SF_NOCOMPRESS_MODE; 652 flags |= SF_NOCOMPRESS_MODE;
671 else
672 flags |= SF_CRC32_MODE;
673
674 pr_debug("PM: writing image.\n"); 653 pr_debug("PM: writing image.\n");
675 error = swsusp_write(flags); 654 error = swsusp_write(flags);
676 swsusp_free(); 655 swsusp_free();
@@ -684,18 +663,15 @@ int hibernate(void)
684 663
685 Thaw: 664 Thaw:
686 thaw_processes(); 665 thaw_processes();
687 666 Finish:
688 /* Don't bother checking whether freezer_test_done is true */
689 freezer_test_done = false;
690
691 Free_bitmaps:
692 free_basic_memory_bitmaps(); 667 free_basic_memory_bitmaps();
668 usermodehelper_enable();
693 Exit: 669 Exit:
694 pm_notifier_call_chain(PM_POST_HIBERNATION); 670 pm_notifier_call_chain(PM_POST_HIBERNATION);
695 pm_restore_console(); 671 pm_restore_console();
696 atomic_inc(&snapshot_device_available); 672 atomic_inc(&snapshot_device_available);
697 Unlock: 673 Unlock:
698 unlock_system_sleep(); 674 mutex_unlock(&pm_mutex);
699 return error; 675 return error;
700} 676}
701 677
@@ -748,37 +724,20 @@ static int software_resume(void)
748 724
749 pr_debug("PM: Checking hibernation image partition %s\n", resume_file); 725 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
750 726
751 if (resume_delay) {
752 printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
753 resume_delay);
754 ssleep(resume_delay);
755 }
756
757 /* Check if the device is there */ 727 /* Check if the device is there */
758 swsusp_resume_device = name_to_dev_t(resume_file); 728 swsusp_resume_device = name_to_dev_t(resume_file);
759
760 /*
761 * name_to_dev_t is ineffective to verify partition if resume_file is in
762 * integer format. (e.g. major:minor)
763 */
764 if (isdigit(resume_file[0]) && resume_wait) {
765 int partno;
766 while (!get_gendisk(swsusp_resume_device, &partno))
767 msleep(10);
768 }
769
770 if (!swsusp_resume_device) { 729 if (!swsusp_resume_device) {
771 /* 730 /*
772 * Some device discovery might still be in progress; we need 731 * Some device discovery might still be in progress; we need
773 * to wait for this to finish. 732 * to wait for this to finish.
774 */ 733 */
775 wait_for_device_probe(); 734 wait_for_device_probe();
776 735 /*
777 if (resume_wait) { 736 * We can't depend on SCSI devices being available after loading
778 while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) 737 * one of their modules until scsi_complete_async_scans() is
779 msleep(10); 738 * called and the resume device usually is a SCSI one.
780 async_synchronize_full(); 739 */
781 } 740 scsi_complete_async_scans();
782 741
783 swsusp_resume_device = name_to_dev_t(resume_file); 742 swsusp_resume_device = name_to_dev_t(resume_file);
784 if (!swsusp_resume_device) { 743 if (!swsusp_resume_device) {
@@ -808,12 +767,16 @@ static int software_resume(void)
808 if (error) 767 if (error)
809 goto close_finish; 768 goto close_finish;
810 769
770 error = usermodehelper_disable();
771 if (error)
772 goto close_finish;
773
811 error = create_basic_memory_bitmaps(); 774 error = create_basic_memory_bitmaps();
812 if (error) 775 if (error)
813 goto close_finish; 776 goto close_finish;
814 777
815 pr_debug("PM: Preparing processes for restore.\n"); 778 pr_debug("PM: Preparing processes for restore.\n");
816 error = freeze_processes(); 779 error = prepare_processes();
817 if (error) { 780 if (error) {
818 swsusp_close(FMODE_READ); 781 swsusp_close(FMODE_READ);
819 goto Done; 782 goto Done;
@@ -831,6 +794,7 @@ static int software_resume(void)
831 thaw_processes(); 794 thaw_processes();
832 Done: 795 Done:
833 free_basic_memory_bitmaps(); 796 free_basic_memory_bitmaps();
797 usermodehelper_enable();
834 Finish: 798 Finish:
835 pm_notifier_call_chain(PM_POST_RESTORE); 799 pm_notifier_call_chain(PM_POST_RESTORE);
836 pm_restore_console(); 800 pm_restore_console();
@@ -852,9 +816,8 @@ static const char * const hibernation_modes[] = {
852 [HIBERNATION_PLATFORM] = "platform", 816 [HIBERNATION_PLATFORM] = "platform",
853 [HIBERNATION_SHUTDOWN] = "shutdown", 817 [HIBERNATION_SHUTDOWN] = "shutdown",
854 [HIBERNATION_REBOOT] = "reboot", 818 [HIBERNATION_REBOOT] = "reboot",
855#ifdef CONFIG_SUSPEND 819 [HIBERNATION_TEST] = "test",
856 [HIBERNATION_SUSPEND] = "suspend", 820 [HIBERNATION_TESTPROC] = "testproc",
857#endif
858}; 821};
859 822
860/* 823/*
@@ -863,15 +826,17 @@ static const char * const hibernation_modes[] = {
863 * Hibernation can be handled in several ways. There are a few different ways 826 * Hibernation can be handled in several ways. There are a few different ways
864 * to put the system into the sleep state: using the platform driver (e.g. ACPI 827 * to put the system into the sleep state: using the platform driver (e.g. ACPI
865 * or other hibernation_ops), powering it off or rebooting it (for testing 828 * or other hibernation_ops), powering it off or rebooting it (for testing
866 * mostly). 829 * mostly), or using one of the two available test modes.
867 * 830 *
868 * The sysfs file /sys/power/disk provides an interface for selecting the 831 * The sysfs file /sys/power/disk provides an interface for selecting the
869 * hibernation mode to use. Reading from this file causes the available modes 832 * hibernation mode to use. Reading from this file causes the available modes
870 * to be printed. There are 3 modes that can be supported: 833 * to be printed. There are 5 modes that can be supported:
871 * 834 *
872 * 'platform' 835 * 'platform'
873 * 'shutdown' 836 * 'shutdown'
874 * 'reboot' 837 * 'reboot'
838 * 'test'
839 * 'testproc'
875 * 840 *
876 * If a platform hibernation driver is in use, 'platform' will be supported 841 * If a platform hibernation driver is in use, 'platform' will be supported
877 * and will be used by default. Otherwise, 'shutdown' will be used by default. 842 * and will be used by default. Otherwise, 'shutdown' will be used by default.
@@ -895,9 +860,8 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
895 switch (i) { 860 switch (i) {
896 case HIBERNATION_SHUTDOWN: 861 case HIBERNATION_SHUTDOWN:
897 case HIBERNATION_REBOOT: 862 case HIBERNATION_REBOOT:
898#ifdef CONFIG_SUSPEND 863 case HIBERNATION_TEST:
899 case HIBERNATION_SUSPEND: 864 case HIBERNATION_TESTPROC:
900#endif
901 break; 865 break;
902 case HIBERNATION_PLATFORM: 866 case HIBERNATION_PLATFORM:
903 if (hibernation_ops) 867 if (hibernation_ops)
@@ -926,7 +890,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
926 p = memchr(buf, '\n', n); 890 p = memchr(buf, '\n', n);
927 len = p ? p - buf : n; 891 len = p ? p - buf : n;
928 892
929 lock_system_sleep(); 893 mutex_lock(&pm_mutex);
930 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 894 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
931 if (len == strlen(hibernation_modes[i]) 895 if (len == strlen(hibernation_modes[i])
932 && !strncmp(buf, hibernation_modes[i], len)) { 896 && !strncmp(buf, hibernation_modes[i], len)) {
@@ -938,9 +902,8 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
938 switch (mode) { 902 switch (mode) {
939 case HIBERNATION_SHUTDOWN: 903 case HIBERNATION_SHUTDOWN:
940 case HIBERNATION_REBOOT: 904 case HIBERNATION_REBOOT:
941#ifdef CONFIG_SUSPEND 905 case HIBERNATION_TEST:
942 case HIBERNATION_SUSPEND: 906 case HIBERNATION_TESTPROC:
943#endif
944 hibernation_mode = mode; 907 hibernation_mode = mode;
945 break; 908 break;
946 case HIBERNATION_PLATFORM: 909 case HIBERNATION_PLATFORM:
@@ -955,7 +918,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
955 if (!error) 918 if (!error)
956 pr_debug("PM: Hibernation mode set to '%s'\n", 919 pr_debug("PM: Hibernation mode set to '%s'\n",
957 hibernation_modes[mode]); 920 hibernation_modes[mode]);
958 unlock_system_sleep(); 921 mutex_unlock(&pm_mutex);
959 return error ? error : n; 922 return error ? error : n;
960} 923}
961 924
@@ -982,9 +945,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
982 if (maj != MAJOR(res) || min != MINOR(res)) 945 if (maj != MAJOR(res) || min != MINOR(res))
983 goto out; 946 goto out;
984 947
985 lock_system_sleep(); 948 mutex_lock(&pm_mutex);
986 swsusp_resume_device = res; 949 swsusp_resume_device = res;
987 unlock_system_sleep(); 950 mutex_unlock(&pm_mutex);
988 printk(KERN_INFO "PM: Starting manual resume from disk\n"); 951 printk(KERN_INFO "PM: Starting manual resume from disk\n");
989 noresume = 0; 952 noresume = 0;
990 software_resume(); 953 software_resume();
@@ -1097,21 +1060,7 @@ static int __init noresume_setup(char *str)
1097 return 1; 1060 return 1;
1098} 1061}
1099 1062
1100static int __init resumewait_setup(char *str)
1101{
1102 resume_wait = 1;
1103 return 1;
1104}
1105
1106static int __init resumedelay_setup(char *str)
1107{
1108 resume_delay = simple_strtoul(str, NULL, 0);
1109 return 1;
1110}
1111
1112__setup("noresume", noresume_setup); 1063__setup("noresume", noresume_setup);
1113__setup("resume_offset=", resume_offset_setup); 1064__setup("resume_offset=", resume_offset_setup);
1114__setup("resume=", resume_setup); 1065__setup("resume=", resume_setup);
1115__setup("hibernate=", hibernate_setup); 1066__setup("hibernate=", hibernate_setup);
1116__setup("resumewait", resumewait_setup);
1117__setup("resumedelay=", resumedelay_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c16f9167de..3304594553c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -3,18 +3,15 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * 6 *
7 * This file is released under the GPLv2 7 * This file is released under the GPLv2
8 * 8 *
9 */ 9 */
10 10
11#include <linux/export.h>
12#include <linux/kobject.h> 11#include <linux/kobject.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
15#include <linux/workqueue.h> 14#include <linux/workqueue.h>
16#include <linux/debugfs.h>
17#include <linux/seq_file.h>
18 15
19#include "power.h" 16#include "power.h"
20 17
@@ -59,7 +56,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
59{ 56{
60 unsigned long val; 57 unsigned long val;
61 58
62 if (kstrtoul(buf, 10, &val)) 59 if (strict_strtoul(buf, 10, &val))
63 return -EINVAL; 60 return -EINVAL;
64 61
65 if (val > 1) 62 if (val > 1)
@@ -116,7 +113,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
116 p = memchr(buf, '\n', n); 113 p = memchr(buf, '\n', n);
117 len = p ? p - buf : n; 114 len = p ? p - buf : n;
118 115
119 lock_system_sleep(); 116 mutex_lock(&pm_mutex);
120 117
121 level = TEST_FIRST; 118 level = TEST_FIRST;
122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) 119 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -126,7 +123,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
126 break; 123 break;
127 } 124 }
128 125
129 unlock_system_sleep(); 126 mutex_unlock(&pm_mutex);
130 127
131 return error ? error : n; 128 return error ? error : n;
132} 129}
@@ -134,148 +131,8 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
134power_attr(pm_test); 131power_attr(pm_test);
135#endif /* CONFIG_PM_DEBUG */ 132#endif /* CONFIG_PM_DEBUG */
136 133
137#ifdef CONFIG_DEBUG_FS
138static char *suspend_step_name(enum suspend_stat_step step)
139{
140 switch (step) {
141 case SUSPEND_FREEZE:
142 return "freeze";
143 case SUSPEND_PREPARE:
144 return "prepare";
145 case SUSPEND_SUSPEND:
146 return "suspend";
147 case SUSPEND_SUSPEND_NOIRQ:
148 return "suspend_noirq";
149 case SUSPEND_RESUME_NOIRQ:
150 return "resume_noirq";
151 case SUSPEND_RESUME:
152 return "resume";
153 default:
154 return "";
155 }
156}
157
158static int suspend_stats_show(struct seq_file *s, void *unused)
159{
160 int i, index, last_dev, last_errno, last_step;
161
162 last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
163 last_dev %= REC_FAILED_NUM;
164 last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_late",
176 suspend_stats.failed_suspend_late,
177 "failed_suspend_noirq",
178 suspend_stats.failed_suspend_noirq,
179 "failed_resume", suspend_stats.failed_resume,
180 "failed_resume_early",
181 suspend_stats.failed_resume_early,
182 "failed_resume_noirq",
183 suspend_stats.failed_resume_noirq);
184 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
185 suspend_stats.failed_devs[last_dev]);
186 for (i = 1; i < REC_FAILED_NUM; i++) {
187 index = last_dev + REC_FAILED_NUM - i;
188 index %= REC_FAILED_NUM;
189 seq_printf(s, "\t\t\t%-s\n",
190 suspend_stats.failed_devs[index]);
191 }
192 seq_printf(s, " last_failed_errno:\t%-d\n",
193 suspend_stats.errno[last_errno]);
194 for (i = 1; i < REC_FAILED_NUM; i++) {
195 index = last_errno + REC_FAILED_NUM - i;
196 index %= REC_FAILED_NUM;
197 seq_printf(s, "\t\t\t%-d\n",
198 suspend_stats.errno[index]);
199 }
200 seq_printf(s, " last_failed_step:\t%-s\n",
201 suspend_step_name(
202 suspend_stats.failed_steps[last_step]));
203 for (i = 1; i < REC_FAILED_NUM; i++) {
204 index = last_step + REC_FAILED_NUM - i;
205 index %= REC_FAILED_NUM;
206 seq_printf(s, "\t\t\t%-s\n",
207 suspend_step_name(
208 suspend_stats.failed_steps[index]));
209 }
210
211 return 0;
212}
213
214static int suspend_stats_open(struct inode *inode, struct file *file)
215{
216 return single_open(file, suspend_stats_show, NULL);
217}
218
219static const struct file_operations suspend_stats_operations = {
220 .open = suspend_stats_open,
221 .read = seq_read,
222 .llseek = seq_lseek,
223 .release = single_release,
224};
225
226static int __init pm_debugfs_init(void)
227{
228 debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
229 NULL, NULL, &suspend_stats_operations);
230 return 0;
231}
232
233late_initcall(pm_debugfs_init);
234#endif /* CONFIG_DEBUG_FS */
235
236#endif /* CONFIG_PM_SLEEP */ 134#endif /* CONFIG_PM_SLEEP */
237 135
238#ifdef CONFIG_PM_SLEEP_DEBUG
239/*
240 * pm_print_times: print time taken by devices to suspend and resume.
241 *
242 * show() returns whether printing of suspend and resume times is enabled.
243 * store() accepts 0 or 1. 0 disables printing and 1 enables it.
244 */
245bool pm_print_times_enabled;
246
247static ssize_t pm_print_times_show(struct kobject *kobj,
248 struct kobj_attribute *attr, char *buf)
249{
250 return sprintf(buf, "%d\n", pm_print_times_enabled);
251}
252
253static ssize_t pm_print_times_store(struct kobject *kobj,
254 struct kobj_attribute *attr,
255 const char *buf, size_t n)
256{
257 unsigned long val;
258
259 if (kstrtoul(buf, 10, &val))
260 return -EINVAL;
261
262 if (val > 1)
263 return -EINVAL;
264
265 pm_print_times_enabled = !!val;
266 return n;
267}
268
269power_attr(pm_print_times);
270
271static inline void pm_print_times_init(void)
272{
273 pm_print_times_enabled = !!initcall_debug;
274}
275#else /* !CONFIG_PM_SLEEP_DEBUG */
276static inline void pm_print_times_init(void) {}
277#endif /* CONFIG_PM_SLEEP_DEBUG */
278
279struct kobject *power_kobj; 136struct kobject *power_kobj;
280 137
281/** 138/**
@@ -285,7 +142,7 @@ struct kobject *power_kobj;
285 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and 142 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
286 * 'disk' (Suspend-to-Disk). 143 * 'disk' (Suspend-to-Disk).
287 * 144 *
288 * store() accepts one of those strings, translates it into the 145 * store() accepts one of those strings, translates it into the
289 * proper enumerated value, and initiates a suspend transition. 146 * proper enumerated value, and initiates a suspend transition.
290 */ 147 */
291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 148static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -310,56 +167,47 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
310 return (s - buf); 167 return (s - buf);
311} 168}
312 169
313static suspend_state_t decode_state(const char *buf, size_t n) 170static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
171 const char *buf, size_t n)
314{ 172{
315#ifdef CONFIG_SUSPEND 173#ifdef CONFIG_SUSPEND
174#ifdef CONFIG_EARLYSUSPEND
175 suspend_state_t state = PM_SUSPEND_ON;
176#else
316 suspend_state_t state = PM_SUSPEND_STANDBY; 177 suspend_state_t state = PM_SUSPEND_STANDBY;
178#endif
317 const char * const *s; 179 const char * const *s;
318#endif 180#endif
319 char *p; 181 char *p;
320 int len; 182 int len;
183 int error = -EINVAL;
321 184
322 p = memchr(buf, '\n', n); 185 p = memchr(buf, '\n', n);
323 len = p ? p - buf : n; 186 len = p ? p - buf : n;
324 187
325 /* Check hibernation first. */ 188 /* First, check if we are requested to hibernate */
326 if (len == 4 && !strncmp(buf, "disk", len)) 189 if (len == 4 && !strncmp(buf, "disk", len)) {
327 return PM_SUSPEND_MAX; 190 error = hibernate();
191 goto Exit;
192 }
328 193
329#ifdef CONFIG_SUSPEND 194#ifdef CONFIG_SUSPEND
330 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) 195 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
331 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 196 if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
332 return state; 197 break;
333#endif
334
335 return PM_SUSPEND_ON;
336}
337
338static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
339 const char *buf, size_t n)
340{
341 suspend_state_t state;
342 int error;
343
344 error = pm_autosleep_lock();
345 if (error)
346 return error;
347
348 if (pm_autosleep_state() > PM_SUSPEND_ON) {
349 error = -EBUSY;
350 goto out;
351 } 198 }
199 if (state < PM_SUSPEND_MAX && *s)
200#ifdef CONFIG_EARLYSUSPEND
201 if (state == PM_SUSPEND_ON || valid_state(state)) {
202 error = 0;
203 request_suspend_state(state);
204 }
205#else
206 error = enter_state(state);
207#endif
208#endif
352 209
353 state = decode_state(buf, n); 210 Exit:
354 if (state < PM_SUSPEND_MAX)
355 error = pm_suspend(state);
356 else if (state == PM_SUSPEND_MAX)
357 error = hibernate();
358 else
359 error = -EINVAL;
360
361 out:
362 pm_autosleep_unlock();
363 return error ? error : n; 211 return error ? error : n;
364} 212}
365 213
@@ -400,8 +248,7 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
400{ 248{
401 unsigned int val; 249 unsigned int val;
402 250
403 return pm_get_wakeup_count(&val, true) ? 251 return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
404 sprintf(buf, "%u\n", val) : -EINTR;
405} 252}
406 253
407static ssize_t wakeup_count_store(struct kobject *kobj, 254static ssize_t wakeup_count_store(struct kobject *kobj,
@@ -409,106 +256,15 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
409 const char *buf, size_t n) 256 const char *buf, size_t n)
410{ 257{
411 unsigned int val; 258 unsigned int val;
412 int error;
413
414 error = pm_autosleep_lock();
415 if (error)
416 return error;
417 259
418 if (pm_autosleep_state() > PM_SUSPEND_ON) {
419 error = -EBUSY;
420 goto out;
421 }
422
423 error = -EINVAL;
424 if (sscanf(buf, "%u", &val) == 1) { 260 if (sscanf(buf, "%u", &val) == 1) {
425 if (pm_save_wakeup_count(val)) 261 if (pm_save_wakeup_count(val))
426 error = n; 262 return n;
427 } 263 }
428 264 return -EINVAL;
429 out:
430 pm_autosleep_unlock();
431 return error;
432} 265}
433 266
434power_attr(wakeup_count); 267power_attr(wakeup_count);
435
436#ifdef CONFIG_PM_AUTOSLEEP
437static ssize_t autosleep_show(struct kobject *kobj,
438 struct kobj_attribute *attr,
439 char *buf)
440{
441 suspend_state_t state = pm_autosleep_state();
442
443 if (state == PM_SUSPEND_ON)
444 return sprintf(buf, "off\n");
445
446#ifdef CONFIG_SUSPEND
447 if (state < PM_SUSPEND_MAX)
448 return sprintf(buf, "%s\n", valid_state(state) ?
449 pm_states[state] : "error");
450#endif
451#ifdef CONFIG_HIBERNATION
452 return sprintf(buf, "disk\n");
453#else
454 return sprintf(buf, "error");
455#endif
456}
457
458static ssize_t autosleep_store(struct kobject *kobj,
459 struct kobj_attribute *attr,
460 const char *buf, size_t n)
461{
462 suspend_state_t state = decode_state(buf, n);
463 int error;
464
465 if (state == PM_SUSPEND_ON
466 && strcmp(buf, "off") && strcmp(buf, "off\n"))
467 return -EINVAL;
468
469 error = pm_autosleep_set_state(state);
470 return error ? error : n;
471}
472
473power_attr(autosleep);
474#endif /* CONFIG_PM_AUTOSLEEP */
475
476#ifdef CONFIG_PM_WAKELOCKS
477static ssize_t wake_lock_show(struct kobject *kobj,
478 struct kobj_attribute *attr,
479 char *buf)
480{
481 return pm_show_wakelocks(buf, true);
482}
483
484static ssize_t wake_lock_store(struct kobject *kobj,
485 struct kobj_attribute *attr,
486 const char *buf, size_t n)
487{
488 int error = pm_wake_lock(buf);
489 return error ? error : n;
490}
491
492power_attr(wake_lock);
493
494static ssize_t wake_unlock_show(struct kobject *kobj,
495 struct kobj_attribute *attr,
496 char *buf)
497{
498 return pm_show_wakelocks(buf, false);
499}
500
501static ssize_t wake_unlock_store(struct kobject *kobj,
502 struct kobj_attribute *attr,
503 const char *buf, size_t n)
504{
505 int error = pm_wake_unlock(buf);
506 return error ? error : n;
507}
508
509power_attr(wake_unlock);
510
511#endif /* CONFIG_PM_WAKELOCKS */
512#endif /* CONFIG_PM_SLEEP */ 268#endif /* CONFIG_PM_SLEEP */
513 269
514#ifdef CONFIG_PM_TRACE 270#ifdef CONFIG_PM_TRACE
@@ -553,6 +309,11 @@ power_attr(pm_trace_dev_match);
553 309
554#endif /* CONFIG_PM_TRACE */ 310#endif /* CONFIG_PM_TRACE */
555 311
312#ifdef CONFIG_USER_WAKELOCK
313power_attr(wake_lock);
314power_attr(wake_unlock);
315#endif
316
556static struct attribute * g[] = { 317static struct attribute * g[] = {
557 &state_attr.attr, 318 &state_attr.attr,
558#ifdef CONFIG_PM_TRACE 319#ifdef CONFIG_PM_TRACE
@@ -562,18 +323,12 @@ static struct attribute * g[] = {
562#ifdef CONFIG_PM_SLEEP 323#ifdef CONFIG_PM_SLEEP
563 &pm_async_attr.attr, 324 &pm_async_attr.attr,
564 &wakeup_count_attr.attr, 325 &wakeup_count_attr.attr,
565#ifdef CONFIG_PM_AUTOSLEEP
566 &autosleep_attr.attr,
567#endif
568#ifdef CONFIG_PM_WAKELOCKS
569 &wake_lock_attr.attr,
570 &wake_unlock_attr.attr,
571#endif
572#ifdef CONFIG_PM_DEBUG 326#ifdef CONFIG_PM_DEBUG
573 &pm_test_attr.attr, 327 &pm_test_attr.attr,
574#endif 328#endif
575#ifdef CONFIG_PM_SLEEP_DEBUG 329#ifdef CONFIG_USER_WAKELOCK
576 &pm_print_times_attr.attr, 330 &wake_lock_attr.attr,
331 &wake_unlock_attr.attr,
577#endif 332#endif
578#endif 333#endif
579 NULL, 334 NULL,
@@ -607,11 +362,7 @@ static int __init pm_init(void)
607 power_kobj = kobject_create_and_add("power", NULL); 362 power_kobj = kobject_create_and_add("power", NULL);
608 if (!power_kobj) 363 if (!power_kobj)
609 return -ENOMEM; 364 return -ENOMEM;
610 error = sysfs_create_group(power_kobj, &attr_group); 365 return sysfs_create_group(power_kobj, &attr_group);
611 if (error)
612 return error;
613 pm_print_times_init();
614 return pm_autosleep_init();
615} 366}
616 367
617core_initcall(pm_init); 368core_initcall(pm_init);
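
wakeup_count_show()/wakeup_count_store(), kept above in their pre-autosleep form, still provide the race-free suspend handshake: a suspend daemon reads /sys/power/wakeup_count, writes the same value back, and only then writes the target state to /sys/power/state; the write-back fails if wakeup events arrived in between, telling the daemon to retry instead of suspending. A sketch of that sequence, using the standard interface and shown for illustration only:

/* Illustrative sketch only: race-free suspend via the wakeup_count handshake. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int read_file(const char *path, char *buf, size_t len)
{
	int fd = open(path, O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = read(fd, buf, len - 1);	/* may block until wakeup sources settle */
	close(fd);
	if (n <= 0)
		return -1;
	buf[n] = '\0';
	return 0;
}

static int write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	char count[32];

	if (read_file("/sys/power/wakeup_count", count, sizeof(count)))
		return 1;
	if (write_file("/sys/power/wakeup_count", count)) {
		fprintf(stderr, "wakeup events raced with us, retry later\n");
		return 1;
	}
	return write_file("/sys/power/state", "mem") ? 1 : 0;	/* enter suspend-to-RAM */
}
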
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7d4b7ffb3c1..b6b9006480f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,8 +50,6 @@ static inline char *check_image_kernel(struct swsusp_info *info)
50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
51 51
52/* kernel/power/hibernate.c */ 52/* kernel/power/hibernate.c */
53extern bool freezer_test_done;
54
55extern int hibernation_snapshot(int platform_mode); 53extern int hibernation_snapshot(int platform_mode);
56extern int hibernation_restore(int platform_mode); 54extern int hibernation_restore(int platform_mode);
57extern int hibernation_platform_enter(void); 55extern int hibernation_platform_enter(void);
@@ -148,7 +146,6 @@ extern int swsusp_swap_in_use(void);
148 */ 146 */
149#define SF_PLATFORM_MODE 1 147#define SF_PLATFORM_MODE 1
150#define SF_NOCOMPRESS_MODE 2 148#define SF_NOCOMPRESS_MODE 2
151#define SF_CRC32_MODE 4
152 149
153/* kernel/power/hibernate.c */ 150/* kernel/power/hibernate.c */
154extern int swsusp_check(void); 151extern int swsusp_check(void);
@@ -156,9 +153,6 @@ extern void swsusp_free(void);
156extern int swsusp_read(unsigned int *flags_p); 153extern int swsusp_read(unsigned int *flags_p);
157extern int swsusp_write(unsigned int flags); 154extern int swsusp_write(unsigned int flags);
158extern void swsusp_close(fmode_t); 155extern void swsusp_close(fmode_t);
159#ifdef CONFIG_SUSPEND
160extern int swsusp_unmark(void);
161#endif
162 156
163/* kernel/power/block_io.c */ 157/* kernel/power/block_io.c */
164extern struct block_device *hib_resume_bdev; 158extern struct block_device *hib_resume_bdev;
@@ -180,11 +174,13 @@ extern const char *const pm_states[];
180 174
181extern bool valid_state(suspend_state_t state); 175extern bool valid_state(suspend_state_t state);
182extern int suspend_devices_and_enter(suspend_state_t state); 176extern int suspend_devices_and_enter(suspend_state_t state);
177extern int enter_state(suspend_state_t state);
183#else /* !CONFIG_SUSPEND */ 178#else /* !CONFIG_SUSPEND */
184static inline int suspend_devices_and_enter(suspend_state_t state) 179static inline int suspend_devices_and_enter(suspend_state_t state)
185{ 180{
186 return -ENOSYS; 181 return -ENOSYS;
187} 182}
183static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
188static inline bool valid_state(suspend_state_t state) { return false; } 184static inline bool valid_state(suspend_state_t state) { return false; }
189#endif /* !CONFIG_SUSPEND */ 185#endif /* !CONFIG_SUSPEND */
190 186
@@ -232,25 +228,7 @@ extern int pm_test_level;
232#ifdef CONFIG_SUSPEND_FREEZER 228#ifdef CONFIG_SUSPEND_FREEZER
233static inline int suspend_freeze_processes(void) 229static inline int suspend_freeze_processes(void)
234{ 230{
235 int error; 231 return freeze_processes();
236
237 error = freeze_processes();
238 /*
239 * freeze_processes() automatically thaws every task if freezing
240 * fails. So we need not do anything extra upon error.
241 */
242 if (error)
243 return error;
244
245 error = freeze_kernel_threads();
246 /*
247 * freeze_kernel_threads() thaws only kernel threads upon freezing
248 * failure. So we have to thaw the userspace tasks ourselves.
249 */
250 if (error)
251 thaw_processes();
252
253 return error;
254} 232}
255 233
256static inline void suspend_thaw_processes(void) 234static inline void suspend_thaw_processes(void)
@@ -268,29 +246,26 @@ static inline void suspend_thaw_processes(void)
268} 246}
269#endif 247#endif
270 248
271#ifdef CONFIG_PM_AUTOSLEEP 249#ifdef CONFIG_WAKELOCK
272
273/* kernel/power/autosleep.c */
274extern int pm_autosleep_init(void);
275extern int pm_autosleep_lock(void);
276extern void pm_autosleep_unlock(void);
277extern suspend_state_t pm_autosleep_state(void);
278extern int pm_autosleep_set_state(suspend_state_t state);
279
280#else /* !CONFIG_PM_AUTOSLEEP */
281
282static inline int pm_autosleep_init(void) { return 0; }
283static inline int pm_autosleep_lock(void) { return 0; }
284static inline void pm_autosleep_unlock(void) {}
285static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; }
286
287#endif /* !CONFIG_PM_AUTOSLEEP */
288
289#ifdef CONFIG_PM_WAKELOCKS
290
291/* kernel/power/wakelock.c */ 250/* kernel/power/wakelock.c */
292extern ssize_t pm_show_wakelocks(char *buf, bool show_active); 251extern struct workqueue_struct *suspend_work_queue;
293extern int pm_wake_lock(const char *buf); 252extern struct wake_lock main_wake_lock;
294extern int pm_wake_unlock(const char *buf); 253extern suspend_state_t requested_suspend_state;
254#endif
295 255
296#endif /* !CONFIG_PM_WAKELOCKS */ 256#ifdef CONFIG_USER_WAKELOCK
257ssize_t wake_lock_show(struct kobject *kobj, struct kobj_attribute *attr,
258 char *buf);
259ssize_t wake_lock_store(struct kobject *kobj, struct kobj_attribute *attr,
260 const char *buf, size_t n);
261ssize_t wake_unlock_show(struct kobject *kobj, struct kobj_attribute *attr,
262 char *buf);
263ssize_t wake_unlock_store(struct kobject *kobj, struct kobj_attribute *attr,
264 const char *buf, size_t n);
265#endif
266
267#ifdef CONFIG_EARLYSUSPEND
268/* kernel/power/earlysuspend.c */
269void request_suspend_state(suspend_state_t state);
270suspend_state_t get_suspend_state(void);
271#endif
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 68197a4e8fc..d52359374e8 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
39 39
40static int __init pm_sysrq_init(void) 40static int pm_sysrq_init(void)
41{ 41{
42 register_sysrq_key('o', &sysrq_poweroff_op); 42 register_sysrq_key('o', &sysrq_poweroff_op);
43 return 0; 43 return 0;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d5a258b60c6..31338cdeafc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -16,14 +16,23 @@
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h> 18#include <linux/workqueue.h>
19#include <linux/kmod.h> 19#include <linux/wakelock.h>
20 20
21/* 21/*
22 * Timeout for stopping processes 22 * Timeout for stopping processes
23 */ 23 */
24#define TIMEOUT (20 * HZ) 24#define TIMEOUT (20 * HZ)
25 25
26static int try_to_freeze_tasks(bool user_only) 26static inline int freezable(struct task_struct * p)
27{
28 if ((p == current) ||
29 (p->flags & PF_NOFREEZE) ||
30 (p->exit_state != 0))
31 return 0;
32 return 1;
33}
34
35static int try_to_freeze_tasks(bool sig_only)
27{ 36{
28 struct task_struct *g, *p; 37 struct task_struct *g, *p;
29 unsigned long end_time; 38 unsigned long end_time;
@@ -38,26 +47,46 @@ static int try_to_freeze_tasks(bool user_only)
38 47
39 end_time = jiffies + TIMEOUT; 48 end_time = jiffies + TIMEOUT;
40 49
41 if (!user_only) 50 if (!sig_only)
42 freeze_workqueues_begin(); 51 freeze_workqueues_begin();
43 52
44 while (true) { 53 while (true) {
45 todo = 0; 54 todo = 0;
46 read_lock(&tasklist_lock); 55 read_lock(&tasklist_lock);
47 do_each_thread(g, p) { 56 do_each_thread(g, p) {
48 if (p == current || !freeze_task(p)) 57 if (frozen(p) || !freezable(p))
58 continue;
59
60 if (!freeze_task(p, sig_only))
49 continue; 61 continue;
50 62
51 if (!freezer_should_skip(p)) 63 /*
64 * Now that we've done set_freeze_flag, don't
65 * perturb a task in TASK_STOPPED or TASK_TRACED.
66 * It is "frozen enough". If the task does wake
67 * up, it will immediately call try_to_freeze.
68 *
69 * Because freeze_task() goes through p's
70 * scheduler lock after setting TIF_FREEZE, it's
71 * guaranteed that either we see TASK_RUNNING or
72 * try_to_stop() after schedule() in ptrace/signal
73 * stop sees TIF_FREEZE.
74 */
75 if (!task_is_stopped_or_traced(p) &&
76 !freezer_should_skip(p))
52 todo++; 77 todo++;
53 } while_each_thread(g, p); 78 } while_each_thread(g, p);
54 read_unlock(&tasklist_lock); 79 read_unlock(&tasklist_lock);
55 80
56 if (!user_only) { 81 if (!sig_only) {
57 wq_busy = freeze_workqueues_busy(); 82 wq_busy = freeze_workqueues_busy();
58 todo += wq_busy; 83 todo += wq_busy;
59 } 84 }
60 85
86 if (todo && has_wake_lock(WAKE_LOCK_SUSPEND)) {
87 wakeup = 1;
88 break;
89 }
61 if (!todo || time_after(jiffies, end_time)) 90 if (!todo || time_after(jiffies, end_time))
62 break; 91 break;
63 92
@@ -68,7 +97,7 @@ static int try_to_freeze_tasks(bool user_only)
68 97
69 /* 98 /*
70 * We need to retry, but first give the freezing tasks some 99 * We need to retry, but first give the freezing tasks some
71 * time to enter the refrigerator. 100 * time to enter the refrigerator.
72 */ 101 */
73 msleep(10); 102 msleep(10);
74 } 103 }
@@ -79,22 +108,35 @@ static int try_to_freeze_tasks(bool user_only)
79 elapsed_csecs = elapsed_csecs64; 108 elapsed_csecs = elapsed_csecs64;
80 109
81 if (todo) { 110 if (todo) {
82 printk("\n"); 111 /* This does not unfreeze processes that are already frozen
83 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 112 * (we have slightly ugly calling convention in that respect,
84 "(%d tasks refusing to freeze, wq_busy=%d):\n", 113 * and caller must call thaw_processes() if something fails),
85 wakeup ? "aborted" : "failed", 114 * but it cleans up leftover PF_FREEZE requests.
86 elapsed_csecs / 100, elapsed_csecs % 100, 115 */
87 todo - wq_busy, wq_busy); 116 if(wakeup) {
88 117 printk("\n");
89 if (!wakeup) { 118 printk(KERN_ERR "Freezing of %s aborted\n",
90 read_lock(&tasklist_lock); 119 sig_only ? "user space " : "tasks ");
91 do_each_thread(g, p) {
92 if (p != current && !freezer_should_skip(p)
93 && freezing(p) && !frozen(p))
94 sched_show_task(p);
95 } while_each_thread(g, p);
96 read_unlock(&tasklist_lock);
97 } 120 }
121 else {
122 printk("\n");
123 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
124 "(%d tasks refusing to freeze, wq_busy=%d):\n",
125 elapsed_csecs / 100, elapsed_csecs % 100,
126 todo - wq_busy, wq_busy);
127 }
128 thaw_workqueues();
129
130 read_lock(&tasklist_lock);
131 do_each_thread(g, p) {
132 task_lock(p);
133 if (freezing(p) && !freezer_should_skip(p) &&
134 elapsed_csecs > 100)
135 sched_show_task(p);
136 cancel_freezing(p);
137 task_unlock(p);
138 } while_each_thread(g, p);
139 read_unlock(&tasklist_lock);
98 } else { 140 } else {
99 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 141 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
100 elapsed_csecs % 100); 142 elapsed_csecs % 100);
@@ -104,106 +146,61 @@ static int try_to_freeze_tasks(bool user_only)
104} 146}
105 147
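For context, try_to_freeze_tasks() only counts a task as pending if it is freezable and has not yet entered the refrigerator; a freezable kernel thread cooperates by polling try_to_freeze() in its main loop. A minimal sketch, not part of this patch (the thread body and names are illustrative):

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/delay.h>

/* Minimal freezable kthread: parks itself in the refrigerator whenever
 * the freezer is active, so try_to_freeze_tasks() stops counting it. */
static int example_thread(void *data)
{
	set_freezable();		/* clear PF_NOFREEZE so freezable() accepts us */

	while (!kthread_should_stop()) {
		try_to_freeze();	/* enters the refrigerator if freezing */
		msleep(100);		/* placeholder for real work */
	}
	return 0;
}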
106/** 148/**
107 * freeze_processes - Signal user space processes to enter the refrigerator. 149 * freeze_processes - tell processes to enter the refrigerator
108 *
109 * On success, returns 0. On failure, -errno and system is fully thawed.
110 */ 150 */
111int freeze_processes(void) 151int freeze_processes(void)
112{ 152{
113 int error; 153 int error;
114 154
115 error = __usermodehelper_disable(UMH_FREEZING);
116 if (error)
117 return error;
118
119 if (!pm_freezing)
120 atomic_inc(&system_freezing_cnt);
121
122 printk("Freezing user space processes ... "); 155 printk("Freezing user space processes ... ");
123 pm_freezing = true;
124 error = try_to_freeze_tasks(true); 156 error = try_to_freeze_tasks(true);
125 if (!error) {
126 printk("done.");
127 __usermodehelper_set_disable_depth(UMH_DISABLED);
128 oom_killer_disable();
129 }
130 printk("\n");
131 BUG_ON(in_atomic());
132
133 if (error) 157 if (error)
134 thaw_processes(); 158 goto Exit;
135 return error; 159 printk("done.\n");
136}
137
138/**
139 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
140 *
141 * On success, returns 0. On failure, -errno and only the kernel threads are
142 * thawed, so as to give a chance to the caller to do additional cleanups
143 * (if any) before thawing the userspace tasks. So, it is the responsibility
144 * of the caller to thaw the userspace tasks, when the time is right.
145 */
146int freeze_kernel_threads(void)
147{
148 int error;
149 160
150 printk("Freezing remaining freezable tasks ... "); 161 printk("Freezing remaining freezable tasks ... ");
151 pm_nosig_freezing = true;
152 error = try_to_freeze_tasks(false); 162 error = try_to_freeze_tasks(false);
153 if (!error) 163 if (error)
154 printk("done."); 164 goto Exit;
165 printk("done.");
155 166
156 printk("\n"); 167 oom_killer_disable();
168 Exit:
157 BUG_ON(in_atomic()); 169 BUG_ON(in_atomic());
170 printk("\n");
158 171
159 if (error)
160 thaw_kernel_threads();
161 return error; 172 return error;
162} 173}
163 174
164void thaw_processes(void) 175static void thaw_tasks(bool nosig_only)
165{ 176{
166 struct task_struct *g, *p; 177 struct task_struct *g, *p;
167 178
168 if (pm_freezing) 179 read_lock(&tasklist_lock);
169 atomic_dec(&system_freezing_cnt); 180 do_each_thread(g, p) {
170 pm_freezing = false; 181 if (!freezable(p))
171 pm_nosig_freezing = false; 182 continue;
172
173 oom_killer_enable();
174 183
175 printk("Restarting tasks ... "); 184 if (nosig_only && should_send_signal(p))
185 continue;
176 186
177 thaw_workqueues(); 187 if (cgroup_freezing_or_frozen(p))
188 continue;
178 189
179 read_lock(&tasklist_lock); 190 thaw_process(p);
180 do_each_thread(g, p) {
181 __thaw_task(p);
182 } while_each_thread(g, p); 191 } while_each_thread(g, p);
183 read_unlock(&tasklist_lock); 192 read_unlock(&tasklist_lock);
184
185 usermodehelper_enable();
186
187 schedule();
188 printk("done.\n");
189} 193}
190 194
191void thaw_kernel_threads(void) 195void thaw_processes(void)
192{ 196{
193 struct task_struct *g, *p; 197 oom_killer_enable();
194
195 pm_nosig_freezing = false;
196 printk("Restarting kernel threads ... ");
197 198
199 printk("Restarting tasks ... ");
198 thaw_workqueues(); 200 thaw_workqueues();
199 201 thaw_tasks(true);
200 read_lock(&tasklist_lock); 202 thaw_tasks(false);
201 do_each_thread(g, p) {
202 if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
203 __thaw_task(p);
204 } while_each_thread(g, p);
205 read_unlock(&tasklist_lock);
206
207 schedule(); 203 schedule();
208 printk("done.\n"); 204 printk("done.\n");
209} 205}
206
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
deleted file mode 100644
index 9322ff7eaad..00000000000
--- a/kernel/power/qos.c
+++ /dev/null
@@ -1,602 +0,0 @@
1/*
2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of:
4 *
5 * Dependents on a QoS value : register requests
6 * Watchers of QoS value : get notified when target QoS value changes
7 *
8 * This QoS design is best effort based. Dependents register their QoS needs.
9 * Watchers register to keep track of the current QoS needs of the system.
10 *
11 * There are 3 basic classes of QoS parameter: latency, timeout, throughput
 12 * each with defined units:
13 * latency: usec
14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec)
16 *
 17 * There are lists of pm_qos_objects, each one wrapping requests and notifiers
18 *
19 * User mode requests on a QOS parameter register themselves to the
 20 * subsystem by opening the device node /dev/... and writing their request to
 21 * the node. As long as the process holds a file handle open to the node, the
 22 * client continues to be accounted for. Upon file release the usermode
 23 * request is removed and a new qos target is computed. This way, when the
 24 * application closes the file or exits, its request is cleaned up and the
 25 * pm_qos_object gets an opportunity to recompute the target.
26 *
27 * Mark Gross <mgross@linux.intel.com>
28 */
29
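For reference (not part of the patch), a user-space client of this interface opens one of the misc device nodes defined below, e.g. /dev/cpu_dma_latency, and writes a binary s32 request; the constraint stays in force until the file descriptor is closed. A hedged sketch:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t us = 100;	/* request CPU/DMA latency of at most 100 usec */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, &us, sizeof(us)) != sizeof(us)) {
		perror("write");
		close(fd);
		return 1;
	}
	pause();	/* the request is dropped when the fd is closed or we exit */
	close(fd);
	return 0;
}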
30/*#define DEBUG*/
31
32#include <linux/pm_qos.h>
33#include <linux/sched.h>
34#include <linux/spinlock.h>
35#include <linux/slab.h>
36#include <linux/time.h>
37#include <linux/fs.h>
38#include <linux/device.h>
39#include <linux/miscdevice.h>
40#include <linux/string.h>
41#include <linux/platform_device.h>
42#include <linux/init.h>
43#include <linux/kernel.h>
44
45#include <linux/uaccess.h>
46#include <linux/export.h>
47
48/*
49 * locking rule: all changes to constraints or notifiers lists
50 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
51 * held, taken with _irqsave. One lock to rule them all
52 */
53struct pm_qos_object {
54 struct pm_qos_constraints *constraints;
55 struct miscdevice pm_qos_power_miscdev;
56 char *name;
57};
58
59static DEFINE_SPINLOCK(pm_qos_lock);
60
61static struct pm_qos_object null_pm_qos;
62
63static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
64static struct pm_qos_constraints cpu_dma_constraints = {
65 .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
66 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
67 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
68 .type = PM_QOS_MIN,
69 .notifiers = &cpu_dma_lat_notifier,
70};
71static struct pm_qos_object cpu_dma_pm_qos = {
72 .constraints = &cpu_dma_constraints,
73 .name = "cpu_dma_latency",
74};
75
76static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
77static struct pm_qos_constraints network_lat_constraints = {
78 .list = PLIST_HEAD_INIT(network_lat_constraints.list),
79 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
80 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
81 .type = PM_QOS_MIN,
82 .notifiers = &network_lat_notifier,
83};
84static struct pm_qos_object network_lat_pm_qos = {
85 .constraints = &network_lat_constraints,
86 .name = "network_latency",
87};
88
89
90static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
91static struct pm_qos_constraints network_tput_constraints = {
92 .list = PLIST_HEAD_INIT(network_tput_constraints.list),
93 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
94 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
95 .type = PM_QOS_MAX,
96 .notifiers = &network_throughput_notifier,
97};
98static struct pm_qos_object network_throughput_pm_qos = {
99 .constraints = &network_tput_constraints,
100 .name = "network_throughput",
101};
102
103
104static struct pm_qos_object *pm_qos_array[] = {
105 &null_pm_qos,
106 &cpu_dma_pm_qos,
107 &network_lat_pm_qos,
108 &network_throughput_pm_qos
109};
110
111static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
112 size_t count, loff_t *f_pos);
113static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
114 size_t count, loff_t *f_pos);
115static int pm_qos_power_open(struct inode *inode, struct file *filp);
116static int pm_qos_power_release(struct inode *inode, struct file *filp);
117
118static const struct file_operations pm_qos_power_fops = {
119 .write = pm_qos_power_write,
120 .read = pm_qos_power_read,
121 .open = pm_qos_power_open,
122 .release = pm_qos_power_release,
123 .llseek = noop_llseek,
124};
125
126/* unlocked internal variant */
127static inline int pm_qos_get_value(struct pm_qos_constraints *c)
128{
129 if (plist_head_empty(&c->list))
130 return c->default_value;
131
132 switch (c->type) {
133 case PM_QOS_MIN:
134 return plist_first(&c->list)->prio;
135
136 case PM_QOS_MAX:
137 return plist_last(&c->list)->prio;
138
139 default:
140 /* runtime check for not using enum */
141 BUG();
142 return PM_QOS_DEFAULT_VALUE;
143 }
144}
145
146s32 pm_qos_read_value(struct pm_qos_constraints *c)
147{
148 return c->target_value;
149}
150
151static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
152{
153 c->target_value = value;
154}
155
156/**
157 * pm_qos_update_target - manages the constraints list and calls the notifiers
158 * if needed
159 * @c: constraints data struct
160 * @node: request to add to the list, to update or to remove
161 * @action: action to take on the constraints list
162 * @value: value of the request to add or update
163 *
164 * This function returns 1 if the aggregated constraint value has changed, 0
165 * otherwise.
166 */
167int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
168 enum pm_qos_req_action action, int value)
169{
170 unsigned long flags;
171 int prev_value, curr_value, new_value;
172
173 spin_lock_irqsave(&pm_qos_lock, flags);
174 prev_value = pm_qos_get_value(c);
175 if (value == PM_QOS_DEFAULT_VALUE)
176 new_value = c->default_value;
177 else
178 new_value = value;
179
180 switch (action) {
181 case PM_QOS_REMOVE_REQ:
182 plist_del(node, &c->list);
183 break;
184 case PM_QOS_UPDATE_REQ:
185 /*
186 * to change the list, we atomically remove, reinit
187 * with new value and add, then see if the extremal
188 * changed
189 */
190 plist_del(node, &c->list);
191 case PM_QOS_ADD_REQ:
192 plist_node_init(node, new_value);
193 plist_add(node, &c->list);
194 break;
195 default:
196 /* no action */
197 ;
198 }
199
200 curr_value = pm_qos_get_value(c);
201 pm_qos_set_value(c, curr_value);
202
203 spin_unlock_irqrestore(&pm_qos_lock, flags);
204
205 if (prev_value != curr_value) {
206 blocking_notifier_call_chain(c->notifiers,
207 (unsigned long)curr_value,
208 NULL);
209 return 1;
210 } else {
211 return 0;
212 }
213}
214
215/**
216 * pm_qos_flags_remove_req - Remove device PM QoS flags request.
217 * @pqf: Device PM QoS flags set to remove the request from.
218 * @req: Request to remove from the set.
219 */
220static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf,
221 struct pm_qos_flags_request *req)
222{
223 s32 val = 0;
224
225 list_del(&req->node);
226 list_for_each_entry(req, &pqf->list, node)
227 val |= req->flags;
228
229 pqf->effective_flags = val;
230}
231
232/**
233 * pm_qos_update_flags - Update a set of PM QoS flags.
234 * @pqf: Set of flags to update.
235 * @req: Request to add to the set, to modify, or to remove from the set.
236 * @action: Action to take on the set.
237 * @val: Value of the request to add or modify.
238 *
239 * Update the given set of PM QoS flags and call notifiers if the aggregate
240 * value has changed. Returns 1 if the aggregate constraint value has changed,
241 * 0 otherwise.
242 */
243bool pm_qos_update_flags(struct pm_qos_flags *pqf,
244 struct pm_qos_flags_request *req,
245 enum pm_qos_req_action action, s32 val)
246{
247 unsigned long irqflags;
248 s32 prev_value, curr_value;
249
250 spin_lock_irqsave(&pm_qos_lock, irqflags);
251
252 prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
253
254 switch (action) {
255 case PM_QOS_REMOVE_REQ:
256 pm_qos_flags_remove_req(pqf, req);
257 break;
258 case PM_QOS_UPDATE_REQ:
259 pm_qos_flags_remove_req(pqf, req);
260 case PM_QOS_ADD_REQ:
261 req->flags = val;
262 INIT_LIST_HEAD(&req->node);
263 list_add_tail(&req->node, &pqf->list);
264 pqf->effective_flags |= val;
265 break;
266 default:
267 /* no action */
268 ;
269 }
270
271 curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
272
273 spin_unlock_irqrestore(&pm_qos_lock, irqflags);
274
275 return prev_value != curr_value;
276}
277
278/**
279 * pm_qos_request - returns current system wide qos expectation
280 * @pm_qos_class: identification of which qos value is requested
281 *
282 * This function returns the current target value.
283 */
284int pm_qos_request(int pm_qos_class)
285{
286 return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
287}
288EXPORT_SYMBOL_GPL(pm_qos_request);
289
290int pm_qos_request_active(struct pm_qos_request *req)
291{
292 return req->pm_qos_class != 0;
293}
294EXPORT_SYMBOL_GPL(pm_qos_request_active);
295
296/**
297 * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
298 * @work: work struct for the delayed work (timeout)
299 *
300 * This cancels the timeout request by falling back to the default at timeout.
301 */
302static void pm_qos_work_fn(struct work_struct *work)
303{
304 struct pm_qos_request *req = container_of(to_delayed_work(work),
305 struct pm_qos_request,
306 work);
307
308 pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
309}
310
311/**
312 * pm_qos_add_request - inserts new qos request into the list
313 * @req: pointer to a preallocated handle
314 * @pm_qos_class: identifies which list of qos request to use
315 * @value: defines the qos request
316 *
317 * This function inserts a new entry in the pm_qos_class list of requested qos
318 * performance characteristics. It recomputes the aggregate QoS expectations
319 * for the pm_qos_class of parameters and initializes the pm_qos_request
320 * handle. Caller needs to save this handle for later use in updates and
321 * removal.
322 */
323
324void pm_qos_add_request(struct pm_qos_request *req,
325 int pm_qos_class, s32 value)
326{
327 if (!req) /*guard against callers passing in null */
328 return;
329
330 if (pm_qos_request_active(req)) {
331 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
332 return;
333 }
334 req->pm_qos_class = pm_qos_class;
335 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
336 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
337 &req->node, PM_QOS_ADD_REQ, value);
338}
339EXPORT_SYMBOL_GPL(pm_qos_add_request);
340
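For comparison (not part of the patch), a kernel caller embeds a struct pm_qos_request and drives it through the add/update/remove calls declared in <linux/pm_qos.h>; a minimal sketch with illustrative function names:

#include <linux/pm_qos.h>

static struct pm_qos_request example_req;	/* illustrative driver-owned handle */

static void example_start_lowlatency(void)
{
	/* Cap CPU/DMA latency at 50 usec while the device is busy. */
	pm_qos_add_request(&example_req, PM_QOS_CPU_DMA_LATENCY, 50);
}

static void example_relax_lowlatency(void)
{
	/* Fall back to the class default without dropping the handle. */
	pm_qos_update_request(&example_req, PM_QOS_DEFAULT_VALUE);
}

static void example_stop_lowlatency(void)
{
	/* Remove the request; the aggregate target is recomputed. */
	pm_qos_remove_request(&example_req);
}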
341/**
342 * pm_qos_update_request - modifies an existing qos request
343 * @req : handle to list element holding a pm_qos request to use
344 * @value: defines the qos request
345 *
346 * Updates an existing qos request for the pm_qos_class of parameters along
347 * with updating the target pm_qos_class value.
348 *
349 * Attempts are made to make this code callable on hot code paths.
350 */
351void pm_qos_update_request(struct pm_qos_request *req,
352 s32 new_value)
353{
354 if (!req) /*guard against callers passing in null */
355 return;
356
357 if (!pm_qos_request_active(req)) {
358 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
359 return;
360 }
361
362 if (delayed_work_pending(&req->work))
363 cancel_delayed_work_sync(&req->work);
364
365 if (new_value != req->node.prio)
366 pm_qos_update_target(
367 pm_qos_array[req->pm_qos_class]->constraints,
368 &req->node, PM_QOS_UPDATE_REQ, new_value);
369}
370EXPORT_SYMBOL_GPL(pm_qos_update_request);
371
372/**
373 * pm_qos_update_request_timeout - modifies an existing qos request temporarily.
374 * @req : handle to list element holding a pm_qos request to use
 375 * @new_value: defines the temporary qos request
376 * @timeout_us: the effective duration of this qos request in usecs.
377 *
378 * After timeout_us, this qos request is cancelled automatically.
379 */
380void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
381 unsigned long timeout_us)
382{
383 if (!req)
384 return;
385 if (WARN(!pm_qos_request_active(req),
386 "%s called for unknown object.", __func__))
387 return;
388
389 if (delayed_work_pending(&req->work))
390 cancel_delayed_work_sync(&req->work);
391
392 if (new_value != req->node.prio)
393 pm_qos_update_target(
394 pm_qos_array[req->pm_qos_class]->constraints,
395 &req->node, PM_QOS_UPDATE_REQ, new_value);
396
397 schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
398}
399
400/**
401 * pm_qos_remove_request - modifies an existing qos request
402 * @req: handle to request list element
403 *
404 * Will remove pm qos request from the list of constraints and
405 * recompute the current target value for the pm_qos_class. Call this
406 * on slow code paths.
407 */
408void pm_qos_remove_request(struct pm_qos_request *req)
409{
410 if (!req) /*guard against callers passing in null */
411 return;
412 /* silent return to keep pcm code cleaner */
413
414 if (!pm_qos_request_active(req)) {
415 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
416 return;
417 }
418
419 if (delayed_work_pending(&req->work))
420 cancel_delayed_work_sync(&req->work);
421
422 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
423 &req->node, PM_QOS_REMOVE_REQ,
424 PM_QOS_DEFAULT_VALUE);
425 memset(req, 0, sizeof(*req));
426}
427EXPORT_SYMBOL_GPL(pm_qos_remove_request);
428
429/**
430 * pm_qos_add_notifier - sets notification entry for changes to target value
431 * @pm_qos_class: identifies which qos target changes should be notified.
432 * @notifier: notifier block managed by caller.
433 *
434 * will register the notifier into a notification chain that gets called
435 * upon changes to the pm_qos_class target value.
436 */
437int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
438{
439 int retval;
440
441 retval = blocking_notifier_chain_register(
442 pm_qos_array[pm_qos_class]->constraints->notifiers,
443 notifier);
444
445 return retval;
446}
447EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
448
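A watcher registers a notifier_block and is called with the new aggregate value, as pm_qos_update_target() does above; a hedged sketch (everything except the pm_qos_* and notifier APIs is illustrative):

#include <linux/pm_qos.h>
#include <linux/notifier.h>
#include <linux/printk.h>

/* Illustrative callback: invoked whenever the aggregate target changes. */
static int example_qos_notify(struct notifier_block *nb,
			      unsigned long new_target, void *data)
{
	pr_info("cpu_dma_latency target is now %lu usec\n", new_target);
	return NOTIFY_OK;
}

static struct notifier_block example_qos_nb = {
	.notifier_call = example_qos_notify,
};

static void example_watch_qos(void)
{
	pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, &example_qos_nb);
}

static void example_unwatch_qos(void)
{
	pm_qos_remove_notifier(PM_QOS_CPU_DMA_LATENCY, &example_qos_nb);
}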
449/**
450 * pm_qos_remove_notifier - deletes notification entry from chain.
451 * @pm_qos_class: identifies which qos target changes are notified.
452 * @notifier: notifier block to be removed.
453 *
454 * will remove the notifier from the notification chain that gets called
455 * upon changes to the pm_qos_class target value.
456 */
457int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
458{
459 int retval;
460
461 retval = blocking_notifier_chain_unregister(
462 pm_qos_array[pm_qos_class]->constraints->notifiers,
463 notifier);
464
465 return retval;
466}
467EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
468
469/* User space interface to PM QoS classes via misc devices */
470static int register_pm_qos_misc(struct pm_qos_object *qos)
471{
472 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
473 qos->pm_qos_power_miscdev.name = qos->name;
474 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
475
476 return misc_register(&qos->pm_qos_power_miscdev);
477}
478
479static int find_pm_qos_object_by_minor(int minor)
480{
481 int pm_qos_class;
482
483 for (pm_qos_class = 0;
484 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
485 if (minor ==
486 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
487 return pm_qos_class;
488 }
489 return -1;
490}
491
492static int pm_qos_power_open(struct inode *inode, struct file *filp)
493{
494 long pm_qos_class;
495
496 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
497 if (pm_qos_class >= 0) {
498 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
499 if (!req)
500 return -ENOMEM;
501
502 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
503 filp->private_data = req;
504
505 return 0;
506 }
507 return -EPERM;
508}
509
510static int pm_qos_power_release(struct inode *inode, struct file *filp)
511{
512 struct pm_qos_request *req;
513
514 req = filp->private_data;
515 pm_qos_remove_request(req);
516 kfree(req);
517
518 return 0;
519}
520
521
522static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
523 size_t count, loff_t *f_pos)
524{
525 s32 value;
526 unsigned long flags;
527 struct pm_qos_request *req = filp->private_data;
528
529 if (!req)
530 return -EINVAL;
531 if (!pm_qos_request_active(req))
532 return -EINVAL;
533
534 spin_lock_irqsave(&pm_qos_lock, flags);
535 value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
536 spin_unlock_irqrestore(&pm_qos_lock, flags);
537
538 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
539}
540
541static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
542 size_t count, loff_t *f_pos)
543{
544 s32 value;
545 struct pm_qos_request *req;
546
547 if (count == sizeof(s32)) {
548 if (copy_from_user(&value, buf, sizeof(s32)))
549 return -EFAULT;
550 } else if (count <= 11) { /* ASCII perhaps? */
551 char ascii_value[11];
552 unsigned long int ulval;
553 int ret;
554
555 if (copy_from_user(ascii_value, buf, count))
556 return -EFAULT;
557
558 if (count > 10) {
559 if (ascii_value[10] == '\n')
560 ascii_value[10] = '\0';
561 else
562 return -EINVAL;
563 } else {
564 ascii_value[count] = '\0';
565 }
566 ret = kstrtoul(ascii_value, 16, &ulval);
567 if (ret) {
568 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
569 return -EINVAL;
570 }
571 value = (s32)lower_32_bits(ulval);
572 } else {
573 return -EINVAL;
574 }
575
576 req = filp->private_data;
577 pm_qos_update_request(req, value);
578
579 return count;
580}
581
582
583static int __init pm_qos_power_init(void)
584{
585 int ret = 0;
586 int i;
587
588 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
589
590 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
591 ret = register_pm_qos_misc(pm_qos_array[i]);
592 if (ret < 0) {
593 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
594 pm_qos_array[i]->name);
595 return ret;
596 }
597 }
598
599 return ret;
600}
601
602late_initcall(pm_qos_power_init);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0de28576807..06efa54f93d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -711,10 +711,9 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
711 list_for_each_entry(region, &nosave_regions, list) { 711 list_for_each_entry(region, &nosave_regions, list) {
712 unsigned long pfn; 712 unsigned long pfn;
713 713
714 pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", 714 pr_debug("PM: Marking nosave pages: %016lx - %016lx\n",
715 (unsigned long long) region->start_pfn << PAGE_SHIFT, 715 region->start_pfn << PAGE_SHIFT,
716 ((unsigned long long) region->end_pfn << PAGE_SHIFT) 716 region->end_pfn << PAGE_SHIFT);
717 - 1);
718 717
719 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) 718 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
720 if (pfn_valid(pfn)) { 719 if (pfn_valid(pfn)) {
@@ -813,8 +812,7 @@ unsigned int snapshot_additional_pages(struct zone *zone)
813 unsigned int res; 812 unsigned int res;
814 813
815 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 814 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
816 res += DIV_ROUND_UP(res * sizeof(struct bm_block), 815 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
817 LINKED_PAGE_DATA_SIZE);
818 return 2 * res; 816 return 2 * res;
819} 817}
820 818
@@ -860,9 +858,6 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
860 PageReserved(page)) 858 PageReserved(page))
861 return NULL; 859 return NULL;
862 860
863 if (page_is_guard(page))
864 return NULL;
865
866 return page; 861 return page;
867} 862}
868 863
@@ -925,9 +920,6 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
925 && (!kernel_page_present(page) || pfn_is_nosave(pfn))) 920 && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
926 return NULL; 921 return NULL;
927 922
928 if (page_is_guard(page))
929 return NULL;
930
931 return page; 923 return page;
932} 924}
933 925
@@ -1001,20 +993,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1001 s_page = pfn_to_page(src_pfn); 993 s_page = pfn_to_page(src_pfn);
1002 d_page = pfn_to_page(dst_pfn); 994 d_page = pfn_to_page(dst_pfn);
1003 if (PageHighMem(s_page)) { 995 if (PageHighMem(s_page)) {
1004 src = kmap_atomic(s_page); 996 src = kmap_atomic(s_page, KM_USER0);
1005 dst = kmap_atomic(d_page); 997 dst = kmap_atomic(d_page, KM_USER1);
1006 do_copy_page(dst, src); 998 do_copy_page(dst, src);
1007 kunmap_atomic(dst); 999 kunmap_atomic(dst, KM_USER1);
1008 kunmap_atomic(src); 1000 kunmap_atomic(src, KM_USER0);
1009 } else { 1001 } else {
1010 if (PageHighMem(d_page)) { 1002 if (PageHighMem(d_page)) {
1011 /* Page pointed to by src may contain some kernel 1003 /* Page pointed to by src may contain some kernel
1012 * data modified by kmap_atomic() 1004 * data modified by kmap_atomic()
1013 */ 1005 */
1014 safe_copy_page(buffer, s_page); 1006 safe_copy_page(buffer, s_page);
1015 dst = kmap_atomic(d_page); 1007 dst = kmap_atomic(d_page, KM_USER0);
1016 copy_page(dst, buffer); 1008 copy_page(dst, buffer);
1017 kunmap_atomic(dst); 1009 kunmap_atomic(dst, KM_USER0);
1018 } else { 1010 } else {
1019 safe_copy_page(page_address(d_page), s_page); 1011 safe_copy_page(page_address(d_page), s_page);
1020 } 1012 }
@@ -1347,9 +1339,6 @@ int hibernate_preallocate_memory(void)
1347 count += highmem; 1339 count += highmem;
1348 count -= totalreserve_pages; 1340 count -= totalreserve_pages;
1349 1341
1350 /* Add number of pages required for page keys (s390 only). */
1351 size += page_key_additional_pages(saveable);
1352
1353 /* Compute the maximum number of saveable pages to leave in memory. */ 1342 /* Compute the maximum number of saveable pages to leave in memory. */
1354 max_size = (count - (size + PAGES_FOR_IO)) / 2 1343 max_size = (count - (size + PAGES_FOR_IO)) / 2
1355 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); 1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
@@ -1673,8 +1662,6 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1673 buf[j] = memory_bm_next_pfn(bm); 1662 buf[j] = memory_bm_next_pfn(bm);
1674 if (unlikely(buf[j] == BM_END_OF_MAP)) 1663 if (unlikely(buf[j] == BM_END_OF_MAP))
1675 break; 1664 break;
1676 /* Save page key for data page (s390 only). */
1677 page_key_read(buf + j);
1678 } 1665 }
1679} 1666}
1680 1667
@@ -1729,9 +1716,9 @@ int snapshot_read_next(struct snapshot_handle *handle)
1729 */ 1716 */
1730 void *kaddr; 1717 void *kaddr;
1731 1718
1732 kaddr = kmap_atomic(page); 1719 kaddr = kmap_atomic(page, KM_USER0);
1733 copy_page(buffer, kaddr); 1720 copy_page(buffer, kaddr);
1734 kunmap_atomic(kaddr); 1721 kunmap_atomic(kaddr, KM_USER0);
1735 handle->buffer = buffer; 1722 handle->buffer = buffer;
1736 } else { 1723 } else {
1737 handle->buffer = page_address(page); 1724 handle->buffer = page_address(page);
@@ -1834,9 +1821,6 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1834 if (unlikely(buf[j] == BM_END_OF_MAP)) 1821 if (unlikely(buf[j] == BM_END_OF_MAP))
1835 break; 1822 break;
1836 1823
1837 /* Extract and buffer page key for data page (s390 only). */
1838 page_key_memorize(buf + j);
1839
1840 if (memory_bm_pfn_present(bm, buf[j])) 1824 if (memory_bm_pfn_present(bm, buf[j]))
1841 memory_bm_set_bit(bm, buf[j]); 1825 memory_bm_set_bit(bm, buf[j]);
1842 else 1826 else
@@ -2015,9 +1999,9 @@ static void copy_last_highmem_page(void)
2015 if (last_highmem_page) { 1999 if (last_highmem_page) {
2016 void *dst; 2000 void *dst;
2017 2001
2018 dst = kmap_atomic(last_highmem_page); 2002 dst = kmap_atomic(last_highmem_page, KM_USER0);
2019 copy_page(dst, buffer); 2003 copy_page(dst, buffer);
2020 kunmap_atomic(dst); 2004 kunmap_atomic(dst, KM_USER0);
2021 last_highmem_page = NULL; 2005 last_highmem_page = NULL;
2022 } 2006 }
2023} 2007}
@@ -2239,11 +2223,6 @@ int snapshot_write_next(struct snapshot_handle *handle)
2239 if (error) 2223 if (error)
2240 return error; 2224 return error;
2241 2225
2242 /* Allocate buffer for page keys. */
2243 error = page_key_alloc(nr_copy_pages);
2244 if (error)
2245 return error;
2246
2247 } else if (handle->cur <= nr_meta_pages + 1) { 2226 } else if (handle->cur <= nr_meta_pages + 1) {
2248 error = unpack_orig_pfns(buffer, &copy_bm); 2227 error = unpack_orig_pfns(buffer, &copy_bm);
2249 if (error) 2228 if (error)
@@ -2264,8 +2243,6 @@ int snapshot_write_next(struct snapshot_handle *handle)
2264 } 2243 }
2265 } else { 2244 } else {
2266 copy_last_highmem_page(); 2245 copy_last_highmem_page();
2267 /* Restore page key for data page (s390 only). */
2268 page_key_write(handle->buffer);
2269 handle->buffer = get_buffer(&orig_bm, &ca); 2246 handle->buffer = get_buffer(&orig_bm, &ca);
2270 if (IS_ERR(handle->buffer)) 2247 if (IS_ERR(handle->buffer))
2271 return PTR_ERR(handle->buffer); 2248 return PTR_ERR(handle->buffer);
@@ -2287,9 +2264,6 @@ int snapshot_write_next(struct snapshot_handle *handle)
2287void snapshot_write_finalize(struct snapshot_handle *handle) 2264void snapshot_write_finalize(struct snapshot_handle *handle)
2288{ 2265{
2289 copy_last_highmem_page(); 2266 copy_last_highmem_page();
2290 /* Restore page key for data page (s390 only). */
2291 page_key_write(handle->buffer);
2292 page_key_free();
2293 /* Free only if we have loaded the image entirely */ 2267 /* Free only if we have loaded the image entirely */
2294 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { 2268 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2295 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2269 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
@@ -2310,13 +2284,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2310{ 2284{
2311 void *kaddr1, *kaddr2; 2285 void *kaddr1, *kaddr2;
2312 2286
2313 kaddr1 = kmap_atomic(p1); 2287 kaddr1 = kmap_atomic(p1, KM_USER0);
2314 kaddr2 = kmap_atomic(p2); 2288 kaddr2 = kmap_atomic(p2, KM_USER1);
2315 copy_page(buf, kaddr1); 2289 copy_page(buf, kaddr1);
2316 copy_page(kaddr1, kaddr2); 2290 copy_page(kaddr1, kaddr2);
2317 copy_page(kaddr2, buf); 2291 copy_page(kaddr2, buf);
2318 kunmap_atomic(kaddr2); 2292 kunmap_atomic(kaddr2, KM_USER1);
2319 kunmap_atomic(kaddr1); 2293 kunmap_atomic(kaddr1, KM_USER0);
2320} 2294}
2321 2295
2322/** 2296/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27d..a6f6e3114a2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -21,15 +21,16 @@
21#include <linux/list.h> 21#include <linux/list.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/export.h>
25#include <linux/suspend.h> 24#include <linux/suspend.h>
26#include <linux/syscore_ops.h> 25#include <linux/syscore_ops.h>
27#include <linux/ftrace.h>
28#include <trace/events/power.h> 26#include <trace/events/power.h>
29 27
30#include "power.h" 28#include "power.h"
31 29
32const char *const pm_states[PM_SUSPEND_MAX] = { 30const char *const pm_states[PM_SUSPEND_MAX] = {
31#ifdef CONFIG_EARLYSUSPEND
32 [PM_SUSPEND_ON] = "on",
33#endif
33 [PM_SUSPEND_STANDBY] = "standby", 34 [PM_SUSPEND_STANDBY] = "standby",
34 [PM_SUSPEND_MEM] = "mem", 35 [PM_SUSPEND_MEM] = "mem",
35}; 36};
@@ -37,14 +38,14 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
37static const struct platform_suspend_ops *suspend_ops; 38static const struct platform_suspend_ops *suspend_ops;
38 39
39/** 40/**
40 * suspend_set_ops - Set the global suspend method table. 41 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Suspend operations to use. 42 * @ops: Pointer to ops structure.
42 */ 43 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 44void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 45{
45 lock_system_sleep(); 46 mutex_lock(&pm_mutex);
46 suspend_ops = ops; 47 suspend_ops = ops;
47 unlock_system_sleep(); 48 mutex_unlock(&pm_mutex);
48} 49}
49EXPORT_SYMBOL_GPL(suspend_set_ops); 50EXPORT_SYMBOL_GPL(suspend_set_ops);
50 51
@@ -58,11 +59,11 @@ bool valid_state(suspend_state_t state)
58} 59}
59 60
60/** 61/**
61 * suspend_valid_only_mem - Generic memory-only valid callback. 62 * suspend_valid_only_mem - generic memory-only valid callback
62 * 63 *
63 * Platform drivers that implement mem suspend only and only need to check for 64 * Platform drivers that implement mem suspend only and only need
64 * that in their .valid() callback can use this instead of rolling their own 65 * to check for that in their .valid callback can use this instead
65 * .valid() callback. 66 * of rolling their own .valid callback.
66 */ 67 */
67int suspend_valid_only_mem(suspend_state_t state) 68int suspend_valid_only_mem(suspend_state_t state)
68{ 69{
@@ -83,11 +84,10 @@ static int suspend_test(int level)
83} 84}
84 85
85/** 86/**
86 * suspend_prepare - Prepare for entering system sleep state. 87 * suspend_prepare - Do prep work before entering low-power state.
87 * 88 *
88 * Common code run for every system sleep state that can be entered (except for 89 * This is common code that is called for each state that we're entering.
89 * hibernation). Run suspend notifiers, allocate the "suspend" console and 90 * Run suspend notifiers, allocate a console and stop all processes.
90 * freeze processes.
91 */ 91 */
92static int suspend_prepare(void) 92static int suspend_prepare(void)
93{ 93{
@@ -102,12 +102,16 @@ static int suspend_prepare(void)
102 if (error) 102 if (error)
103 goto Finish; 103 goto Finish;
104 104
105 error = usermodehelper_disable();
106 if (error)
107 goto Finish;
108
105 error = suspend_freeze_processes(); 109 error = suspend_freeze_processes();
106 if (!error) 110 if (!error)
107 return 0; 111 return 0;
108 112
109 suspend_stats.failed_freeze++; 113 suspend_thaw_processes();
110 dpm_save_failed_step(SUSPEND_FREEZE); 114 usermodehelper_enable();
111 Finish: 115 Finish:
112 pm_notifier_call_chain(PM_POST_SUSPEND); 116 pm_notifier_call_chain(PM_POST_SUSPEND);
113 pm_restore_console(); 117 pm_restore_console();
@@ -127,9 +131,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
127} 131}
128 132
129/** 133/**
130 * suspend_enter - Make the system enter the given sleep state. 134 * suspend_enter - enter the desired system sleep state.
131 * @state: System sleep state to enter. 135 * @state: State to enter
132 * @wakeup: Returns information that the sleep state should not be re-entered. 136 * @wakeup: Returns information that suspend should not be entered again.
133 * 137 *
134 * This function should be called after devices have been suspended. 138 * This function should be called after devices have been suspended.
135 */ 139 */
@@ -143,7 +147,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
143 goto Platform_finish; 147 goto Platform_finish;
144 } 148 }
145 149
146 error = dpm_suspend_end(PMSG_SUSPEND); 150 error = dpm_suspend_noirq(PMSG_SUSPEND);
147 if (error) { 151 if (error) {
148 printk(KERN_ERR "PM: Some devices failed to power down\n"); 152 printk(KERN_ERR "PM: Some devices failed to power down\n");
149 goto Platform_finish; 153 goto Platform_finish;
@@ -185,7 +189,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
185 if (suspend_ops->wake) 189 if (suspend_ops->wake)
186 suspend_ops->wake(); 190 suspend_ops->wake();
187 191
188 dpm_resume_start(PMSG_RESUME); 192 dpm_resume_noirq(PMSG_RESUME);
189 193
190 Platform_finish: 194 Platform_finish:
191 if (suspend_ops->finish) 195 if (suspend_ops->finish)
@@ -195,8 +199,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
195} 199}
196 200
197/** 201/**
198 * suspend_devices_and_enter - Suspend devices and enter system sleep state. 202 * suspend_devices_and_enter - suspend devices and enter the desired system
199 * @state: System sleep state to enter. 203 * sleep state.
204 * @state: state to enter
200 */ 205 */
201int suspend_devices_and_enter(suspend_state_t state) 206int suspend_devices_and_enter(suspend_state_t state)
202{ 207{
@@ -213,7 +218,6 @@ int suspend_devices_and_enter(suspend_state_t state)
213 goto Close; 218 goto Close;
214 } 219 }
215 suspend_console(); 220 suspend_console();
216 ftrace_stop();
217 suspend_test_start(); 221 suspend_test_start();
218 error = dpm_suspend_start(PMSG_SUSPEND); 222 error = dpm_suspend_start(PMSG_SUSPEND);
219 if (error) { 223 if (error) {
@@ -233,7 +237,6 @@ int suspend_devices_and_enter(suspend_state_t state)
233 suspend_test_start(); 237 suspend_test_start();
234 dpm_resume_end(PMSG_RESUME); 238 dpm_resume_end(PMSG_RESUME);
235 suspend_test_finish("resume devices"); 239 suspend_test_finish("resume devices");
236 ftrace_start();
237 resume_console(); 240 resume_console();
238 Close: 241 Close:
239 if (suspend_ops->end) 242 if (suspend_ops->end)
@@ -248,27 +251,30 @@ int suspend_devices_and_enter(suspend_state_t state)
248} 251}
249 252
250/** 253/**
251 * suspend_finish - Clean up before finishing the suspend sequence. 254 * suspend_finish - Do final work before exiting suspend sequence.
252 * 255 *
253 * Call platform code to clean up, restart processes, and free the console that 256 * Call platform code to clean up, restart processes, and free the
254 * we've allocated. This routine is not called for hibernation. 257 * console that we've allocated. This is not called for suspend-to-disk.
255 */ 258 */
256static void suspend_finish(void) 259static void suspend_finish(void)
257{ 260{
258 suspend_thaw_processes(); 261 suspend_thaw_processes();
262 usermodehelper_enable();
259 pm_notifier_call_chain(PM_POST_SUSPEND); 263 pm_notifier_call_chain(PM_POST_SUSPEND);
260 pm_restore_console(); 264 pm_restore_console();
261} 265}
262 266
263/** 267/**
264 * enter_state - Do common work needed to enter system sleep state. 268 * enter_state - Do common work of entering low-power state.
265 * @state: System sleep state to enter. 269 * @state: pm_state structure for state we're entering.
266 * 270 *
267 * Make sure that no one else is trying to put the system into a sleep state. 271 * Make sure we're the only ones trying to enter a sleep state. Fail
 268 * Fail if that's not the case. Otherwise, prepare for system suspend, make the 272 * if someone has beaten us to it, since we don't want anything weird to
269 * system enter the given sleep state and clean up after wakeup. 273 * happen when we wake up.
 274 * Then, do the setup for suspend, enter the state, and clean up (after
275 * we've woken up).
270 */ 276 */
271static int enter_state(suspend_state_t state) 277int enter_state(suspend_state_t state)
272{ 278{
273 int error; 279 int error;
274 280
@@ -304,26 +310,16 @@ static int enter_state(suspend_state_t state)
304} 310}
305 311
306/** 312/**
307 * pm_suspend - Externally visible function for suspending the system. 313 * pm_suspend - Externally visible function for suspending system.
308 * @state: System sleep state to enter. 314 * @state: Enumerated value of state to enter.
309 * 315 *
310 * Check if the value of @state represents one of the supported states, 316 * Determine whether or not value is within range, get state
311 * execute enter_state() and update system suspend statistics. 317 * structure, and enter (above).
312 */ 318 */
313int pm_suspend(suspend_state_t state) 319int pm_suspend(suspend_state_t state)
314{ 320{
315 int error; 321 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
316 322 return enter_state(state);
317 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) 323 return -EINVAL;
318 return -EINVAL;
319
320 error = enter_state(state);
321 if (error) {
322 suspend_stats.fail++;
323 dpm_save_failed_errno(error);
324 } else {
325 suspend_stats.success++;
326 }
327 return error;
328} 324}
329EXPORT_SYMBOL(pm_suspend); 325EXPORT_SYMBOL(pm_suspend);
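As a usage note (not part of the patch), pm_suspend() is exported, so platform code can request suspend-to-RAM directly with one of the suspend_state_t values checked above; a minimal hedged sketch:

#include <linux/suspend.h>
#include <linux/printk.h>

/* Illustrative helper: ask the PM core for suspend-to-RAM and report errors. */
static int example_enter_mem_sleep(void)
{
	int error = pm_suspend(PM_SUSPEND_MEM);

	if (error)
		pr_err("example: suspend to mem failed: %d\n", error);
	return error;
}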
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c33ed20041..7c97c3a0eee 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> 9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
10 * 10 *
11 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
12 * 12 *
@@ -18,6 +18,7 @@
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/genhd.h> 19#include <linux/genhd.h>
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/buffer_head.h>
21#include <linux/bio.h> 22#include <linux/bio.h>
22#include <linux/blkdev.h> 23#include <linux/blkdev.h>
23#include <linux/swap.h> 24#include <linux/swap.h>
@@ -26,10 +27,6 @@
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/lzo.h> 28#include <linux/lzo.h>
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
29#include <linux/cpumask.h>
30#include <linux/atomic.h>
31#include <linux/kthread.h>
32#include <linux/crc32.h>
33 30
34#include "power.h" 31#include "power.h"
35 32
@@ -46,38 +43,17 @@
46 * allocated and populated one at a time, so we only need one memory 43 * allocated and populated one at a time, so we only need one memory
47 * page to set up the entire structure. 44 * page to set up the entire structure.
48 * 45 *
49 * During resume we pick up all swap_map_page structures into a list. 46 * During resume we also only need to use one swap_map_page structure
47 * at a time.
50 */ 48 */
51 49
52#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) 50#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
53 51
54/*
55 * Number of free pages that are not high.
56 */
57static inline unsigned long low_free_pages(void)
58{
59 return nr_free_pages() - nr_free_highpages();
60}
61
62/*
63 * Number of pages required to be kept free while writing the image. Always
64 * half of all available low pages before the writing starts.
65 */
66static inline unsigned long reqd_free_pages(void)
67{
68 return low_free_pages() / 2;
69}
70
71struct swap_map_page { 52struct swap_map_page {
72 sector_t entries[MAP_PAGE_ENTRIES]; 53 sector_t entries[MAP_PAGE_ENTRIES];
73 sector_t next_swap; 54 sector_t next_swap;
74}; 55};
75 56
76struct swap_map_page_list {
77 struct swap_map_page *map;
78 struct swap_map_page_list *next;
79};
80
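Conceptually, the swap map is a singly linked chain of sector arrays: resume reads a map page, consumes its entries, then follows next_swap. Below is a pseudocode-level sketch, not the kernel's actual resume path; read_map_page() and handle_data_sector() are hypothetical callbacks standing in for the real block-I/O and page-restore steps.

/* Hypothetical walk over nr_sectors data sectors recorded in the chain. */
static int example_walk_swap_map(sector_t first_map, unsigned long nr_sectors,
				 int (*read_map_page)(sector_t, struct swap_map_page *),
				 int (*handle_data_sector)(sector_t))
{
	struct swap_map_page map;
	sector_t next = first_map;
	unsigned int k = MAP_PAGE_ENTRIES;	/* force a read on the first pass */
	int error;

	while (nr_sectors--) {
		if (k >= MAP_PAGE_ENTRIES) {	/* current map page exhausted */
			error = read_map_page(next, &map);
			if (error)
				return error;
			next = map.next_swap;
			k = 0;
		}
		error = handle_data_sector(map.entries[k++]);
		if (error)
			return error;
	}
	return 0;
}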
81/** 57/**
82 * The swap_map_handle structure is used for handling swap in 58 * The swap_map_handle structure is used for handling swap in
83 * a file-alike way 59 * a file-alike way
@@ -85,18 +61,13 @@ struct swap_map_page_list {
85 61
86struct swap_map_handle { 62struct swap_map_handle {
87 struct swap_map_page *cur; 63 struct swap_map_page *cur;
88 struct swap_map_page_list *maps;
89 sector_t cur_swap; 64 sector_t cur_swap;
90 sector_t first_sector; 65 sector_t first_sector;
91 unsigned int k; 66 unsigned int k;
92 unsigned long reqd_free_pages;
93 u32 crc32;
94}; 67};
95 68
96struct swsusp_header { 69struct swsusp_header {
97 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - 70 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
98 sizeof(u32)];
99 u32 crc32;
100 sector_t image; 71 sector_t image;
101 unsigned int flags; /* Flags to pass to the "boot" kernel */ 72 unsigned int flags; /* Flags to pass to the "boot" kernel */
102 char orig_sig[10]; 73 char orig_sig[10];
@@ -126,7 +97,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
126 97
127 /* Figure out where to put the new node */ 98 /* Figure out where to put the new node */
128 while (*new) { 99 while (*new) {
129 ext = rb_entry(*new, struct swsusp_extent, node); 100 ext = container_of(*new, struct swsusp_extent, node);
130 parent = *new; 101 parent = *new;
131 if (swap_offset < ext->start) { 102 if (swap_offset < ext->start) {
132 /* Try to merge */ 103 /* Try to merge */
@@ -228,8 +199,6 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
228 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); 199 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
229 swsusp_header->image = handle->first_sector; 200 swsusp_header->image = handle->first_sector;
230 swsusp_header->flags = flags; 201 swsusp_header->flags = flags;
231 if (flags & SF_CRC32_MODE)
232 swsusp_header->crc32 = handle->crc32;
233 error = hib_bio_write_page(swsusp_resume_block, 202 error = hib_bio_write_page(swsusp_resume_block,
234 swsusp_header, NULL); 203 swsusp_header, NULL);
235 } else { 204 } else {
@@ -276,30 +245,18 @@ static int swsusp_swap_check(void)
276static int write_page(void *buf, sector_t offset, struct bio **bio_chain) 245static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
277{ 246{
278 void *src; 247 void *src;
279 int ret;
280 248
281 if (!offset) 249 if (!offset)
282 return -ENOSPC; 250 return -ENOSPC;
283 251
284 if (bio_chain) { 252 if (bio_chain) {
285 src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | 253 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
286 __GFP_NORETRY);
287 if (src) { 254 if (src) {
288 copy_page(src, buf); 255 copy_page(src, buf);
289 } else { 256 } else {
290 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ 257 WARN_ON_ONCE(1);
291 if (ret) 258 bio_chain = NULL; /* Go synchronous */
292 return ret; 259 src = buf;
293 src = (void *)__get_free_page(__GFP_WAIT |
294 __GFP_NOWARN |
295 __GFP_NORETRY);
296 if (src) {
297 copy_page(src, buf);
298 } else {
299 WARN_ON_ONCE(1);
300 bio_chain = NULL; /* Go synchronous */
301 src = buf;
302 }
303 } 260 }
304 } else { 261 } else {
305 src = buf; 262 src = buf;
@@ -336,7 +293,6 @@ static int get_swap_writer(struct swap_map_handle *handle)
336 goto err_rel; 293 goto err_rel;
337 } 294 }
338 handle->k = 0; 295 handle->k = 0;
339 handle->reqd_free_pages = reqd_free_pages();
340 handle->first_sector = handle->cur_swap; 296 handle->first_sector = handle->cur_swap;
341 return 0; 297 return 0;
342err_rel: 298err_rel:
@@ -360,27 +316,19 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
360 return error; 316 return error;
361 handle->cur->entries[handle->k++] = offset; 317 handle->cur->entries[handle->k++] = offset;
362 if (handle->k >= MAP_PAGE_ENTRIES) { 318 if (handle->k >= MAP_PAGE_ENTRIES) {
319 error = hib_wait_on_bio_chain(bio_chain);
320 if (error)
321 goto out;
363 offset = alloc_swapdev_block(root_swap); 322 offset = alloc_swapdev_block(root_swap);
364 if (!offset) 323 if (!offset)
365 return -ENOSPC; 324 return -ENOSPC;
366 handle->cur->next_swap = offset; 325 handle->cur->next_swap = offset;
367 error = write_page(handle->cur, handle->cur_swap, bio_chain); 326 error = write_page(handle->cur, handle->cur_swap, NULL);
368 if (error) 327 if (error)
369 goto out; 328 goto out;
370 clear_page(handle->cur); 329 clear_page(handle->cur);
371 handle->cur_swap = offset; 330 handle->cur_swap = offset;
372 handle->k = 0; 331 handle->k = 0;
373
374 if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
375 error = hib_wait_on_bio_chain(bio_chain);
376 if (error)
377 goto out;
378 /*
379 * Recalculate the number of required free pages, to
380 * make sure we never take more than half.
381 */
382 handle->reqd_free_pages = reqd_free_pages();
383 }
384 } 332 }
385 out: 333 out:
386 return error; 334 return error;
@@ -424,14 +372,6 @@ static int swap_writer_finish(struct swap_map_handle *handle,
424 LZO_HEADER, PAGE_SIZE) 372 LZO_HEADER, PAGE_SIZE)
425#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) 373#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
426 374
427/* Maximum number of threads for compression/decompression. */
428#define LZO_THREADS 3
429
430/* Minimum/maximum number of pages for read buffering. */
431#define LZO_MIN_RD_PAGES 1024
432#define LZO_MAX_RD_PAGES 8192
433
434
435/** 375/**
436 * save_image - save the suspend image data 376 * save_image - save the suspend image data
437 */ 377 */
@@ -448,9 +388,9 @@ static int save_image(struct swap_map_handle *handle,
448 struct timeval start; 388 struct timeval start;
449 struct timeval stop; 389 struct timeval stop;
450 390
451 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", 391 printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ",
452 nr_to_write); 392 nr_to_write);
453 m = nr_to_write / 10; 393 m = nr_to_write / 100;
454 if (!m) 394 if (!m)
455 m = 1; 395 m = 1;
456 nr_pages = 0; 396 nr_pages = 0;
@@ -464,8 +404,7 @@ static int save_image(struct swap_map_handle *handle,
464 if (ret) 404 if (ret)
465 break; 405 break;
466 if (!(nr_pages % m)) 406 if (!(nr_pages % m))
467 printk(KERN_INFO "PM: Image saving progress: %3d%%\n", 407 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
468 nr_pages / m * 10);
469 nr_pages++; 408 nr_pages++;
470 } 409 }
471 err2 = hib_wait_on_bio_chain(&bio); 410 err2 = hib_wait_on_bio_chain(&bio);
@@ -473,97 +412,13 @@ static int save_image(struct swap_map_handle *handle,
473 if (!ret) 412 if (!ret)
474 ret = err2; 413 ret = err2;
475 if (!ret) 414 if (!ret)
476 printk(KERN_INFO "PM: Image saving done.\n"); 415 printk(KERN_CONT "\b\b\b\bdone\n");
416 else
417 printk(KERN_CONT "\n");
477 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 418 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
478 return ret; 419 return ret;
479} 420}
480 421
481/**
482 * Structure used for CRC32.
483 */
484struct crc_data {
485 struct task_struct *thr; /* thread */
486 atomic_t ready; /* ready to start flag */
487 atomic_t stop; /* ready to stop flag */
488 unsigned run_threads; /* nr current threads */
489 wait_queue_head_t go; /* start crc update */
490 wait_queue_head_t done; /* crc update done */
491 u32 *crc32; /* points to handle's crc32 */
492 size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
493 unsigned char *unc[LZO_THREADS]; /* uncompressed data */
494};
495
496/**
497 * CRC32 update function that runs in its own thread.
498 */
499static int crc32_threadfn(void *data)
500{
501 struct crc_data *d = data;
502 unsigned i;
503
504 while (1) {
505 wait_event(d->go, atomic_read(&d->ready) ||
506 kthread_should_stop());
507 if (kthread_should_stop()) {
508 d->thr = NULL;
509 atomic_set(&d->stop, 1);
510 wake_up(&d->done);
511 break;
512 }
513 atomic_set(&d->ready, 0);
514
515 for (i = 0; i < d->run_threads; i++)
516 *d->crc32 = crc32_le(*d->crc32,
517 d->unc[i], *d->unc_len[i]);
518 atomic_set(&d->stop, 1);
519 wake_up(&d->done);
520 }
521 return 0;
522}
523/**
524 * Structure used for LZO data compression.
525 */
526struct cmp_data {
527 struct task_struct *thr; /* thread */
528 atomic_t ready; /* ready to start flag */
529 atomic_t stop; /* ready to stop flag */
530 int ret; /* return code */
531 wait_queue_head_t go; /* start compression */
532 wait_queue_head_t done; /* compression done */
533 size_t unc_len; /* uncompressed length */
534 size_t cmp_len; /* compressed length */
535 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
536 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
537 unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
538};
539
540/**
541 * Compression function that runs in its own thread.
542 */
543static int lzo_compress_threadfn(void *data)
544{
545 struct cmp_data *d = data;
546
547 while (1) {
548 wait_event(d->go, atomic_read(&d->ready) ||
549 kthread_should_stop());
550 if (kthread_should_stop()) {
551 d->thr = NULL;
552 d->ret = -1;
553 atomic_set(&d->stop, 1);
554 wake_up(&d->done);
555 break;
556 }
557 atomic_set(&d->ready, 0);
558
559 d->ret = lzo1x_1_compress(d->unc, d->unc_len,
560 d->cmp + LZO_HEADER, &d->cmp_len,
561 d->wrk);
562 atomic_set(&d->stop, 1);
563 wake_up(&d->done);
564 }
565 return 0;
566}
567 422
568/** 423/**
569 * save_image_lzo - Save the suspend image data compressed with LZO. 424 * save_image_lzo - Save the suspend image data compressed with LZO.
@@ -582,179 +437,98 @@ static int save_image_lzo(struct swap_map_handle *handle,
582 struct bio *bio; 437 struct bio *bio;
583 struct timeval start; 438 struct timeval start;
584 struct timeval stop; 439 struct timeval stop;
585 size_t off; 440 size_t off, unc_len, cmp_len;
586 unsigned thr, run_threads, nr_threads; 441 unsigned char *unc, *cmp, *wrk, *page;
587 unsigned char *page = NULL;
588 struct cmp_data *data = NULL;
589 struct crc_data *crc = NULL;
590
591 /*
592 * We'll limit the number of threads for compression to limit memory
593 * footprint.
594 */
595 nr_threads = num_online_cpus() - 1;
596 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
597 442
598 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 443 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
599 if (!page) { 444 if (!page) {
600 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 445 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
601 ret = -ENOMEM; 446 return -ENOMEM;
602 goto out_clean;
603 }
604
605 data = vmalloc(sizeof(*data) * nr_threads);
606 if (!data) {
607 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
608 ret = -ENOMEM;
609 goto out_clean;
610 } 447 }
611 for (thr = 0; thr < nr_threads; thr++)
612 memset(&data[thr], 0, offsetof(struct cmp_data, go));
613 448
614 crc = kmalloc(sizeof(*crc), GFP_KERNEL); 449 wrk = vmalloc(LZO1X_1_MEM_COMPRESS);
615 if (!crc) { 450 if (!wrk) {
616 printk(KERN_ERR "PM: Failed to allocate crc\n"); 451 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n");
617 ret = -ENOMEM; 452 free_page((unsigned long)page);
618 goto out_clean; 453 return -ENOMEM;
619 }
620 memset(crc, 0, offsetof(struct crc_data, go));
621
622 /*
623 * Start the compression threads.
624 */
625 for (thr = 0; thr < nr_threads; thr++) {
626 init_waitqueue_head(&data[thr].go);
627 init_waitqueue_head(&data[thr].done);
628
629 data[thr].thr = kthread_run(lzo_compress_threadfn,
630 &data[thr],
631 "image_compress/%u", thr);
632 if (IS_ERR(data[thr].thr)) {
633 data[thr].thr = NULL;
634 printk(KERN_ERR
635 "PM: Cannot start compression threads\n");
636 ret = -ENOMEM;
637 goto out_clean;
638 }
639 } 454 }
640 455
641 /* 456 unc = vmalloc(LZO_UNC_SIZE);
642 * Start the CRC32 thread. 457 if (!unc) {
643 */ 458 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
644 init_waitqueue_head(&crc->go); 459 vfree(wrk);
645 init_waitqueue_head(&crc->done); 460 free_page((unsigned long)page);
646 461 return -ENOMEM;
647 handle->crc32 = 0;
648 crc->crc32 = &handle->crc32;
649 for (thr = 0; thr < nr_threads; thr++) {
650 crc->unc[thr] = data[thr].unc;
651 crc->unc_len[thr] = &data[thr].unc_len;
652 } 462 }
653 463
654 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); 464 cmp = vmalloc(LZO_CMP_SIZE);
655 if (IS_ERR(crc->thr)) { 465 if (!cmp) {
656 crc->thr = NULL; 466 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
657 printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); 467 vfree(unc);
658 ret = -ENOMEM; 468 vfree(wrk);
659 goto out_clean; 469 free_page((unsigned long)page);
470 return -ENOMEM;
660 } 471 }
661 472
662 /*
663 * Adjust the number of required free pages after all allocations have
664 * been done. We don't want to run out of pages when writing.
665 */
666 handle->reqd_free_pages = reqd_free_pages();
667
668 printk(KERN_INFO 473 printk(KERN_INFO
669 "PM: Using %u thread(s) for compression.\n" 474 "PM: Compressing and saving image data (%u pages) ... ",
670 "PM: Compressing and saving image data (%u pages)...\n", 475 nr_to_write);
671 nr_threads, nr_to_write); 476 m = nr_to_write / 100;
672 m = nr_to_write / 10;
673 if (!m) 477 if (!m)
674 m = 1; 478 m = 1;
675 nr_pages = 0; 479 nr_pages = 0;
676 bio = NULL; 480 bio = NULL;
677 do_gettimeofday(&start); 481 do_gettimeofday(&start);
678 for (;;) { 482 for (;;) {
679 for (thr = 0; thr < nr_threads; thr++) { 483 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
680 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 484 ret = snapshot_read_next(snapshot);
681 ret = snapshot_read_next(snapshot); 485 if (ret < 0)
682 if (ret < 0) 486 goto out_finish;
683 goto out_finish; 487
684 488 if (!ret)
685 if (!ret)
686 break;
687
688 memcpy(data[thr].unc + off,
689 data_of(*snapshot), PAGE_SIZE);
690
691 if (!(nr_pages % m))
692 printk(KERN_INFO
693 "PM: Image saving progress: "
694 "%3d%%\n",
695 nr_pages / m * 10);
696 nr_pages++;
697 }
698 if (!off)
699 break; 489 break;
700 490
701 data[thr].unc_len = off; 491 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE);
702 492
703 atomic_set(&data[thr].ready, 1); 493 if (!(nr_pages % m))
704 wake_up(&data[thr].go); 494 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
495 nr_pages++;
705 } 496 }
706 497
707 if (!thr) 498 if (!off)
708 break; 499 break;
709 500
710 crc->run_threads = thr; 501 unc_len = off;
711 atomic_set(&crc->ready, 1); 502 ret = lzo1x_1_compress(unc, unc_len,
712 wake_up(&crc->go); 503 cmp + LZO_HEADER, &cmp_len, wrk);
504 if (ret < 0) {
505 printk(KERN_ERR "PM: LZO compression failed\n");
506 break;
507 }
713 508
714 for (run_threads = thr, thr = 0; thr < run_threads; thr++) { 509 if (unlikely(!cmp_len ||
715 wait_event(data[thr].done, 510 cmp_len > lzo1x_worst_compress(unc_len))) {
716 atomic_read(&data[thr].stop)); 511 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
717 atomic_set(&data[thr].stop, 0); 512 ret = -1;
513 break;
514 }
718 515
719 ret = data[thr].ret; 516 *(size_t *)cmp = cmp_len;
720 517
721 if (ret < 0) { 518 /*
722 printk(KERN_ERR "PM: LZO compression failed\n"); 519 * Given we are writing one page at a time to disk, we copy
723 goto out_finish; 520 * that much from the buffer, although the last bit will likely
724 } 521 * be smaller than full page. This is OK - we saved the length
522 * of the compressed data, so any garbage at the end will be
523 * discarded when we read it.
524 */
525 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
526 memcpy(page, cmp + off, PAGE_SIZE);
725 527
726 if (unlikely(!data[thr].cmp_len || 528 ret = swap_write_page(handle, page, &bio);
727 data[thr].cmp_len > 529 if (ret)
728 lzo1x_worst_compress(data[thr].unc_len))) {
729 printk(KERN_ERR
730 "PM: Invalid LZO compressed length\n");
731 ret = -1;
732 goto out_finish; 530 goto out_finish;
733 }
734
735 *(size_t *)data[thr].cmp = data[thr].cmp_len;
736
737 /*
738 * Given we are writing one page at a time to disk, we
739 * copy that much from the buffer, although the last
740 * bit will likely be smaller than full page. This is
741 * OK - we saved the length of the compressed data, so
742 * any garbage at the end will be discarded when we
743 * read it.
744 */
745 for (off = 0;
746 off < LZO_HEADER + data[thr].cmp_len;
747 off += PAGE_SIZE) {
748 memcpy(page, data[thr].cmp + off, PAGE_SIZE);
749
750 ret = swap_write_page(handle, page, &bio);
751 if (ret)
752 goto out_finish;
753 }
754 } 531 }
755
756 wait_event(crc->done, atomic_read(&crc->stop));
757 atomic_set(&crc->stop, 0);
758 } 532 }
759 533
760out_finish: 534out_finish:
@@ -763,21 +537,15 @@ out_finish:
763 if (!ret) 537 if (!ret)
764 ret = err2; 538 ret = err2;
765 if (!ret) 539 if (!ret)
766 printk(KERN_INFO "PM: Image saving done.\n"); 540 printk(KERN_CONT "\b\b\b\bdone\n");
541 else
542 printk(KERN_CONT "\n");
767 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 543 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
768out_clean: 544
769 if (crc) { 545 vfree(cmp);
770 if (crc->thr) 546 vfree(unc);
771 kthread_stop(crc->thr); 547 vfree(wrk);
772 kfree(crc); 548 free_page((unsigned long)page);
773 }
774 if (data) {
775 for (thr = 0; thr < nr_threads; thr++)
776 if (data[thr].thr)
777 kthread_stop(data[thr].thr);
778 vfree(data);
779 }
780 if (page) free_page((unsigned long)page);
781 549
782 return ret; 550 return ret;
783} 551}
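The comment in the write loop above spells out the framing: each compressed block is stored as a size_t length header followed by the LZO payload, written out in whole pages, and the saved length is what lets the reader throw away the padding in the last page. A minimal userspace sketch of that framing, where PAGE_SZ, HDR and the FILE-based output are illustrative stand-ins for the kernel's page size, LZO_HEADER and swap_write_page(), not the patched code itself:

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096
#define HDR     sizeof(size_t)

/* Emit one length-prefixed compressed block, padded to whole pages.
 * 'buf' must have room for HDR + cmp_len bytes, with the payload at
 * buf + HDR, mirroring the cmp buffer layout in the code above. */
static int write_block(FILE *f, unsigned char *buf, size_t cmp_len)
{
        unsigned char page[PAGE_SZ];
        size_t total = HDR + cmp_len, off, n;

        memcpy(buf, &cmp_len, HDR);             /* length header */

        for (off = 0; off < total; off += PAGE_SZ) {
                n = total - off < PAGE_SZ ? total - off : PAGE_SZ;
                memset(page, 0, sizeof(page));  /* pad the final partial page */
                memcpy(page, buf + off, n);
                if (fwrite(page, 1, sizeof(page), f) != sizeof(page))
                        return -1;              /* write error */
        }
        return 0;
}

On the read side only the stored length is trusted; anything past it in the last page is ignored.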
@@ -796,7 +564,8 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
796 564
797 pr_debug("PM: Free swap pages: %u\n", free_swap); 565 pr_debug("PM: Free swap pages: %u\n", free_swap);
798 566
799 required = PAGES_FOR_IO + nr_pages; 567 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
568 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
800 return free_swap > required; 569 return free_swap > required;
801} 570}
802 571
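The new "required" term above budgets swap for the compressed worst case rather than one swap page per image page. As a rough check, assuming 4 KiB pages and the usual definitions in this file (32 uncompressed pages per LZO block and the standard x + x/16 + 64 + 3 worst-case bound, both assumptions here), each 32-page block needs at most 35 swap pages, i.e. about 1.1 swap pages per image page on top of PAGES_FOR_IO:

#include <stdio.h>

#define PAGE_SZ    4096UL
#define UNC_PAGES  32UL
#define UNC_SIZE   (UNC_PAGES * PAGE_SZ)
#define HDR        sizeof(size_t)
#define WORST(x)   ((x) + (x) / 16 + 64 + 3)
#define CMP_PAGES  ((WORST(UNC_SIZE) + HDR + PAGE_SZ - 1) / PAGE_SZ)

int main(void)
{
        /* For nr_pages image pages the compressed path reserves roughly
         * nr_pages * CMP_PAGES / UNC_PAGES + 1 swap pages, plus PAGES_FOR_IO. */
        unsigned long nr_pages = 100000;

        printf("CMP_PAGES = %lu (about %.2f swap pages per image page)\n",
               (unsigned long)CMP_PAGES, (double)CMP_PAGES / UNC_PAGES);
        printf("swap reserved for %lu image pages: %lu\n",
               nr_pages, nr_pages * CMP_PAGES / UNC_PAGES + 1);
        return 0;
}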
@@ -824,12 +593,10 @@ int swsusp_write(unsigned int flags)
824 printk(KERN_ERR "PM: Cannot get swap writer\n"); 593 printk(KERN_ERR "PM: Cannot get swap writer\n");
825 return error; 594 return error;
826 } 595 }
827 if (flags & SF_NOCOMPRESS_MODE) { 596 if (!enough_swap(pages, flags)) {
828 if (!enough_swap(pages, flags)) { 597 printk(KERN_ERR "PM: Not enough free swap\n");
829 printk(KERN_ERR "PM: Not enough free swap\n"); 598 error = -ENOSPC;
830 error = -ENOSPC; 599 goto out_finish;
831 goto out_finish;
832 }
833 } 600 }
834 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 601 memset(&snapshot, 0, sizeof(struct snapshot_handle));
835 error = snapshot_read_next(&snapshot); 602 error = snapshot_read_next(&snapshot);
@@ -858,15 +625,8 @@ out_finish:
858 625
859static void release_swap_reader(struct swap_map_handle *handle) 626static void release_swap_reader(struct swap_map_handle *handle)
860{ 627{
861 struct swap_map_page_list *tmp; 628 if (handle->cur)
862 629 free_page((unsigned long)handle->cur);
863 while (handle->maps) {
864 if (handle->maps->map)
865 free_page((unsigned long)handle->maps->map);
866 tmp = handle->maps;
867 handle->maps = handle->maps->next;
868 kfree(tmp);
869 }
870 handle->cur = NULL; 630 handle->cur = NULL;
871} 631}
872 632
@@ -874,46 +634,22 @@ static int get_swap_reader(struct swap_map_handle *handle,
874 unsigned int *flags_p) 634 unsigned int *flags_p)
875{ 635{
876 int error; 636 int error;
877 struct swap_map_page_list *tmp, *last;
878 sector_t offset;
879 637
880 *flags_p = swsusp_header->flags; 638 *flags_p = swsusp_header->flags;
881 639
882 if (!swsusp_header->image) /* how can this happen? */ 640 if (!swsusp_header->image) /* how can this happen? */
883 return -EINVAL; 641 return -EINVAL;
884 642
885 handle->cur = NULL; 643 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
886 last = handle->maps = NULL; 644 if (!handle->cur)
887 offset = swsusp_header->image; 645 return -ENOMEM;
888 while (offset) {
889 tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
890 if (!tmp) {
891 release_swap_reader(handle);
892 return -ENOMEM;
893 }
894 memset(tmp, 0, sizeof(*tmp));
895 if (!handle->maps)
896 handle->maps = tmp;
897 if (last)
898 last->next = tmp;
899 last = tmp;
900
901 tmp->map = (struct swap_map_page *)
902 __get_free_page(__GFP_WAIT | __GFP_HIGH);
903 if (!tmp->map) {
904 release_swap_reader(handle);
905 return -ENOMEM;
906 }
907 646
908 error = hib_bio_read_page(offset, tmp->map, NULL); 647 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
909 if (error) { 648 if (error) {
910 release_swap_reader(handle); 649 release_swap_reader(handle);
911 return error; 650 return error;
912 }
913 offset = tmp->map->next_swap;
914 } 651 }
915 handle->k = 0; 652 handle->k = 0;
916 handle->cur = handle->maps->map;
917 return 0; 653 return 0;
918} 654}
919 655
@@ -922,7 +658,6 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
922{ 658{
923 sector_t offset; 659 sector_t offset;
924 int error; 660 int error;
925 struct swap_map_page_list *tmp;
926 661
927 if (!handle->cur) 662 if (!handle->cur)
928 return -EINVAL; 663 return -EINVAL;
@@ -933,15 +668,13 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
933 if (error) 668 if (error)
934 return error; 669 return error;
935 if (++handle->k >= MAP_PAGE_ENTRIES) { 670 if (++handle->k >= MAP_PAGE_ENTRIES) {
671 error = hib_wait_on_bio_chain(bio_chain);
936 handle->k = 0; 672 handle->k = 0;
937 free_page((unsigned long)handle->maps->map); 673 offset = handle->cur->next_swap;
938 tmp = handle->maps; 674 if (!offset)
939 handle->maps = handle->maps->next;
940 kfree(tmp);
941 if (!handle->maps)
942 release_swap_reader(handle); 675 release_swap_reader(handle);
943 else 676 else if (!error)
944 handle->cur = handle->maps->map; 677 error = hib_bio_read_page(offset, handle->cur, NULL);
945 } 678 }
946 return error; 679 return error;
947} 680}
@@ -964,93 +697,49 @@ static int load_image(struct swap_map_handle *handle,
964 unsigned int nr_to_read) 697 unsigned int nr_to_read)
965{ 698{
966 unsigned int m; 699 unsigned int m;
967 int ret = 0; 700 int error = 0;
968 struct timeval start; 701 struct timeval start;
969 struct timeval stop; 702 struct timeval stop;
970 struct bio *bio; 703 struct bio *bio;
971 int err2; 704 int err2;
972 unsigned nr_pages; 705 unsigned nr_pages;
973 706
974 printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", 707 printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ",
975 nr_to_read); 708 nr_to_read);
976 m = nr_to_read / 10; 709 m = nr_to_read / 100;
977 if (!m) 710 if (!m)
978 m = 1; 711 m = 1;
979 nr_pages = 0; 712 nr_pages = 0;
980 bio = NULL; 713 bio = NULL;
981 do_gettimeofday(&start); 714 do_gettimeofday(&start);
982 for ( ; ; ) { 715 for ( ; ; ) {
983 ret = snapshot_write_next(snapshot); 716 error = snapshot_write_next(snapshot);
984 if (ret <= 0) 717 if (error <= 0)
985 break; 718 break;
986 ret = swap_read_page(handle, data_of(*snapshot), &bio); 719 error = swap_read_page(handle, data_of(*snapshot), &bio);
987 if (ret) 720 if (error)
988 break; 721 break;
989 if (snapshot->sync_read) 722 if (snapshot->sync_read)
990 ret = hib_wait_on_bio_chain(&bio); 723 error = hib_wait_on_bio_chain(&bio);
991 if (ret) 724 if (error)
992 break; 725 break;
993 if (!(nr_pages % m)) 726 if (!(nr_pages % m))
994 printk(KERN_INFO "PM: Image loading progress: %3d%%\n", 727 printk("\b\b\b\b%3d%%", nr_pages / m);
995 nr_pages / m * 10);
996 nr_pages++; 728 nr_pages++;
997 } 729 }
998 err2 = hib_wait_on_bio_chain(&bio); 730 err2 = hib_wait_on_bio_chain(&bio);
999 do_gettimeofday(&stop); 731 do_gettimeofday(&stop);
1000 if (!ret) 732 if (!error)
1001 ret = err2; 733 error = err2;
1002 if (!ret) { 734 if (!error) {
1003 printk(KERN_INFO "PM: Image loading done.\n"); 735 printk("\b\b\b\bdone\n");
1004 snapshot_write_finalize(snapshot); 736 snapshot_write_finalize(snapshot);
1005 if (!snapshot_image_loaded(snapshot)) 737 if (!snapshot_image_loaded(snapshot))
1006 ret = -ENODATA; 738 error = -ENODATA;
1007 } 739 } else
740 printk("\n");
1008 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 741 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1009 return ret; 742 return error;
1010}
1011
1012/**
1013 * Structure used for LZO data decompression.
1014 */
1015struct dec_data {
1016 struct task_struct *thr; /* thread */
1017 atomic_t ready; /* ready to start flag */
1018 atomic_t stop; /* ready to stop flag */
1019 int ret; /* return code */
1020 wait_queue_head_t go; /* start decompression */
1021 wait_queue_head_t done; /* decompression done */
1022 size_t unc_len; /* uncompressed length */
1023 size_t cmp_len; /* compressed length */
1024 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
1025 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
1026};
1027
1028/**
1029 * Decompression function that runs in its own thread.
1030 */
1031static int lzo_decompress_threadfn(void *data)
1032{
1033 struct dec_data *d = data;
1034
1035 while (1) {
1036 wait_event(d->go, atomic_read(&d->ready) ||
1037 kthread_should_stop());
1038 if (kthread_should_stop()) {
1039 d->thr = NULL;
1040 d->ret = -1;
1041 atomic_set(&d->stop, 1);
1042 wake_up(&d->done);
1043 break;
1044 }
1045 atomic_set(&d->ready, 0);
1046
1047 d->unc_len = LZO_UNC_SIZE;
1048 d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
1049 d->unc, &d->unc_len);
1050 atomic_set(&d->stop, 1);
1051 wake_up(&d->done);
1052 }
1053 return 0;
1054} 743}
1055 744
1056/** 745/**
@@ -1064,319 +753,136 @@ static int load_image_lzo(struct swap_map_handle *handle,
1064 unsigned int nr_to_read) 753 unsigned int nr_to_read)
1065{ 754{
1066 unsigned int m; 755 unsigned int m;
1067 int ret = 0; 756 int error = 0;
1068 int eof = 0;
1069 struct bio *bio; 757 struct bio *bio;
1070 struct timeval start; 758 struct timeval start;
1071 struct timeval stop; 759 struct timeval stop;
1072 unsigned nr_pages; 760 unsigned nr_pages;
1073 size_t off; 761 size_t i, off, unc_len, cmp_len;
1074 unsigned i, thr, run_threads, nr_threads; 762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
1075 unsigned ring = 0, pg = 0, ring_size = 0,
1076 have = 0, want, need, asked = 0;
1077 unsigned long read_pages = 0;
1078 unsigned char **page = NULL;
1079 struct dec_data *data = NULL;
1080 struct crc_data *crc = NULL;
1081
1082 /*
1083 * We'll limit the number of threads for decompression to limit memory
1084 * footprint.
1085 */
1086 nr_threads = num_online_cpus() - 1;
1087 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
1088
1089 page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
1090 if (!page) {
1091 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1092 ret = -ENOMEM;
1093 goto out_clean;
1094 }
1095 763
1096 data = vmalloc(sizeof(*data) * nr_threads); 764 for (i = 0; i < LZO_CMP_PAGES; i++) {
1097 if (!data) { 765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
1098 printk(KERN_ERR "PM: Failed to allocate LZO data\n"); 766 if (!page[i]) {
1099 ret = -ENOMEM; 767 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1100 goto out_clean;
1101 }
1102 for (thr = 0; thr < nr_threads; thr++)
1103 memset(&data[thr], 0, offsetof(struct dec_data, go));
1104 768
1105 crc = kmalloc(sizeof(*crc), GFP_KERNEL); 769 while (i)
1106 if (!crc) { 770 free_page((unsigned long)page[--i]);
1107 printk(KERN_ERR "PM: Failed to allocate crc\n"); 771
1108 ret = -ENOMEM; 772 return -ENOMEM;
1109 goto out_clean;
1110 }
1111 memset(crc, 0, offsetof(struct crc_data, go));
1112
1113 /*
1114 * Start the decompression threads.
1115 */
1116 for (thr = 0; thr < nr_threads; thr++) {
1117 init_waitqueue_head(&data[thr].go);
1118 init_waitqueue_head(&data[thr].done);
1119
1120 data[thr].thr = kthread_run(lzo_decompress_threadfn,
1121 &data[thr],
1122 "image_decompress/%u", thr);
1123 if (IS_ERR(data[thr].thr)) {
1124 data[thr].thr = NULL;
1125 printk(KERN_ERR
1126 "PM: Cannot start decompression threads\n");
1127 ret = -ENOMEM;
1128 goto out_clean;
1129 } 773 }
1130 } 774 }
1131 775
1132 /* 776 unc = vmalloc(LZO_UNC_SIZE);
1133 * Start the CRC32 thread. 777 if (!unc) {
1134 */ 778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
1135 init_waitqueue_head(&crc->go);
1136 init_waitqueue_head(&crc->done);
1137
1138 handle->crc32 = 0;
1139 crc->crc32 = &handle->crc32;
1140 for (thr = 0; thr < nr_threads; thr++) {
1141 crc->unc[thr] = data[thr].unc;
1142 crc->unc_len[thr] = &data[thr].unc_len;
1143 }
1144 779
1145 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); 780 for (i = 0; i < LZO_CMP_PAGES; i++)
1146 if (IS_ERR(crc->thr)) { 781 free_page((unsigned long)page[i]);
1147 crc->thr = NULL; 782
1148 printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); 783 return -ENOMEM;
1149 ret = -ENOMEM;
1150 goto out_clean;
1151 } 784 }
1152 785
1153 /* 786 cmp = vmalloc(LZO_CMP_SIZE);
1154 * Set the number of pages for read buffering. 787 if (!cmp) {
1155 * This is complete guesswork, because we'll only know the real 788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
1156 * picture once prepare_image() is called, which is much later on
1157 * during the image load phase. We'll assume the worst case and
1158 * say that none of the image pages are from high memory.
1159 */
1160 if (low_free_pages() > snapshot_get_image_size())
1161 read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
1162 read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
1163
1164 for (i = 0; i < read_pages; i++) {
1165 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1166 __GFP_WAIT | __GFP_HIGH :
1167 __GFP_WAIT | __GFP_NOWARN |
1168 __GFP_NORETRY);
1169 789
1170 if (!page[i]) { 790 vfree(unc);
1171 if (i < LZO_CMP_PAGES) { 791 for (i = 0; i < LZO_CMP_PAGES; i++)
1172 ring_size = i; 792 free_page((unsigned long)page[i]);
1173 printk(KERN_ERR 793
1174 "PM: Failed to allocate LZO pages\n"); 794 return -ENOMEM;
1175 ret = -ENOMEM;
1176 goto out_clean;
1177 } else {
1178 break;
1179 }
1180 }
1181 } 795 }
1182 want = ring_size = i;
1183 796
1184 printk(KERN_INFO 797 printk(KERN_INFO
1185 "PM: Using %u thread(s) for decompression.\n" 798 "PM: Loading and decompressing image data (%u pages) ... ",
1186 "PM: Loading and decompressing image data (%u pages)...\n", 799 nr_to_read);
1187 nr_threads, nr_to_read); 800 m = nr_to_read / 100;
1188 m = nr_to_read / 10;
1189 if (!m) 801 if (!m)
1190 m = 1; 802 m = 1;
1191 nr_pages = 0; 803 nr_pages = 0;
1192 bio = NULL; 804 bio = NULL;
1193 do_gettimeofday(&start); 805 do_gettimeofday(&start);
1194 806
1195 ret = snapshot_write_next(snapshot); 807 error = snapshot_write_next(snapshot);
1196 if (ret <= 0) 808 if (error <= 0)
1197 goto out_finish; 809 goto out_finish;
1198 810
1199 for(;;) { 811 for (;;) {
1200 for (i = 0; !eof && i < want; i++) { 812 error = swap_read_page(handle, page[0], NULL); /* sync */
1201 ret = swap_read_page(handle, page[ring], &bio); 813 if (error)
1202 if (ret) { 814 break;
1203 /*
1204 * On real read error, finish. On end of data,
1205 * set EOF flag and just exit the read loop.
1206 */
1207 if (handle->cur &&
1208 handle->cur->entries[handle->k]) {
1209 goto out_finish;
1210 } else {
1211 eof = 1;
1212 break;
1213 }
1214 }
1215 if (++ring >= ring_size)
1216 ring = 0;
1217 }
1218 asked += i;
1219 want -= i;
1220
1221 /*
1222 * We are out of data, wait for some more.
1223 */
1224 if (!have) {
1225 if (!asked)
1226 break;
1227
1228 ret = hib_wait_on_bio_chain(&bio);
1229 if (ret)
1230 goto out_finish;
1231 have += asked;
1232 asked = 0;
1233 if (eof)
1234 eof = 2;
1235 }
1236 815
1237 if (crc->run_threads) { 816 cmp_len = *(size_t *)page[0];
1238 wait_event(crc->done, atomic_read(&crc->stop)); 817 if (unlikely(!cmp_len ||
1239 atomic_set(&crc->stop, 0); 818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
1240 crc->run_threads = 0; 819 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
820 error = -1;
821 break;
1241 } 822 }
1242 823
1243 for (thr = 0; have && thr < nr_threads; thr++) { 824 for (off = PAGE_SIZE, i = 1;
1244 data[thr].cmp_len = *(size_t *)page[pg]; 825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
1245 if (unlikely(!data[thr].cmp_len || 826 error = swap_read_page(handle, page[i], &bio);
1246 data[thr].cmp_len > 827 if (error)
1247 lzo1x_worst_compress(LZO_UNC_SIZE))) {
1248 printk(KERN_ERR
1249 "PM: Invalid LZO compressed length\n");
1250 ret = -1;
1251 goto out_finish; 828 goto out_finish;
1252 } 829 }
1253
1254 need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
1255 PAGE_SIZE);
1256 if (need > have) {
1257 if (eof > 1) {
1258 ret = -1;
1259 goto out_finish;
1260 }
1261 break;
1262 }
1263 830
1264 for (off = 0; 831 error = hib_wait_on_bio_chain(&bio); /* need all data now */
1265 off < LZO_HEADER + data[thr].cmp_len; 832 if (error)
1266 off += PAGE_SIZE) { 833 goto out_finish;
1267 memcpy(data[thr].cmp + off,
1268 page[pg], PAGE_SIZE);
1269 have--;
1270 want++;
1271 if (++pg >= ring_size)
1272 pg = 0;
1273 }
1274 834
1275 atomic_set(&data[thr].ready, 1); 835 for (off = 0, i = 0;
1276 wake_up(&data[thr].go); 836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
1277 } 838 }
1278 839
1279 /* 840 unc_len = LZO_UNC_SIZE;
1280 * Wait for more data while we are decompressing. 841 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len,
1281 */ 842 unc, &unc_len);
1282 if (have < LZO_CMP_PAGES && asked) { 843 if (error < 0) {
1283 ret = hib_wait_on_bio_chain(&bio); 844 printk(KERN_ERR "PM: LZO decompression failed\n");
1284 if (ret) 845 break;
1285 goto out_finish;
1286 have += asked;
1287 asked = 0;
1288 if (eof)
1289 eof = 2;
1290 } 846 }
1291 847
1292 for (run_threads = thr, thr = 0; thr < run_threads; thr++) { 848 if (unlikely(!unc_len ||
1293 wait_event(data[thr].done, 849 unc_len > LZO_UNC_SIZE ||
1294 atomic_read(&data[thr].stop)); 850 unc_len & (PAGE_SIZE - 1))) {
1295 atomic_set(&data[thr].stop, 0); 851 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n");
852 error = -1;
853 break;
854 }
1296 855
1297 ret = data[thr].ret; 856 for (off = 0; off < unc_len; off += PAGE_SIZE) {
857 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE);
1298 858
1299 if (ret < 0) { 859 if (!(nr_pages % m))
1300 printk(KERN_ERR 860 printk("\b\b\b\b%3d%%", nr_pages / m);
1301 "PM: LZO decompression failed\n"); 861 nr_pages++;
1302 goto out_finish;
1303 }
1304 862
1305 if (unlikely(!data[thr].unc_len || 863 error = snapshot_write_next(snapshot);
1306 data[thr].unc_len > LZO_UNC_SIZE || 864 if (error <= 0)
1307 data[thr].unc_len & (PAGE_SIZE - 1))) {
1308 printk(KERN_ERR
1309 "PM: Invalid LZO uncompressed length\n");
1310 ret = -1;
1311 goto out_finish; 865 goto out_finish;
1312 }
1313
1314 for (off = 0;
1315 off < data[thr].unc_len; off += PAGE_SIZE) {
1316 memcpy(data_of(*snapshot),
1317 data[thr].unc + off, PAGE_SIZE);
1318
1319 if (!(nr_pages % m))
1320 printk(KERN_INFO
1321 "PM: Image loading progress: "
1322 "%3d%%\n",
1323 nr_pages / m * 10);
1324 nr_pages++;
1325
1326 ret = snapshot_write_next(snapshot);
1327 if (ret <= 0) {
1328 crc->run_threads = thr + 1;
1329 atomic_set(&crc->ready, 1);
1330 wake_up(&crc->go);
1331 goto out_finish;
1332 }
1333 }
1334 } 866 }
1335
1336 crc->run_threads = thr;
1337 atomic_set(&crc->ready, 1);
1338 wake_up(&crc->go);
1339 } 867 }
1340 868
1341out_finish: 869out_finish:
1342 if (crc->run_threads) {
1343 wait_event(crc->done, atomic_read(&crc->stop));
1344 atomic_set(&crc->stop, 0);
1345 }
1346 do_gettimeofday(&stop); 870 do_gettimeofday(&stop);
1347 if (!ret) { 871 if (!error) {
1348 printk(KERN_INFO "PM: Image loading done.\n"); 872 printk("\b\b\b\bdone\n");
1349 snapshot_write_finalize(snapshot); 873 snapshot_write_finalize(snapshot);
1350 if (!snapshot_image_loaded(snapshot)) 874 if (!snapshot_image_loaded(snapshot))
1351 ret = -ENODATA; 875 error = -ENODATA;
1352 if (!ret) { 876 } else
1353 if (swsusp_header->flags & SF_CRC32_MODE) { 877 printk("\n");
1354 if(handle->crc32 != swsusp_header->crc32) {
1355 printk(KERN_ERR
1356 "PM: Invalid image CRC32!\n");
1357 ret = -ENODATA;
1358 }
1359 }
1360 }
1361 }
1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 878 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1363out_clean: 879
1364 for (i = 0; i < ring_size; i++) 880 vfree(cmp);
881 vfree(unc);
882 for (i = 0; i < LZO_CMP_PAGES; i++)
1365 free_page((unsigned long)page[i]); 883 free_page((unsigned long)page[i]);
1366 if (crc) {
1367 if (crc->thr)
1368 kthread_stop(crc->thr);
1369 kfree(crc);
1370 }
1371 if (data) {
1372 for (thr = 0; thr < nr_threads; thr++)
1373 if (data[thr].thr)
1374 kthread_stop(data[thr].thr);
1375 vfree(data);
1376 }
1377 if (page) vfree(page);
1378 884
1379 return ret; 885 return error;
1380} 886}
1381 887
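The single-threaded load path above validates every block before using it: the length header must be non-zero and no larger than the LZO worst-case bound, and the decompressed length must be a whole number of pages that fits the block buffer. A compact sketch of just those checks, with the constants and the worst-case formula assumed to match the usual definitions in this file rather than taken from the patch:

#include <stddef.h>

#define PAGE_SZ    4096
#define UNC_PAGES  32
#define UNC_SIZE   (UNC_PAGES * PAGE_SZ)

/* Worst-case LZO1X expansion, as commonly defined: x + x/16 + 64 + 3. */
static size_t worst_compress(size_t x)
{
        return x + x / 16 + 64 + 3;
}

/* Sanity checks applied to each block before and after decompression. */
static int lengths_look_sane(size_t cmp_len, size_t unc_len)
{
        if (!cmp_len || cmp_len > worst_compress(UNC_SIZE))
                return 0;                       /* header is corrupted        */
        if (!unc_len || unc_len > UNC_SIZE || (unc_len & (PAGE_SZ - 1)))
                return 0;                       /* output must be whole pages */
        return 1;
}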
1382/** 888/**
@@ -1472,34 +978,6 @@ void swsusp_close(fmode_t mode)
1472 blkdev_put(hib_resume_bdev, mode); 978 blkdev_put(hib_resume_bdev, mode);
1473} 979}
1474 980
1475/**
1476 * swsusp_unmark - Unmark swsusp signature in the resume device
1477 */
1478
1479#ifdef CONFIG_SUSPEND
1480int swsusp_unmark(void)
1481{
1482 int error;
1483
1484 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
1485 if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
1486 memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
1487 error = hib_bio_write_page(swsusp_resume_block,
1488 swsusp_header, NULL);
1489 } else {
1490 printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
1491 error = -ENODEV;
1492 }
1493
1494 /*
1495 * We just returned from suspend, we don't need the image any more.
1496 */
1497 free_all_swap_pages(root_swap);
1498
1499 return error;
1500}
1501#endif
1502
1503static int swsusp_header_init(void) 981static int swsusp_header_init(void)
1504{ 982{
1505 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); 983 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4ed81e74f86..42ddbc6f0de 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -20,15 +20,37 @@
20#include <linux/swapops.h> 20#include <linux/swapops.h>
21#include <linux/pm.h> 21#include <linux/pm.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/compat.h>
24#include <linux/console.h> 23#include <linux/console.h>
25#include <linux/cpu.h> 24#include <linux/cpu.h>
26#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <scsi/scsi_scan.h>
27 27
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29 29
30#include "power.h" 30#include "power.h"
31 31
32/*
33 * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
34 * will be removed in the future. They are only preserved here for
35 * compatibility with existing userland utilities.
36 */
37#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
38#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
39
40#define PMOPS_PREPARE 1
41#define PMOPS_ENTER 2
42#define PMOPS_FINISH 3
43
44/*
45 * NOTE: The following ioctl definitions are wrong and have been replaced with
46 * correct ones. They are only preserved here for compatibility with existing
47 * userland utilities and will be removed in the future.
48 */
49#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
50#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
51#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
52#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
53
32 54
33#define SNAPSHOT_MINOR 231 55#define SNAPSHOT_MINOR 231
34 56
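The NOTE comments above keep the obsolete ioctl numbers around purely for old userland; further down, each obsolete case prints a rate-limited deprecation warning and then falls through to the case for its modern replacement. A stripped-down sketch of that dispatch pattern, with made-up request numbers and names rather than the real SNAPSHOT_* values:

#include <stdio.h>

#define OLD_REQUEST  0x03   /* hypothetical obsolete number */
#define NEW_REQUEST  0x11   /* hypothetical replacement     */

static void warn_deprecated(unsigned int cmd)
{
        /* the kernel rate-limits this with printk_ratelimit() */
        fprintf(stderr, "ioctl %#x is deprecated, update your tools\n", cmd);
}

static int dispatch(unsigned int cmd)
{
        switch (cmd) {
        case OLD_REQUEST:
                warn_deprecated(cmd);
                /* fall through to the current implementation */
        case NEW_REQUEST:
                return 0;
        default:
                return -1;
        }
}

int main(void)
{
        return dispatch(OLD_REQUEST);   /* warns, then behaves like NEW_REQUEST */
}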
@@ -48,7 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
48 struct snapshot_data *data; 70 struct snapshot_data *data;
49 int error; 71 int error;
50 72
51 lock_system_sleep(); 73 mutex_lock(&pm_mutex);
52 74
53 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 75 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
54 error = -EBUSY; 76 error = -EBUSY;
@@ -83,6 +105,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
83 * appear. 105 * appear.
84 */ 106 */
85 wait_for_device_probe(); 107 wait_for_device_probe();
108 scsi_complete_async_scans();
86 109
87 data->swap = -1; 110 data->swap = -1;
88 data->mode = O_WRONLY; 111 data->mode = O_WRONLY;
@@ -99,7 +122,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
99 data->platform_support = 0; 122 data->platform_support = 0;
100 123
101 Unlock: 124 Unlock:
102 unlock_system_sleep(); 125 mutex_unlock(&pm_mutex);
103 126
104 return error; 127 return error;
105} 128}
@@ -108,7 +131,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
108{ 131{
109 struct snapshot_data *data; 132 struct snapshot_data *data;
110 133
111 lock_system_sleep(); 134 mutex_lock(&pm_mutex);
112 135
113 swsusp_free(); 136 swsusp_free();
114 free_basic_memory_bitmaps(); 137 free_basic_memory_bitmaps();
@@ -122,7 +145,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
122 PM_POST_HIBERNATION : PM_POST_RESTORE); 145 PM_POST_HIBERNATION : PM_POST_RESTORE);
123 atomic_inc(&snapshot_device_available); 146 atomic_inc(&snapshot_device_available);
124 147
125 unlock_system_sleep(); 148 mutex_unlock(&pm_mutex);
126 149
127 return 0; 150 return 0;
128} 151}
@@ -134,7 +157,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
134 ssize_t res; 157 ssize_t res;
135 loff_t pg_offp = *offp & ~PAGE_MASK; 158 loff_t pg_offp = *offp & ~PAGE_MASK;
136 159
137 lock_system_sleep(); 160 mutex_lock(&pm_mutex);
138 161
139 data = filp->private_data; 162 data = filp->private_data;
140 if (!data->ready) { 163 if (!data->ready) {
@@ -155,7 +178,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
155 *offp += res; 178 *offp += res;
156 179
157 Unlock: 180 Unlock:
158 unlock_system_sleep(); 181 mutex_unlock(&pm_mutex);
159 182
160 return res; 183 return res;
161} 184}
@@ -167,7 +190,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
167 ssize_t res; 190 ssize_t res;
168 loff_t pg_offp = *offp & ~PAGE_MASK; 191 loff_t pg_offp = *offp & ~PAGE_MASK;
169 192
170 lock_system_sleep(); 193 mutex_lock(&pm_mutex);
171 194
172 data = filp->private_data; 195 data = filp->private_data;
173 196
@@ -184,11 +207,20 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
184 if (res > 0) 207 if (res > 0)
185 *offp += res; 208 *offp += res;
186unlock: 209unlock:
187 unlock_system_sleep(); 210 mutex_unlock(&pm_mutex);
188 211
189 return res; 212 return res;
190} 213}
191 214
215static void snapshot_deprecated_ioctl(unsigned int cmd)
216{
217 if (printk_ratelimit())
218 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
219 "be removed soon, update your suspend-to-disk "
220 "utilities\n",
221 __builtin_return_address(0), cmd);
222}
223
192static long snapshot_ioctl(struct file *filp, unsigned int cmd, 224static long snapshot_ioctl(struct file *filp, unsigned int cmd,
193 unsigned long arg) 225 unsigned long arg)
194{ 226{
@@ -219,7 +251,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
219 sys_sync(); 251 sys_sync();
220 printk("done.\n"); 252 printk("done.\n");
221 253
254 error = usermodehelper_disable();
255 if (error)
256 break;
257
222 error = freeze_processes(); 258 error = freeze_processes();
259 if (error) {
260 thaw_processes();
261 usermodehelper_enable();
262 }
223 if (!error) 263 if (!error)
224 data->frozen = 1; 264 data->frozen = 1;
225 break; 265 break;
@@ -229,9 +269,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
229 break; 269 break;
230 pm_restore_gfp_mask(); 270 pm_restore_gfp_mask();
231 thaw_processes(); 271 thaw_processes();
272 usermodehelper_enable();
232 data->frozen = 0; 273 data->frozen = 0;
233 break; 274 break;
234 275
276 case SNAPSHOT_ATOMIC_SNAPSHOT:
277 snapshot_deprecated_ioctl(cmd);
235 case SNAPSHOT_CREATE_IMAGE: 278 case SNAPSHOT_CREATE_IMAGE:
236 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 279 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
237 error = -EPERM; 280 error = -EPERM;
@@ -239,11 +282,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
239 } 282 }
240 pm_restore_gfp_mask(); 283 pm_restore_gfp_mask();
241 error = hibernation_snapshot(data->platform_support); 284 error = hibernation_snapshot(data->platform_support);
242 if (!error) { 285 if (!error)
243 error = put_user(in_suspend, (int __user *)arg); 286 error = put_user(in_suspend, (int __user *)arg);
244 data->ready = !freezer_test_done && !error; 287 if (!error)
245 freezer_test_done = false; 288 data->ready = 1;
246 }
247 break; 289 break;
248 290
249 case SNAPSHOT_ATOMIC_RESTORE: 291 case SNAPSHOT_ATOMIC_RESTORE:
@@ -260,17 +302,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
260 swsusp_free(); 302 swsusp_free();
261 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 303 memset(&data->handle, 0, sizeof(struct snapshot_handle));
262 data->ready = 0; 304 data->ready = 0;
263 /*
264 * It is necessary to thaw kernel threads here, because
265 * SNAPSHOT_CREATE_IMAGE may be invoked directly after
266 * SNAPSHOT_FREE. In that case, if kernel threads were not
267 * thawed, the preallocation of memory carried out by
268 * hibernation_snapshot() might run into problems (i.e. it
269 * might fail or even deadlock).
270 */
271 thaw_kernel_threads();
272 break; 305 break;
273 306
307 case SNAPSHOT_SET_IMAGE_SIZE:
308 snapshot_deprecated_ioctl(cmd);
274 case SNAPSHOT_PREF_IMAGE_SIZE: 309 case SNAPSHOT_PREF_IMAGE_SIZE:
275 image_size = arg; 310 image_size = arg;
276 break; 311 break;
@@ -285,12 +320,16 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
285 error = put_user(size, (loff_t __user *)arg); 320 error = put_user(size, (loff_t __user *)arg);
286 break; 321 break;
287 322
323 case SNAPSHOT_AVAIL_SWAP:
324 snapshot_deprecated_ioctl(cmd);
288 case SNAPSHOT_AVAIL_SWAP_SIZE: 325 case SNAPSHOT_AVAIL_SWAP_SIZE:
289 size = count_swap_pages(data->swap, 1); 326 size = count_swap_pages(data->swap, 1);
290 size <<= PAGE_SHIFT; 327 size <<= PAGE_SHIFT;
291 error = put_user(size, (loff_t __user *)arg); 328 error = put_user(size, (loff_t __user *)arg);
292 break; 329 break;
293 330
331 case SNAPSHOT_GET_SWAP_PAGE:
332 snapshot_deprecated_ioctl(cmd);
294 case SNAPSHOT_ALLOC_SWAP_PAGE: 333 case SNAPSHOT_ALLOC_SWAP_PAGE:
295 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 334 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
296 error = -ENODEV; 335 error = -ENODEV;
@@ -313,6 +352,27 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
313 free_all_swap_pages(data->swap); 352 free_all_swap_pages(data->swap);
314 break; 353 break;
315 354
355 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
356 snapshot_deprecated_ioctl(cmd);
357 if (!swsusp_swap_in_use()) {
358 /*
359 * User space encodes device types as two-byte values,
360 * so we need to recode them
361 */
362 if (old_decode_dev(arg)) {
363 data->swap = swap_type_of(old_decode_dev(arg),
364 0, NULL);
365 if (data->swap < 0)
366 error = -ENODEV;
367 } else {
368 data->swap = -1;
369 error = -EINVAL;
370 }
371 } else {
372 error = -EPERM;
373 }
374 break;
375
316 case SNAPSHOT_S2RAM: 376 case SNAPSHOT_S2RAM:
317 if (!data->frozen) { 377 if (!data->frozen) {
318 error = -EPERM; 378 error = -EPERM;
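The SNAPSHOT_SET_SWAP_FILE case above takes the legacy 16-bit device number (major in the high byte, minor in the low byte) and re-decodes it with old_decode_dev() before looking up the swap type. A tiny illustration of that packing, with an arbitrarily chosen example value:

#include <stdio.h>

/* Split a legacy 16-bit dev_t: major in bits 8..15, minor in bits 0..7. */
static void split_old_dev(unsigned int val, unsigned int *major, unsigned int *minor)
{
        *major = (val >> 8) & 0xff;
        *minor = val & 0xff;
}

int main(void)
{
        unsigned int major, minor;

        split_old_dev(0x0803, &major, &minor);         /* example value only  */
        printf("major %u, minor %u\n", major, minor);  /* prints major 8, minor 3 */
        return 0;
}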
@@ -335,6 +395,33 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
335 error = hibernation_platform_enter(); 395 error = hibernation_platform_enter();
336 break; 396 break;
337 397
398 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
399 snapshot_deprecated_ioctl(cmd);
400 error = -EINVAL;
401
402 switch (arg) {
403
404 case PMOPS_PREPARE:
405 data->platform_support = 1;
406 error = 0;
407 break;
408
409 case PMOPS_ENTER:
410 if (data->platform_support)
411 error = hibernation_platform_enter();
412 break;
413
414 case PMOPS_FINISH:
415 if (data->platform_support)
416 error = 0;
417 break;
418
419 default:
420 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
421
422 }
423 break;
424
338 case SNAPSHOT_SET_SWAP_AREA: 425 case SNAPSHOT_SET_SWAP_AREA:
339 if (swsusp_swap_in_use()) { 426 if (swsusp_swap_in_use()) {
340 error = -EPERM; 427 error = -EPERM;
@@ -376,66 +463,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
376 return error; 463 return error;
377} 464}
378 465
379#ifdef CONFIG_COMPAT
380
381struct compat_resume_swap_area {
382 compat_loff_t offset;
383 u32 dev;
384} __packed;
385
386static long
387snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
388{
389 BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));
390
391 switch (cmd) {
392 case SNAPSHOT_GET_IMAGE_SIZE:
393 case SNAPSHOT_AVAIL_SWAP_SIZE:
394 case SNAPSHOT_ALLOC_SWAP_PAGE: {
395 compat_loff_t __user *uoffset = compat_ptr(arg);
396 loff_t offset;
397 mm_segment_t old_fs;
398 int err;
399
400 old_fs = get_fs();
401 set_fs(KERNEL_DS);
402 err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
403 set_fs(old_fs);
404 if (!err && put_user(offset, uoffset))
405 err = -EFAULT;
406 return err;
407 }
408
409 case SNAPSHOT_CREATE_IMAGE:
410 return snapshot_ioctl(file, cmd,
411 (unsigned long) compat_ptr(arg));
412
413 case SNAPSHOT_SET_SWAP_AREA: {
414 struct compat_resume_swap_area __user *u_swap_area =
415 compat_ptr(arg);
416 struct resume_swap_area swap_area;
417 mm_segment_t old_fs;
418 int err;
419
420 err = get_user(swap_area.offset, &u_swap_area->offset);
421 err |= get_user(swap_area.dev, &u_swap_area->dev);
422 if (err)
423 return -EFAULT;
424 old_fs = get_fs();
425 set_fs(KERNEL_DS);
426 err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
427 (unsigned long) &swap_area);
428 set_fs(old_fs);
429 return err;
430 }
431
432 default:
433 return snapshot_ioctl(file, cmd, arg);
434 }
435}
436
437#endif /* CONFIG_COMPAT */
438
439static const struct file_operations snapshot_fops = { 466static const struct file_operations snapshot_fops = {
440 .open = snapshot_open, 467 .open = snapshot_open,
441 .release = snapshot_release, 468 .release = snapshot_release,
@@ -443,9 +470,6 @@ static const struct file_operations snapshot_fops = {
443 .write = snapshot_write, 470 .write = snapshot_write,
444 .llseek = no_llseek, 471 .llseek = no_llseek,
445 .unlocked_ioctl = snapshot_ioctl, 472 .unlocked_ioctl = snapshot_ioctl,
446#ifdef CONFIG_COMPAT
447 .compat_ioctl = snapshot_compat_ioctl,
448#endif
449}; 473};
450 474
451static struct miscdevice snapshot_device = { 475static struct miscdevice snapshot_device = {
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 8f50de394d2..81e1b7c65ca 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -1,266 +1,634 @@
1/* 1/* kernel/power/wakelock.c
2 * kernel/power/wakelock.c
3 * 2 *
4 * User space wakeup sources support. 3 * Copyright (C) 2005-2008 Google, Inc.
5 * 4 *
6 * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> 5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
7 * 13 *
8 * This code is based on the analogous interface allowing user space to
9 * manipulate wakelocks on Android.
10 */ 14 */
11 15
12#include <linux/capability.h> 16#include <linux/module.h>
13#include <linux/ctype.h> 17#include <linux/platform_device.h>
14#include <linux/device.h> 18#include <linux/rtc.h>
15#include <linux/err.h> 19#include <linux/suspend.h>
16#include <linux/hrtimer.h> 20#include <linux/syscalls.h> /* sys_sync */
17#include <linux/list.h> 21#include <linux/wakelock.h>
18#include <linux/rbtree.h> 22#ifdef CONFIG_WAKELOCK_STAT
19#include <linux/slab.h> 23#include <linux/proc_fs.h>
20
21static DEFINE_MUTEX(wakelocks_lock);
22
23struct wakelock {
24 char *name;
25 struct rb_node node;
26 struct wakeup_source ws;
27#ifdef CONFIG_PM_WAKELOCKS_GC
28 struct list_head lru;
29#endif 24#endif
25#include "power.h"
26
27enum {
28 DEBUG_EXIT_SUSPEND = 1U << 0,
29 DEBUG_WAKEUP = 1U << 1,
30 DEBUG_SUSPEND = 1U << 2,
31 DEBUG_EXPIRE = 1U << 3,
32 DEBUG_WAKE_LOCK = 1U << 4,
30}; 33};
34static int debug_mask = DEBUG_EXIT_SUSPEND | DEBUG_WAKEUP;
35module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
36
37#define WAKE_LOCK_TYPE_MASK (0x0f)
38#define WAKE_LOCK_INITIALIZED (1U << 8)
39#define WAKE_LOCK_ACTIVE (1U << 9)
40#define WAKE_LOCK_AUTO_EXPIRE (1U << 10)
41#define WAKE_LOCK_PREVENTING_SUSPEND (1U << 11)
42
43static DEFINE_SPINLOCK(list_lock);
44static LIST_HEAD(inactive_locks);
45static struct list_head active_wake_locks[WAKE_LOCK_TYPE_COUNT];
46static int current_event_num;
47struct workqueue_struct *suspend_work_queue;
48struct wake_lock main_wake_lock;
49suspend_state_t requested_suspend_state = PM_SUSPEND_MEM;
50static struct wake_lock unknown_wakeup;
51static struct wake_lock suspend_backoff_lock;
52
53#define SUSPEND_BACKOFF_THRESHOLD 10
54#define SUSPEND_BACKOFF_INTERVAL 10000
55
56static unsigned suspend_short_count;
57
58#ifdef CONFIG_WAKELOCK_STAT
59static struct wake_lock deleted_wake_locks;
60static ktime_t last_sleep_time_update;
61static int wait_for_wakeup;
62
63int get_expired_time(struct wake_lock *lock, ktime_t *expire_time)
64{
65 struct timespec ts;
66 struct timespec kt;
67 struct timespec tomono;
68 struct timespec delta;
69 struct timespec sleep;
70 long timeout;
71
72 if (!(lock->flags & WAKE_LOCK_AUTO_EXPIRE))
73 return 0;
74 get_xtime_and_monotonic_and_sleep_offset(&kt, &tomono, &sleep);
75 timeout = lock->expires - jiffies;
76 if (timeout > 0)
77 return 0;
78 jiffies_to_timespec(-timeout, &delta);
79 set_normalized_timespec(&ts, kt.tv_sec + tomono.tv_sec - delta.tv_sec,
80 kt.tv_nsec + tomono.tv_nsec - delta.tv_nsec);
81 *expire_time = timespec_to_ktime(ts);
82 return 1;
83}
31 84
32static struct rb_root wakelocks_tree = RB_ROOT;
33 85
34ssize_t pm_show_wakelocks(char *buf, bool show_active) 86static int print_lock_stat(struct seq_file *m, struct wake_lock *lock)
35{ 87{
36 struct rb_node *node; 88 int lock_count = lock->stat.count;
37 struct wakelock *wl; 89 int expire_count = lock->stat.expire_count;
38 char *str = buf; 90 ktime_t active_time = ktime_set(0, 0);
39 char *end = buf + PAGE_SIZE; 91 ktime_t total_time = lock->stat.total_time;
92 ktime_t max_time = lock->stat.max_time;
93
94 ktime_t prevent_suspend_time = lock->stat.prevent_suspend_time;
95 if (lock->flags & WAKE_LOCK_ACTIVE) {
96 ktime_t now, add_time;
97 int expired = get_expired_time(lock, &now);
98 if (!expired)
99 now = ktime_get();
100 add_time = ktime_sub(now, lock->stat.last_time);
101 lock_count++;
102 if (!expired)
103 active_time = add_time;
104 else
105 expire_count++;
106 total_time = ktime_add(total_time, add_time);
107 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND)
108 prevent_suspend_time = ktime_add(prevent_suspend_time,
109 ktime_sub(now, last_sleep_time_update));
110 if (add_time.tv64 > max_time.tv64)
111 max_time = add_time;
112 }
40 113
41 mutex_lock(&wakelocks_lock); 114 return seq_printf(m,
115 "\"%s\"\t%d\t%d\t%d\t%lld\t%lld\t%lld\t%lld\t%lld\n",
116 lock->name, lock_count, expire_count,
117 lock->stat.wakeup_count, ktime_to_ns(active_time),
118 ktime_to_ns(total_time),
119 ktime_to_ns(prevent_suspend_time), ktime_to_ns(max_time),
120 ktime_to_ns(lock->stat.last_time));
121}
42 122
43 for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { 123static int wakelock_stats_show(struct seq_file *m, void *unused)
44 wl = rb_entry(node, struct wakelock, node); 124{
45 if (wl->ws.active == show_active) 125 unsigned long irqflags;
46 str += scnprintf(str, end - str, "%s ", wl->name); 126 struct wake_lock *lock;
127 int ret;
128 int type;
129
130 spin_lock_irqsave(&list_lock, irqflags);
131
132 ret = seq_puts(m, "name\tcount\texpire_count\twake_count\tactive_since"
133 "\ttotal_time\tsleep_time\tmax_time\tlast_change\n");
134 list_for_each_entry(lock, &inactive_locks, link)
135 ret = print_lock_stat(m, lock);
136 for (type = 0; type < WAKE_LOCK_TYPE_COUNT; type++) {
137 list_for_each_entry(lock, &active_wake_locks[type], link)
138 ret = print_lock_stat(m, lock);
47 } 139 }
48 if (str > buf) 140 spin_unlock_irqrestore(&list_lock, irqflags);
49 str--; 141 return 0;
142}
50 143
51 str += scnprintf(str, end - str, "\n"); 144static void wake_unlock_stat_locked(struct wake_lock *lock, int expired)
145{
146 ktime_t duration;
147 ktime_t now;
148 if (!(lock->flags & WAKE_LOCK_ACTIVE))
149 return;
150 if (get_expired_time(lock, &now))
151 expired = 1;
152 else
153 now = ktime_get();
154 lock->stat.count++;
155 if (expired)
156 lock->stat.expire_count++;
157 duration = ktime_sub(now, lock->stat.last_time);
158 lock->stat.total_time = ktime_add(lock->stat.total_time, duration);
159 if (ktime_to_ns(duration) > ktime_to_ns(lock->stat.max_time))
160 lock->stat.max_time = duration;
161 lock->stat.last_time = ktime_get();
162 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) {
163 duration = ktime_sub(now, last_sleep_time_update);
164 lock->stat.prevent_suspend_time = ktime_add(
165 lock->stat.prevent_suspend_time, duration);
166 lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND;
167 }
168}
52 169
53 mutex_unlock(&wakelocks_lock); 170static void update_sleep_wait_stats_locked(int done)
54 return (str - buf); 171{
172 struct wake_lock *lock;
173 ktime_t now, etime, elapsed, add;
174 int expired;
175
176 now = ktime_get();
177 elapsed = ktime_sub(now, last_sleep_time_update);
178 list_for_each_entry(lock, &active_wake_locks[WAKE_LOCK_SUSPEND], link) {
179 expired = get_expired_time(lock, &etime);
180 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) {
181 if (expired)
182 add = ktime_sub(etime, last_sleep_time_update);
183 else
184 add = elapsed;
185 lock->stat.prevent_suspend_time = ktime_add(
186 lock->stat.prevent_suspend_time, add);
187 }
188 if (done || expired)
189 lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND;
190 else
191 lock->flags |= WAKE_LOCK_PREVENTING_SUSPEND;
192 }
193 last_sleep_time_update = now;
55} 194}
195#endif
56 196
57#if CONFIG_PM_WAKELOCKS_LIMIT > 0
58static unsigned int number_of_wakelocks;
59 197
60static inline bool wakelocks_limit_exceeded(void) 198static void expire_wake_lock(struct wake_lock *lock)
61{ 199{
62 return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; 200#ifdef CONFIG_WAKELOCK_STAT
201 wake_unlock_stat_locked(lock, 1);
202#endif
203 lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE);
204 list_del(&lock->link);
205 list_add(&lock->link, &inactive_locks);
206 if (debug_mask & (DEBUG_WAKE_LOCK | DEBUG_EXPIRE))
207 pr_info("expired wake lock %s\n", lock->name);
63} 208}
64 209
65static inline void increment_wakelocks_number(void) 210/* Caller must acquire the list_lock spinlock */
211static void print_active_locks(int type)
66{ 212{
67 number_of_wakelocks++; 213 struct wake_lock *lock;
214 bool print_expired = true;
215
216 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
217 list_for_each_entry(lock, &active_wake_locks[type], link) {
218 if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) {
219 long timeout = lock->expires - jiffies;
220 if (timeout > 0)
221 pr_info("active wake lock %s, time left %ld\n",
222 lock->name, timeout);
223 else if (print_expired)
224 pr_info("wake lock %s, expired\n", lock->name);
225 } else {
226 pr_info("active wake lock %s\n", lock->name);
227 if (!(debug_mask & DEBUG_EXPIRE))
228 print_expired = false;
229 }
230 }
68} 231}
69 232
70static inline void decrement_wakelocks_number(void) 233static long has_wake_lock_locked(int type)
71{ 234{
72 number_of_wakelocks--; 235 struct wake_lock *lock, *n;
236 long max_timeout = 0;
237
238 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
239 list_for_each_entry_safe(lock, n, &active_wake_locks[type], link) {
240 if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) {
241 long timeout = lock->expires - jiffies;
242 if (timeout <= 0)
243 expire_wake_lock(lock);
244 else if (timeout > max_timeout)
245 max_timeout = timeout;
246 } else
247 return -1;
248 }
249 return max_timeout;
73} 250}
74#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */
75static inline bool wakelocks_limit_exceeded(void) { return false; }
76static inline void increment_wakelocks_number(void) {}
77static inline void decrement_wakelocks_number(void) {}
78#endif /* CONFIG_PM_WAKELOCKS_LIMIT */
79 251
80#ifdef CONFIG_PM_WAKELOCKS_GC 252long has_wake_lock(int type)
81#define WL_GC_COUNT_MAX 100
82#define WL_GC_TIME_SEC 300
83
84static LIST_HEAD(wakelocks_lru_list);
85static unsigned int wakelocks_gc_count;
86
87static inline void wakelocks_lru_add(struct wakelock *wl)
88{ 253{
89 list_add(&wl->lru, &wakelocks_lru_list); 254 long ret;
255 unsigned long irqflags;
256 spin_lock_irqsave(&list_lock, irqflags);
257 ret = has_wake_lock_locked(type);
258 if (ret && (debug_mask & DEBUG_WAKEUP) && type == WAKE_LOCK_SUSPEND)
259 print_active_locks(type);
260 spin_unlock_irqrestore(&list_lock, irqflags);
261 return ret;
90} 262}
91 263
92static inline void wakelocks_lru_most_recent(struct wakelock *wl) 264static void suspend_backoff(void)
93{ 265{
94 list_move(&wl->lru, &wakelocks_lru_list); 266 pr_info("suspend: too many immediate wakeups, back off\n");
267 wake_lock_timeout(&suspend_backoff_lock,
268 msecs_to_jiffies(SUSPEND_BACKOFF_INTERVAL));
95} 269}
96 270
97static void wakelocks_gc(void) 271static void suspend(struct work_struct *work)
98{ 272{
99 struct wakelock *wl, *aux; 273 int ret;
100 ktime_t now; 274 int entry_event_num;
275 struct timespec ts_entry, ts_exit;
101 276
102 if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) 277 if (has_wake_lock(WAKE_LOCK_SUSPEND)) {
278 if (debug_mask & DEBUG_SUSPEND)
279 pr_info("suspend: abort suspend\n");
103 return; 280 return;
281 }
104 282
105 now = ktime_get(); 283 entry_event_num = current_event_num;
106 list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { 284 sys_sync();
107 u64 idle_time_ns; 285 if (debug_mask & DEBUG_SUSPEND)
108 bool active; 286 pr_info("suspend: enter suspend\n");
109 287 getnstimeofday(&ts_entry);
110 spin_lock_irq(&wl->ws.lock); 288 ret = pm_suspend(requested_suspend_state);
111 idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); 289 getnstimeofday(&ts_exit);
112 active = wl->ws.active; 290
113 spin_unlock_irq(&wl->ws.lock); 291 if (debug_mask & DEBUG_EXIT_SUSPEND) {
114 292 struct rtc_time tm;
115 if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) 293 rtc_time_to_tm(ts_exit.tv_sec, &tm);
116 break; 294 pr_info("suspend: exit suspend, ret = %d "
117 295 "(%d-%02d-%02d %02d:%02d:%02d.%09lu UTC)\n", ret,
118 if (!active) { 296 tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
119 wakeup_source_remove(&wl->ws); 297 tm.tm_hour, tm.tm_min, tm.tm_sec, ts_exit.tv_nsec);
120 rb_erase(&wl->node, &wakelocks_tree);
121 list_del(&wl->lru);
122 kfree(wl->name);
123 kfree(wl);
124 decrement_wakelocks_number();
125 }
126 } 298 }
127 wakelocks_gc_count = 0; 299
128} 300 if (ts_exit.tv_sec - ts_entry.tv_sec <= 1) {
129#else /* !CONFIG_PM_WAKELOCKS_GC */ 301 ++suspend_short_count;
130static inline void wakelocks_lru_add(struct wakelock *wl) {} 302
131static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} 303 if (suspend_short_count == SUSPEND_BACKOFF_THRESHOLD) {
132static inline void wakelocks_gc(void) {} 304 suspend_backoff();
133#endif /* !CONFIG_PM_WAKELOCKS_GC */ 305 suspend_short_count = 0;
134
135static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
136 bool add_if_not_found)
137{
138 struct rb_node **node = &wakelocks_tree.rb_node;
139 struct rb_node *parent = *node;
140 struct wakelock *wl;
141
142 while (*node) {
143 int diff;
144
145 parent = *node;
146 wl = rb_entry(*node, struct wakelock, node);
147 diff = strncmp(name, wl->name, len);
148 if (diff == 0) {
149 if (wl->name[len])
150 diff = -1;
151 else
152 return wl;
153 } 306 }
154 if (diff < 0) 307 } else {
155 node = &(*node)->rb_left; 308 suspend_short_count = 0;
156 else
157 node = &(*node)->rb_right;
158 } 309 }
159 if (!add_if_not_found)
160 return ERR_PTR(-EINVAL);
161 310
162 if (wakelocks_limit_exceeded()) 311 if (current_event_num == entry_event_num) {
163 return ERR_PTR(-ENOSPC); 312 if (debug_mask & DEBUG_SUSPEND)
313 pr_info("suspend: pm_suspend returned with no event\n");
314 wake_lock_timeout(&unknown_wakeup, HZ / 2);
315 }
316}
317static DECLARE_WORK(suspend_work, suspend);
164 318
165 /* Not found, we have to add a new one. */ 319static void expire_wake_locks(unsigned long data)
166 wl = kzalloc(sizeof(*wl), GFP_KERNEL); 320{
167 if (!wl) 321 long has_lock;
168 return ERR_PTR(-ENOMEM); 322 unsigned long irqflags;
323 if (debug_mask & DEBUG_EXPIRE)
324 pr_info("expire_wake_locks: start\n");
325 spin_lock_irqsave(&list_lock, irqflags);
326 if (debug_mask & DEBUG_SUSPEND)
327 print_active_locks(WAKE_LOCK_SUSPEND);
328 has_lock = has_wake_lock_locked(WAKE_LOCK_SUSPEND);
329 if (debug_mask & DEBUG_EXPIRE)
330 pr_info("expire_wake_locks: done, has_lock %ld\n", has_lock);
331 if (has_lock == 0)
332 queue_work(suspend_work_queue, &suspend_work);
333 spin_unlock_irqrestore(&list_lock, irqflags);
334}
335static DEFINE_TIMER(expire_timer, expire_wake_locks, 0, 0);
169 336
170 wl->name = kstrndup(name, len, GFP_KERNEL); 337static int power_suspend_late(struct device *dev)
171 if (!wl->name) { 338{
172 kfree(wl); 339 int ret = has_wake_lock(WAKE_LOCK_SUSPEND) ? -EAGAIN : 0;
173 return ERR_PTR(-ENOMEM); 340#ifdef CONFIG_WAKELOCK_STAT
174 } 341 wait_for_wakeup = !ret;
175 wl->ws.name = wl->name; 342#endif
176 wakeup_source_add(&wl->ws); 343 if (debug_mask & DEBUG_SUSPEND)
177 rb_link_node(&wl->node, parent, node); 344 pr_info("power_suspend_late return %d\n", ret);
178 rb_insert_color(&wl->node, &wakelocks_tree); 345 return ret;
179 wakelocks_lru_add(wl);
180 increment_wakelocks_number();
181 return wl;
182} 346}
183 347
184int pm_wake_lock(const char *buf) 348static struct dev_pm_ops power_driver_pm_ops = {
349 .suspend_noirq = power_suspend_late,
350};
351
352static struct platform_driver power_driver = {
353 .driver.name = "power",
354 .driver.pm = &power_driver_pm_ops,
355};
356static struct platform_device power_device = {
357 .name = "power",
358};
359
360void wake_lock_init(struct wake_lock *lock, int type, const char *name)
185{ 361{
186 const char *str = buf; 362 unsigned long irqflags = 0;
187 struct wakelock *wl; 363
188 u64 timeout_ns = 0; 364 if (name)
189 size_t len; 365 lock->name = name;
190 int ret = 0; 366 BUG_ON(!lock->name);
191 367
192 if (!capable(CAP_BLOCK_SUSPEND)) 368 if (debug_mask & DEBUG_WAKE_LOCK)
193 return -EPERM; 369 pr_info("wake_lock_init name=%s\n", lock->name);
194 370#ifdef CONFIG_WAKELOCK_STAT
195 while (*str && !isspace(*str)) 371 lock->stat.count = 0;
196 str++; 372 lock->stat.expire_count = 0;
197 373 lock->stat.wakeup_count = 0;
198 len = str - buf; 374 lock->stat.total_time = ktime_set(0, 0);
199 if (!len) 375 lock->stat.prevent_suspend_time = ktime_set(0, 0);
200 return -EINVAL; 376 lock->stat.max_time = ktime_set(0, 0);
201 377 lock->stat.last_time = ktime_set(0, 0);
202 if (*str && *str != '\n') { 378#endif
203 /* Find out if there's a valid timeout string appended. */ 379 lock->flags = (type & WAKE_LOCK_TYPE_MASK) | WAKE_LOCK_INITIALIZED;
204 ret = kstrtou64(skip_spaces(str), 10, &timeout_ns);
205 if (ret)
206 return -EINVAL;
207 }
208 380
209 mutex_lock(&wakelocks_lock); 381 INIT_LIST_HEAD(&lock->link);
382 spin_lock_irqsave(&list_lock, irqflags);
383 list_add(&lock->link, &inactive_locks);
384 spin_unlock_irqrestore(&list_lock, irqflags);
385}
386EXPORT_SYMBOL(wake_lock_init);
210 387
211 wl = wakelock_lookup_add(buf, len, true); 388void wake_lock_destroy(struct wake_lock *lock)
212 if (IS_ERR(wl)) { 389{
213 ret = PTR_ERR(wl); 390 unsigned long irqflags;
214 goto out; 391 if (debug_mask & DEBUG_WAKE_LOCK)
392 pr_info("wake_lock_destroy name=%s\n", lock->name);
393 spin_lock_irqsave(&list_lock, irqflags);
394 lock->flags &= ~WAKE_LOCK_INITIALIZED;
395#ifdef CONFIG_WAKELOCK_STAT
396 if (lock->stat.count) {
397 deleted_wake_locks.stat.count += lock->stat.count;
398 deleted_wake_locks.stat.expire_count += lock->stat.expire_count;
399 deleted_wake_locks.stat.total_time =
400 ktime_add(deleted_wake_locks.stat.total_time,
401 lock->stat.total_time);
402 deleted_wake_locks.stat.prevent_suspend_time =
403 ktime_add(deleted_wake_locks.stat.prevent_suspend_time,
404 lock->stat.prevent_suspend_time);
405 deleted_wake_locks.stat.max_time =
406 ktime_add(deleted_wake_locks.stat.max_time,
407 lock->stat.max_time);
215 } 408 }
216 if (timeout_ns) { 409#endif
217 u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; 410 list_del(&lock->link);
411 spin_unlock_irqrestore(&list_lock, irqflags);
412}
413EXPORT_SYMBOL(wake_lock_destroy);
218 414
219 do_div(timeout_ms, NSEC_PER_MSEC); 415static void wake_lock_internal(
220 __pm_wakeup_event(&wl->ws, timeout_ms); 416 struct wake_lock *lock, long timeout, int has_timeout)
417{
418 int type;
419 unsigned long irqflags;
420 long expire_in;
421
422 spin_lock_irqsave(&list_lock, irqflags);
423 type = lock->flags & WAKE_LOCK_TYPE_MASK;
424 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
425 BUG_ON(!(lock->flags & WAKE_LOCK_INITIALIZED));
426#ifdef CONFIG_WAKELOCK_STAT
427 if (type == WAKE_LOCK_SUSPEND && wait_for_wakeup) {
428 if (debug_mask & DEBUG_WAKEUP)
429 pr_info("wakeup wake lock: %s\n", lock->name);
430 wait_for_wakeup = 0;
431 lock->stat.wakeup_count++;
432 }
433 if ((lock->flags & WAKE_LOCK_AUTO_EXPIRE) &&
434 (long)(lock->expires - jiffies) <= 0) {
435 wake_unlock_stat_locked(lock, 0);
436 lock->stat.last_time = ktime_get();
437 }
438#endif
439 if (!(lock->flags & WAKE_LOCK_ACTIVE)) {
440 lock->flags |= WAKE_LOCK_ACTIVE;
441#ifdef CONFIG_WAKELOCK_STAT
442 lock->stat.last_time = ktime_get();
443#endif
444 }
445 list_del(&lock->link);
446 if (has_timeout) {
447 if (debug_mask & DEBUG_WAKE_LOCK)
448 pr_info("wake_lock: %s, type %d, timeout %ld.%03lu\n",
449 lock->name, type, timeout / HZ,
450 (timeout % HZ) * MSEC_PER_SEC / HZ);
451 lock->expires = jiffies + timeout;
452 lock->flags |= WAKE_LOCK_AUTO_EXPIRE;
453 list_add_tail(&lock->link, &active_wake_locks[type]);
221 } else { 454 } else {
222 __pm_stay_awake(&wl->ws); 455 if (debug_mask & DEBUG_WAKE_LOCK)
456 pr_info("wake_lock: %s, type %d\n", lock->name, type);
457 lock->expires = LONG_MAX;
458 lock->flags &= ~WAKE_LOCK_AUTO_EXPIRE;
459 list_add(&lock->link, &active_wake_locks[type]);
460 }
461 if (type == WAKE_LOCK_SUSPEND) {
462 current_event_num++;
463#ifdef CONFIG_WAKELOCK_STAT
464 if (lock == &main_wake_lock)
465 update_sleep_wait_stats_locked(1);
466 else if (!wake_lock_active(&main_wake_lock))
467 update_sleep_wait_stats_locked(0);
468#endif
469 if (has_timeout)
470 expire_in = has_wake_lock_locked(type);
471 else
472 expire_in = -1;
473 if (expire_in > 0) {
474 if (debug_mask & DEBUG_EXPIRE)
475 pr_info("wake_lock: %s, start expire timer, "
476 "%ld\n", lock->name, expire_in);
477 mod_timer(&expire_timer, jiffies + expire_in);
478 } else {
479 if (del_timer(&expire_timer))
480 if (debug_mask & DEBUG_EXPIRE)
481 pr_info("wake_lock: %s, stop expire timer\n",
482 lock->name);
483 if (expire_in == 0)
484 queue_work(suspend_work_queue, &suspend_work);
485 }
223 } 486 }
487 spin_unlock_irqrestore(&list_lock, irqflags);
488}
224 489
225 wakelocks_lru_most_recent(wl); 490void wake_lock(struct wake_lock *lock)
491{
492 wake_lock_internal(lock, 0, 0);
493}
494EXPORT_SYMBOL(wake_lock);
226 495
227 out: 496void wake_lock_timeout(struct wake_lock *lock, long timeout)
228 mutex_unlock(&wakelocks_lock); 497{
229 return ret; 498 wake_lock_internal(lock, timeout, 1);
230} 499}
500EXPORT_SYMBOL(wake_lock_timeout);
231 501
232int pm_wake_unlock(const char *buf) 502void wake_unlock(struct wake_lock *lock)
233{ 503{
234 struct wakelock *wl; 504 int type;
235 size_t len; 505 unsigned long irqflags;
236 int ret = 0; 506 spin_lock_irqsave(&list_lock, irqflags);
507 type = lock->flags & WAKE_LOCK_TYPE_MASK;
508#ifdef CONFIG_WAKELOCK_STAT
509 wake_unlock_stat_locked(lock, 0);
510#endif
511 if (debug_mask & DEBUG_WAKE_LOCK)
512 pr_info("wake_unlock: %s\n", lock->name);
513 lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE);
514 list_del(&lock->link);
515 list_add(&lock->link, &inactive_locks);
516 if (type == WAKE_LOCK_SUSPEND) {
517 long has_lock = has_wake_lock_locked(type);
518 if (has_lock > 0) {
519 if (debug_mask & DEBUG_EXPIRE)
520 pr_info("wake_unlock: %s, start expire timer, "
521 "%ld\n", lock->name, has_lock);
522 mod_timer(&expire_timer, jiffies + has_lock);
523 } else {
524 if (del_timer(&expire_timer))
525 if (debug_mask & DEBUG_EXPIRE)
526 pr_info("wake_unlock: %s, stop expire "
527 "timer\n", lock->name);
528 if (has_lock == 0)
529 queue_work(suspend_work_queue, &suspend_work);
530 }
531 if (lock == &main_wake_lock) {
532 if (debug_mask & DEBUG_SUSPEND)
533 print_active_locks(WAKE_LOCK_SUSPEND);
534#ifdef CONFIG_WAKELOCK_STAT
535 update_sleep_wait_stats_locked(0);
536#endif
537 }
538 }
539 spin_unlock_irqrestore(&list_lock, irqflags);
540}
541EXPORT_SYMBOL(wake_unlock);
237 542
238 if (!capable(CAP_BLOCK_SUSPEND)) 543int wake_lock_active(struct wake_lock *lock)
239 return -EPERM; 544{
545 return !!(lock->flags & WAKE_LOCK_ACTIVE);
546}
547EXPORT_SYMBOL(wake_lock_active);
548
549static int wakelock_stats_open(struct inode *inode, struct file *file)
550{
551 return single_open(file, wakelock_stats_show, NULL);
552}
240 553
241 len = strlen(buf); 554static const struct file_operations wakelock_stats_fops = {
242 if (!len) 555 .owner = THIS_MODULE,
243 return -EINVAL; 556 .open = wakelock_stats_open,
557 .read = seq_read,
558 .llseek = seq_lseek,
559 .release = single_release,
560};
244 561
245 if (buf[len-1] == '\n') 562static int __init wakelocks_init(void)
246 len--; 563{
564 int ret;
565 int i;
247 566
248 if (!len) 567 for (i = 0; i < ARRAY_SIZE(active_wake_locks); i++)
249 return -EINVAL; 568 INIT_LIST_HEAD(&active_wake_locks[i]);
250 569
251 mutex_lock(&wakelocks_lock); 570#ifdef CONFIG_WAKELOCK_STAT
571 wake_lock_init(&deleted_wake_locks, WAKE_LOCK_SUSPEND,
572 "deleted_wake_locks");
573#endif
574 wake_lock_init(&main_wake_lock, WAKE_LOCK_SUSPEND, "main");
575 wake_lock(&main_wake_lock);
576 wake_lock_init(&unknown_wakeup, WAKE_LOCK_SUSPEND, "unknown_wakeups");
577 wake_lock_init(&suspend_backoff_lock, WAKE_LOCK_SUSPEND,
578 "suspend_backoff");
579
580 ret = platform_device_register(&power_device);
581 if (ret) {
582 pr_err("wakelocks_init: platform_device_register failed\n");
583 goto err_platform_device_register;
584 }
585 ret = platform_driver_register(&power_driver);
586 if (ret) {
587 pr_err("wakelocks_init: platform_driver_register failed\n");
588 goto err_platform_driver_register;
589 }
252 590
253 wl = wakelock_lookup_add(buf, len, false); 591 suspend_work_queue = create_singlethread_workqueue("suspend");
254 if (IS_ERR(wl)) { 592 if (suspend_work_queue == NULL) {
255 ret = PTR_ERR(wl); 593 ret = -ENOMEM;
256 goto out; 594 goto err_suspend_work_queue;
257 } 595 }
258 __pm_relax(&wl->ws);
259 596
260 wakelocks_lru_most_recent(wl); 597#ifdef CONFIG_WAKELOCK_STAT
261 wakelocks_gc(); 598 proc_create("wakelocks", S_IRUGO, NULL, &wakelock_stats_fops);
599#endif
262 600
263 out: 601 return 0;
264 mutex_unlock(&wakelocks_lock); 602
603err_suspend_work_queue:
604 platform_driver_unregister(&power_driver);
605err_platform_driver_register:
606 platform_device_unregister(&power_device);
607err_platform_device_register:
608 wake_lock_destroy(&suspend_backoff_lock);
609 wake_lock_destroy(&unknown_wakeup);
610 wake_lock_destroy(&main_wake_lock);
611#ifdef CONFIG_WAKELOCK_STAT
612 wake_lock_destroy(&deleted_wake_locks);
613#endif
265 return ret; 614 return ret;
266} 615}
616
617static void __exit wakelocks_exit(void)
618{
619#ifdef CONFIG_WAKELOCK_STAT
620 remove_proc_entry("wakelocks", NULL);
621#endif
622 destroy_workqueue(suspend_work_queue);
623 platform_driver_unregister(&power_driver);
624 platform_device_unregister(&power_device);
625 wake_lock_destroy(&suspend_backoff_lock);
626 wake_lock_destroy(&unknown_wakeup);
627 wake_lock_destroy(&main_wake_lock);
628#ifdef CONFIG_WAKELOCK_STAT
629 wake_lock_destroy(&deleted_wake_locks);
630#endif
631}
632
633core_initcall(wakelocks_init);
634module_exit(wakelocks_exit);
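For context, a minimal sketch of how a client might use the wakelock API restored above (wake_lock_init, wake_lock_timeout, wake_unlock, wake_lock_destroy). The driver, its trigger points, and the 2-second timeout are illustrative only and assume the matching <linux/wakelock.h> header from this series:

/* Hypothetical consumer: hold a suspend wakelock across an I/O burst. */
#include <linux/wakelock.h>

static struct wake_lock demo_wake_lock;

static int demo_probe(void)
{
	wake_lock_init(&demo_wake_lock, WAKE_LOCK_SUSPEND, "demo");
	return 0;
}

static void demo_irq_work(void)
{
	/* Keep the system awake for at most 2 s; expire_timer drops it. */
	wake_lock_timeout(&demo_wake_lock, 2 * HZ);
}

static void demo_work_done(void)
{
	/* Release early once the deferred work has finished. */
	wake_unlock(&demo_wake_lock);
}

static void demo_remove(void)
{
	wake_lock_destroy(&demo_wake_lock);
}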
diff --git a/kernel/printk.c b/kernel/printk.c
index 357f714ddd4..1baace7d867 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -41,13 +41,9 @@
41#include <linux/cpu.h> 41#include <linux/cpu.h>
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h>
45 44
46#include <asm/uaccess.h> 45#include <asm/uaccess.h>
47 46
48#define CREATE_TRACE_POINTS
49#include <trace/events/printk.h>
50
51/* 47/*
52 * Architectures can override it: 48 * Architectures can override it:
53 */ 49 */
@@ -55,6 +51,12 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
55{ 51{
56} 52}
57 53
54#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
55
56#ifdef CONFIG_DEBUG_LL
57extern void printascii(char *);
58#endif
59
58/* printk's without a loglevel use this.. */ 60/* printk's without a loglevel use this.. */
59#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 61#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
60 62
@@ -87,12 +89,6 @@ static DEFINE_SEMAPHORE(console_sem);
87struct console *console_drivers; 89struct console *console_drivers;
88EXPORT_SYMBOL_GPL(console_drivers); 90EXPORT_SYMBOL_GPL(console_drivers);
89 91
90#ifdef CONFIG_LOCKDEP
91static struct lockdep_map console_lock_dep_map = {
92 .name = "console_lock"
93};
94#endif
95
96/* 92/*
97 * This is used for debugging the mess that is the VT code by 93 * This is used for debugging the mess that is the VT code by
98 * keeping track if we have the console semaphore held. It's 94 * keeping track if we have the console semaphore held. It's
@@ -104,6 +100,24 @@ static struct lockdep_map console_lock_dep_map = {
104static int console_locked, console_suspended; 100static int console_locked, console_suspended;
105 101
106/* 102/*
103 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
104 * It is also used in interesting ways to provide interlocking in
105 * console_unlock();.
106 */
107static DEFINE_SPINLOCK(logbuf_lock);
108
109#define LOG_BUF_MASK (log_buf_len-1)
110#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
111
112/*
113 * The indices into log_buf are not constrained to log_buf_len - they
114 * must be masked before subscripting
115 */
116static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
117static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
118static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
119
120/*
107 * If exclusive_console is non-NULL then only this console is to be printed to. 121 * If exclusive_console is non-NULL then only this console is to be printed to.
108 */ 122 */
109static struct console *exclusive_console; 123static struct console *exclusive_console;
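The log_start/con_start/log_end scheme introduced above keeps the indices growing without bound and masks them only when subscripting, exactly as the comment says. A small userspace sketch of the same convention; the buffer size, macro names and message are illustrative:

#include <stdio.h>

#define BUF_LEN  16                      /* must be a power of two */
#define BUF_MASK (BUF_LEN - 1)
#define BUF(idx) (buf[(idx) & BUF_MASK]) /* mask only on access */

static char buf[BUF_LEN];

int main(void)
{
	unsigned int start = 0, end = 0;     /* unmasked, monotonically increasing */
	const char *msg = "hello, ring buffer wrap-around";
	const char *p;

	for (p = msg; *p; p++) {
		BUF(end++) = *p;
		if (end - start > BUF_LEN)   /* reader fell behind: drop oldest */
			start = end - BUF_LEN;
	}
	while (start != end)                 /* drain: only the newest BUF_LEN chars remain */
		putchar(BUF(start++));
	putchar('\n');
	return 0;
}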
@@ -132,537 +146,13 @@ EXPORT_SYMBOL(console_set_on_cmdline);
132/* Flag: console code may call schedule() */ 146/* Flag: console code may call schedule() */
133static int console_may_schedule; 147static int console_may_schedule;
134 148
135/*
136 * The printk log buffer consists of a chain of concatenated variable
137 * length records. Every record starts with a record header, containing
138 * the overall length of the record.
139 *
140 * The heads to the first and last entry in the buffer, as well as the
 141 * sequence numbers of both entries are maintained when messages
 142 * are stored.
143 *
144 * If the heads indicate available messages, the length in the header
 145 * tells the start of the next message. A length == 0 for the next message
146 * indicates a wrap-around to the beginning of the buffer.
147 *
148 * Every record carries the monotonic timestamp in microseconds, as well as
149 * the standard userspace syslog level and syslog facility. The usual
150 * kernel messages use LOG_KERN; userspace-injected messages always carry
151 * a matching syslog facility, by default LOG_USER. The origin of every
152 * message can be reliably determined that way.
153 *
154 * The human readable log message directly follows the message header. The
155 * length of the message text is stored in the header, the stored message
156 * is not terminated.
157 *
158 * Optionally, a message can carry a dictionary of properties (key/value pairs),
159 * to provide userspace with a machine-readable message context.
160 *
161 * Examples for well-defined, commonly used property names are:
162 * DEVICE=b12:8 device identifier
163 * b12:8 block dev_t
164 * c127:3 char dev_t
165 * n8 netdev ifindex
166 * +sound:card0 subsystem:devname
167 * SUBSYSTEM=pci driver-core subsystem name
168 *
169 * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
170 * follows directly after a '=' character. Every property is terminated by
171 * a '\0' character. The last property is not terminated.
172 *
173 * Example of a message structure:
174 * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
175 * 0008 34 00 record is 52 bytes long
176 * 000a 0b 00 text is 11 bytes long
177 * 000c 1f 00 dictionary is 23 bytes long
178 * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
179 * 0010 69 74 27 73 20 61 20 6c "it's a l"
180 * 69 6e 65 "ine"
181 * 001b 44 45 56 49 43 "DEVIC"
182 * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
183 * 52 49 56 45 52 3d 62 75 "RIVER=bu"
184 * 67 "g"
185 * 0032 00 00 00 padding to next message header
186 *
187 * The 'struct log' buffer header must never be directly exported to
188 * userspace, it is a kernel-private implementation detail that might
189 * need to be changed in the future, when the requirements change.
190 *
191 * /dev/kmsg exports the structured data in the following line format:
192 * "level,sequnum,timestamp;<message text>\n"
193 *
194 * The optional key/value pairs are attached as continuation lines starting
195 * with a space character and terminated by a newline. All possible
 196 * non-printable characters are escaped in the "\xff" notation.
197 *
198 * Users of the export format should ignore possible additional values
199 * separated by ',', and find the message after the ';' character.
200 */
201
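As a rough illustration of the export format documented above, a userspace sketch that splits one /dev/kmsg line into its numeric prefix and message text. It reads only the first three comma-separated values and ignores any further ones before the ';', as the comment recommends; the sample record is made up:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Invented sample: prefix 6 (facility 0, level 6), seq 1234,
	 * timestamp in microseconds, '-' continuation flag, then the text. */
	char line[] = "6,1234,5150280123,-;it's a line\n";
	char *semi = strchr(line, ';');
	unsigned int prefix;
	unsigned long long seq, ts_usec;

	if (!semi)
		return 1;
	*semi = '\0';
	if (sscanf(line, "%u,%llu,%llu", &prefix, &seq, &ts_usec) != 3)
		return 1;
	printf("facility=%u level=%u seq=%llu ts=%llu.%06llus msg=%s",
	       prefix >> 3, prefix & 7, seq,
	       ts_usec / 1000000, ts_usec % 1000000, semi + 1);
	return 0;
}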
202enum log_flags {
203 LOG_NOCONS = 1, /* already flushed, do not print to console */
204 LOG_NEWLINE = 2, /* text ended with a newline */
205 LOG_PREFIX = 4, /* text started with a prefix */
206 LOG_CONT = 8, /* text is a fragment of a continuation line */
207};
208
209struct log {
210 u64 ts_nsec; /* timestamp in nanoseconds */
211 u16 len; /* length of entire record */
212 u16 text_len; /* length of text buffer */
213 u16 dict_len; /* length of dictionary buffer */
214 u8 facility; /* syslog facility */
215 u8 flags:5; /* internal record flags */
216 u8 level:3; /* syslog level */
217};
218
219/*
220 * The logbuf_lock protects kmsg buffer, indices, counters. It is also
221 * used in interesting ways to provide interlocking in console_unlock();
222 */
223static DEFINE_RAW_SPINLOCK(logbuf_lock);
224
225#ifdef CONFIG_PRINTK 149#ifdef CONFIG_PRINTK
226/* the next printk record to read by syslog(READ) or /proc/kmsg */
227static u64 syslog_seq;
228static u32 syslog_idx;
229static enum log_flags syslog_prev;
230static size_t syslog_partial;
231
232/* index and sequence number of the first record stored in the buffer */
233static u64 log_first_seq;
234static u32 log_first_idx;
235
236/* index and sequence number of the next record to store in the buffer */
237static u64 log_next_seq;
238static u32 log_next_idx;
239
240/* the next printk record to write to the console */
241static u64 console_seq;
242static u32 console_idx;
243static enum log_flags console_prev;
244
245/* the next printk record to read after the last 'clear' command */
246static u64 clear_seq;
247static u32 clear_idx;
248
249#define PREFIX_MAX 32
250#define LOG_LINE_MAX 1024 - PREFIX_MAX
251
252/* record buffer */
253#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
254#define LOG_ALIGN 4
255#else
256#define LOG_ALIGN __alignof__(struct log)
257#endif
258#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
259static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
260static char *log_buf = __log_buf;
261static u32 log_buf_len = __LOG_BUF_LEN;
262
263/* cpu currently holding logbuf_lock */
264static volatile unsigned int logbuf_cpu = UINT_MAX;
265
266/* human readable text of the record */
267static char *log_text(const struct log *msg)
268{
269 return (char *)msg + sizeof(struct log);
270}
271
272/* optional key/value pair dictionary attached to the record */
273static char *log_dict(const struct log *msg)
274{
275 return (char *)msg + sizeof(struct log) + msg->text_len;
276}
277
278/* get record by index; idx must point to valid msg */
279static struct log *log_from_idx(u32 idx)
280{
281 struct log *msg = (struct log *)(log_buf + idx);
282
283 /*
284 * A length == 0 record is the end of buffer marker. Wrap around and
285 * read the message at the start of the buffer.
286 */
287 if (!msg->len)
288 return (struct log *)log_buf;
289 return msg;
290}
291
292/* get next record; idx must point to valid msg */
293static u32 log_next(u32 idx)
294{
295 struct log *msg = (struct log *)(log_buf + idx);
296
297 /* length == 0 indicates the end of the buffer; wrap */
298 /*
299 * A length == 0 record is the end of buffer marker. Wrap around and
300 * read the message at the start of the buffer as *this* one, and
301 * return the one after that.
302 */
303 if (!msg->len) {
304 msg = (struct log *)log_buf;
305 return msg->len;
306 }
307 return idx + msg->len;
308}
309
310/* insert record into the buffer, discard old ones, update heads */
311static void log_store(int facility, int level,
312 enum log_flags flags, u64 ts_nsec,
313 const char *dict, u16 dict_len,
314 const char *text, u16 text_len)
315{
316 struct log *msg;
317 u32 size, pad_len;
318
319 /* number of '\0' padding bytes to next message */
320 size = sizeof(struct log) + text_len + dict_len;
321 pad_len = (-size) & (LOG_ALIGN - 1);
322 size += pad_len;
323
324 while (log_first_seq < log_next_seq) {
325 u32 free;
326
327 if (log_next_idx > log_first_idx)
328 free = max(log_buf_len - log_next_idx, log_first_idx);
329 else
330 free = log_first_idx - log_next_idx;
331
332 if (free > size + sizeof(struct log))
333 break;
334
 335 /* drop old messages until we have enough continuous space */
336 log_first_idx = log_next(log_first_idx);
337 log_first_seq++;
338 }
339
340 if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
341 /*
342 * This message + an additional empty header does not fit
343 * at the end of the buffer. Add an empty header with len == 0
344 * to signify a wrap around.
345 */
346 memset(log_buf + log_next_idx, 0, sizeof(struct log));
347 log_next_idx = 0;
348 }
349
350 /* fill message */
351 msg = (struct log *)(log_buf + log_next_idx);
352 memcpy(log_text(msg), text, text_len);
353 msg->text_len = text_len;
354 memcpy(log_dict(msg), dict, dict_len);
355 msg->dict_len = dict_len;
356 msg->facility = facility;
357 msg->level = level & 7;
358 msg->flags = flags & 0x1f;
359 if (ts_nsec > 0)
360 msg->ts_nsec = ts_nsec;
361 else
362 msg->ts_nsec = local_clock();
363 memset(log_dict(msg) + dict_len, 0, pad_len);
364 msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
365
366 /* insert message */
367 log_next_idx += msg->len;
368 log_next_seq++;
369}
370
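A worked sketch of the size and padding arithmetic used by log_store() above. The 16-byte header matches the struct log layout shown earlier under typical packing, but treat the concrete numbers as illustrative:

#include <stdio.h>

#define LOG_ALIGN   4   /* the cheap-unaligned-access case above */
#define HEADER_SIZE 16  /* assumed sizeof(struct log) for this layout */

static unsigned int record_size(unsigned int text_len, unsigned int dict_len)
{
	unsigned int size = HEADER_SIZE + text_len + dict_len;
	unsigned int pad = (-size) & (LOG_ALIGN - 1); /* bytes up to the next multiple of LOG_ALIGN */

	return size + pad;
}

int main(void)
{
	/* 11 bytes of text plus a 23-byte dictionary: 16 + 11 + 23 = 50,
	 * padded to 52, the record length quoted in the example above. */
	printf("%u\n", record_size(11, 23));
	return 0;
}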
371/* /dev/kmsg - userspace message inject/listen interface */
372struct devkmsg_user {
373 u64 seq;
374 u32 idx;
375 enum log_flags prev;
376 struct mutex lock;
377 char buf[8192];
378};
379
380static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
381 unsigned long count, loff_t pos)
382{
383 char *buf, *line;
384 int i;
385 int level = default_message_loglevel;
386 int facility = 1; /* LOG_USER */
387 size_t len = iov_length(iv, count);
388 ssize_t ret = len;
389
390 if (len > LOG_LINE_MAX)
391 return -EINVAL;
392 buf = kmalloc(len+1, GFP_KERNEL);
393 if (buf == NULL)
394 return -ENOMEM;
395
396 line = buf;
397 for (i = 0; i < count; i++) {
398 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
399 ret = -EFAULT;
400 goto out;
401 }
402 line += iv[i].iov_len;
403 }
404
405 /*
406 * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
 407 * the decimal value is a 32-bit number, the lower 3 bits are the log
 408 * level, the rest are the log facility.
409 *
410 * If no prefix or no userspace facility is specified, we
411 * enforce LOG_USER, to be able to reliably distinguish
412 * kernel-generated messages from userspace-injected ones.
413 */
414 line = buf;
415 if (line[0] == '<') {
416 char *endp = NULL;
417
418 i = simple_strtoul(line+1, &endp, 10);
419 if (endp && endp[0] == '>') {
420 level = i & 7;
421 if (i >> 3)
422 facility = i >> 3;
423 endp++;
424 len -= endp - line;
425 line = endp;
426 }
427 }
428 line[len] = '\0';
429
430 printk_emit(facility, level, NULL, 0, "%s", line);
431out:
432 kfree(buf);
433 return ret;
434}
435
436static ssize_t devkmsg_read(struct file *file, char __user *buf,
437 size_t count, loff_t *ppos)
438{
439 struct devkmsg_user *user = file->private_data;
440 struct log *msg;
441 u64 ts_usec;
442 size_t i;
443 char cont = '-';
444 size_t len;
445 ssize_t ret;
446
447 if (!user)
448 return -EBADF;
449
450 ret = mutex_lock_interruptible(&user->lock);
451 if (ret)
452 return ret;
453 raw_spin_lock_irq(&logbuf_lock);
454 while (user->seq == log_next_seq) {
455 if (file->f_flags & O_NONBLOCK) {
456 ret = -EAGAIN;
457 raw_spin_unlock_irq(&logbuf_lock);
458 goto out;
459 }
460
461 raw_spin_unlock_irq(&logbuf_lock);
462 ret = wait_event_interruptible(log_wait,
463 user->seq != log_next_seq);
464 if (ret)
465 goto out;
466 raw_spin_lock_irq(&logbuf_lock);
467 }
468
469 if (user->seq < log_first_seq) {
470 /* our last seen message is gone, return error and reset */
471 user->idx = log_first_idx;
472 user->seq = log_first_seq;
473 ret = -EPIPE;
474 raw_spin_unlock_irq(&logbuf_lock);
475 goto out;
476 }
477
478 msg = log_from_idx(user->idx);
479 ts_usec = msg->ts_nsec;
480 do_div(ts_usec, 1000);
481
482 /*
483 * If we couldn't merge continuation line fragments during the print,
484 * export the stored flags to allow an optional external merge of the
 485 * records. Merging the records isn't necessarily correct, like
 486 * when we hit a race during printing. In most cases though, it produces
 487 * more readable output. 'c' in the record flags marks the first
488 * fragment of a line, '+' the following.
489 */
490 if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
491 cont = 'c';
492 else if ((msg->flags & LOG_CONT) ||
493 ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
494 cont = '+';
495
496 len = sprintf(user->buf, "%u,%llu,%llu,%c;",
497 (msg->facility << 3) | msg->level,
498 user->seq, ts_usec, cont);
499 user->prev = msg->flags;
500
501 /* escape non-printable characters */
502 for (i = 0; i < msg->text_len; i++) {
503 unsigned char c = log_text(msg)[i];
504
505 if (c < ' ' || c >= 127 || c == '\\')
506 len += sprintf(user->buf + len, "\\x%02x", c);
507 else
508 user->buf[len++] = c;
509 }
510 user->buf[len++] = '\n';
511
512 if (msg->dict_len) {
513 bool line = true;
514
515 for (i = 0; i < msg->dict_len; i++) {
516 unsigned char c = log_dict(msg)[i];
517
518 if (line) {
519 user->buf[len++] = ' ';
520 line = false;
521 }
522
523 if (c == '\0') {
524 user->buf[len++] = '\n';
525 line = true;
526 continue;
527 }
528
529 if (c < ' ' || c >= 127 || c == '\\') {
530 len += sprintf(user->buf + len, "\\x%02x", c);
531 continue;
532 }
533
534 user->buf[len++] = c;
535 }
536 user->buf[len++] = '\n';
537 }
538
539 user->idx = log_next(user->idx);
540 user->seq++;
541 raw_spin_unlock_irq(&logbuf_lock);
542
543 if (len > count) {
544 ret = -EINVAL;
545 goto out;
546 }
547
548 if (copy_to_user(buf, user->buf, len)) {
549 ret = -EFAULT;
550 goto out;
551 }
552 ret = len;
553out:
554 mutex_unlock(&user->lock);
555 return ret;
556}
557
558static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
559{
560 struct devkmsg_user *user = file->private_data;
561 loff_t ret = 0;
562
563 if (!user)
564 return -EBADF;
565 if (offset)
566 return -ESPIPE;
567
568 raw_spin_lock_irq(&logbuf_lock);
569 switch (whence) {
570 case SEEK_SET:
571 /* the first record */
572 user->idx = log_first_idx;
573 user->seq = log_first_seq;
574 break;
575 case SEEK_DATA:
576 /*
577 * The first record after the last SYSLOG_ACTION_CLEAR,
578 * like issued by 'dmesg -c'. Reading /dev/kmsg itself
579 * changes no global state, and does not clear anything.
580 */
581 user->idx = clear_idx;
582 user->seq = clear_seq;
583 break;
584 case SEEK_END:
585 /* after the last record */
586 user->idx = log_next_idx;
587 user->seq = log_next_seq;
588 break;
589 default:
590 ret = -EINVAL;
591 }
592 raw_spin_unlock_irq(&logbuf_lock);
593 return ret;
594}
595
596static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
597{
598 struct devkmsg_user *user = file->private_data;
599 int ret = 0;
600
601 if (!user)
602 return POLLERR|POLLNVAL;
603
604 poll_wait(file, &log_wait, wait);
605
606 raw_spin_lock_irq(&logbuf_lock);
607 if (user->seq < log_next_seq) {
608 /* return error when data has vanished underneath us */
609 if (user->seq < log_first_seq)
610 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
611 ret = POLLIN|POLLRDNORM;
612 }
613 raw_spin_unlock_irq(&logbuf_lock);
614
615 return ret;
616}
617
618static int devkmsg_open(struct inode *inode, struct file *file)
619{
620 struct devkmsg_user *user;
621 int err;
622
623 /* write-only does not need any file context */
624 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
625 return 0;
626
627 err = security_syslog(SYSLOG_ACTION_READ_ALL);
628 if (err)
629 return err;
630
631 user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
632 if (!user)
633 return -ENOMEM;
634
635 mutex_init(&user->lock);
636
637 raw_spin_lock_irq(&logbuf_lock);
638 user->idx = log_first_idx;
639 user->seq = log_first_seq;
640 raw_spin_unlock_irq(&logbuf_lock);
641
642 file->private_data = user;
643 return 0;
644}
645
646static int devkmsg_release(struct inode *inode, struct file *file)
647{
648 struct devkmsg_user *user = file->private_data;
649
650 if (!user)
651 return 0;
652 150
653 mutex_destroy(&user->lock); 151static char __log_buf[__LOG_BUF_LEN];
654 kfree(user); 152static char *log_buf = __log_buf;
655 return 0; 153static int log_buf_len = __LOG_BUF_LEN;
656} 154static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
657 155static int saved_console_loglevel = -1;
658const struct file_operations kmsg_fops = {
659 .open = devkmsg_open,
660 .read = devkmsg_read,
661 .aio_write = devkmsg_writev,
662 .llseek = devkmsg_llseek,
663 .poll = devkmsg_poll,
664 .release = devkmsg_release,
665};
666 156
667#ifdef CONFIG_KEXEC 157#ifdef CONFIG_KEXEC
668/* 158/*
@@ -676,18 +166,9 @@ const struct file_operations kmsg_fops = {
676void log_buf_kexec_setup(void) 166void log_buf_kexec_setup(void)
677{ 167{
678 VMCOREINFO_SYMBOL(log_buf); 168 VMCOREINFO_SYMBOL(log_buf);
169 VMCOREINFO_SYMBOL(log_end);
679 VMCOREINFO_SYMBOL(log_buf_len); 170 VMCOREINFO_SYMBOL(log_buf_len);
680 VMCOREINFO_SYMBOL(log_first_idx); 171 VMCOREINFO_SYMBOL(logged_chars);
681 VMCOREINFO_SYMBOL(log_next_idx);
682 /*
683 * Export struct log size and field offsets. User space tools can
684 * parse it and detect any changes to structure down the line.
685 */
686 VMCOREINFO_STRUCT_SIZE(log);
687 VMCOREINFO_OFFSET(log, ts_nsec);
688 VMCOREINFO_OFFSET(log, len);
689 VMCOREINFO_OFFSET(log, text_len);
690 VMCOREINFO_OFFSET(log, dict_len);
691} 172}
692#endif 173#endif
693 174
@@ -711,6 +192,7 @@ early_param("log_buf_len", log_buf_len_setup);
711void __init setup_log_buf(int early) 192void __init setup_log_buf(int early)
712{ 193{
713 unsigned long flags; 194 unsigned long flags;
195 unsigned start, dest_idx, offset;
714 char *new_log_buf; 196 char *new_log_buf;
715 int free; 197 int free;
716 198
@@ -721,7 +203,7 @@ void __init setup_log_buf(int early)
721 unsigned long mem; 203 unsigned long mem;
722 204
723 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); 205 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
724 if (!mem) 206 if (mem == MEMBLOCK_ERROR)
725 return; 207 return;
726 new_log_buf = __va(mem); 208 new_log_buf = __va(mem);
727 } else { 209 } else {
@@ -734,34 +216,31 @@ void __init setup_log_buf(int early)
734 return; 216 return;
735 } 217 }
736 218
737 raw_spin_lock_irqsave(&logbuf_lock, flags); 219 spin_lock_irqsave(&logbuf_lock, flags);
738 log_buf_len = new_log_buf_len; 220 log_buf_len = new_log_buf_len;
739 log_buf = new_log_buf; 221 log_buf = new_log_buf;
740 new_log_buf_len = 0; 222 new_log_buf_len = 0;
741 free = __LOG_BUF_LEN - log_next_idx; 223 free = __LOG_BUF_LEN - log_end;
742 memcpy(log_buf, __log_buf, __LOG_BUF_LEN); 224
743 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 225 offset = start = min(con_start, log_start);
226 dest_idx = 0;
227 while (start != log_end) {
228 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
229
230 log_buf[dest_idx] = __log_buf[log_idx_mask];
231 start++;
232 dest_idx++;
233 }
234 log_start -= offset;
235 con_start -= offset;
236 log_end -= offset;
237 spin_unlock_irqrestore(&logbuf_lock, flags);
744 238
745 pr_info("log_buf_len: %d\n", log_buf_len); 239 pr_info("log_buf_len: %d\n", log_buf_len);
746 pr_info("early log buf free: %d(%d%%)\n", 240 pr_info("early log buf free: %d(%d%%)\n",
747 free, (free * 100) / __LOG_BUF_LEN); 241 free, (free * 100) / __LOG_BUF_LEN);
748} 242}
749 243
750static bool __read_mostly ignore_loglevel;
751
752static int __init ignore_loglevel_setup(char *str)
753{
754 ignore_loglevel = 1;
755 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
756
757 return 0;
758}
759
760early_param("ignore_loglevel", ignore_loglevel_setup);
761module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
 762MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to "
 763 "print all kernel messages to the console.");
764
765#ifdef CONFIG_BOOT_PRINTK_DELAY 244#ifdef CONFIG_BOOT_PRINTK_DELAY
766 245
767static int boot_delay; /* msecs delay after each printk during bootup */ 246static int boot_delay; /* msecs delay after each printk during bootup */
@@ -785,15 +264,13 @@ static int __init boot_delay_setup(char *str)
785} 264}
786__setup("boot_delay=", boot_delay_setup); 265__setup("boot_delay=", boot_delay_setup);
787 266
788static void boot_delay_msec(int level) 267static void boot_delay_msec(void)
789{ 268{
790 unsigned long long k; 269 unsigned long long k;
791 unsigned long timeout; 270 unsigned long timeout;
792 271
793 if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) 272 if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
794 || (level >= console_loglevel && !ignore_loglevel)) {
795 return; 273 return;
796 }
797 274
798 k = (unsigned long long)loops_per_msec * boot_delay; 275 k = (unsigned long long)loops_per_msec * boot_delay;
799 276
@@ -812,11 +289,58 @@ static void boot_delay_msec(int level)
812 } 289 }
813} 290}
814#else 291#else
815static inline void boot_delay_msec(int level) 292static inline void boot_delay_msec(void)
816{ 293{
817} 294}
818#endif 295#endif
819 296
297/*
298 * Return the number of unread characters in the log buffer.
299 */
300static int log_buf_get_len(void)
301{
302 return logged_chars;
303}
304
305/*
306 * Clears the ring-buffer
307 */
308void log_buf_clear(void)
309{
310 logged_chars = 0;
311}
312
313/*
314 * Copy a range of characters from the log buffer.
315 */
316int log_buf_copy(char *dest, int idx, int len)
317{
318 int ret, max;
319 bool took_lock = false;
320
321 if (!oops_in_progress) {
322 spin_lock_irq(&logbuf_lock);
323 took_lock = true;
324 }
325
326 max = log_buf_get_len();
327 if (idx < 0 || idx >= max) {
328 ret = -1;
329 } else {
330 if (len > max - idx)
331 len = max - idx;
332 ret = len;
333 idx += (log_end - max);
334 while (len-- > 0)
335 dest[len] = LOG_BUF(idx + len);
336 }
337
338 if (took_lock)
339 spin_unlock_irq(&logbuf_lock);
340
341 return ret;
342}
343
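A hedged sketch of how an in-kernel consumer could drain the buffer through log_buf_copy() above; no such caller appears in this hunk, and the chunk size and consumer callback are hypothetical:

/* log_buf_copy() returns the number of bytes copied, or -1 once idx is
 * past the end of the logged data, so a simple loop drains everything. */
static void demo_dump_log(void (*consume)(const char *buf, int len))
{
	char chunk[256];
	int idx = 0;
	int n;

	while ((n = log_buf_copy(chunk, idx, sizeof(chunk))) > 0) {
		consume(chunk, n);
		idx += n;
	}
}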
820#ifdef CONFIG_SECURITY_DMESG_RESTRICT 344#ifdef CONFIG_SECURITY_DMESG_RESTRICT
821int dmesg_restrict = 1; 345int dmesg_restrict = 1;
822#else 346#else
@@ -856,275 +380,11 @@ static int check_syslog_permissions(int type, bool from_file)
856 return 0; 380 return 0;
857} 381}
858 382
859#if defined(CONFIG_PRINTK_TIME)
860static bool printk_time = 1;
861#else
862static bool printk_time;
863#endif
864module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
865
866static size_t print_time(u64 ts, char *buf)
867{
868 unsigned long rem_nsec;
869
870 if (!printk_time)
871 return 0;
872
873 rem_nsec = do_div(ts, 1000000000);
874
875 if (!buf)
876 return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
877
878 return sprintf(buf, "[%5lu.%06lu] ",
879 (unsigned long)ts, rem_nsec / 1000);
880}
881
882static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
883{
884 size_t len = 0;
885 unsigned int prefix = (msg->facility << 3) | msg->level;
886
887 if (syslog) {
888 if (buf) {
889 len += sprintf(buf, "<%u>", prefix);
890 } else {
891 len += 3;
892 if (prefix > 999)
893 len += 3;
894 else if (prefix > 99)
895 len += 2;
896 else if (prefix > 9)
897 len++;
898 }
899 }
900
901 len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
902 return len;
903}
904
905static size_t msg_print_text(const struct log *msg, enum log_flags prev,
906 bool syslog, char *buf, size_t size)
907{
908 const char *text = log_text(msg);
909 size_t text_size = msg->text_len;
910 bool prefix = true;
911 bool newline = true;
912 size_t len = 0;
913
914 if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
915 prefix = false;
916
917 if (msg->flags & LOG_CONT) {
918 if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
919 prefix = false;
920
921 if (!(msg->flags & LOG_NEWLINE))
922 newline = false;
923 }
924
925 do {
926 const char *next = memchr(text, '\n', text_size);
927 size_t text_len;
928
929 if (next) {
930 text_len = next - text;
931 next++;
932 text_size -= next - text;
933 } else {
934 text_len = text_size;
935 }
936
937 if (buf) {
938 if (print_prefix(msg, syslog, NULL) +
939 text_len + 1 >= size - len)
940 break;
941
942 if (prefix)
943 len += print_prefix(msg, syslog, buf + len);
944 memcpy(buf + len, text, text_len);
945 len += text_len;
946 if (next || newline)
947 buf[len++] = '\n';
948 } else {
949 /* SYSLOG_ACTION_* buffer size only calculation */
950 if (prefix)
951 len += print_prefix(msg, syslog, NULL);
952 len += text_len;
953 if (next || newline)
954 len++;
955 }
956
957 prefix = true;
958 text = next;
959 } while (text);
960
961 return len;
962}
963
964static int syslog_print(char __user *buf, int size)
965{
966 char *text;
967 struct log *msg;
968 int len = 0;
969
970 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
971 if (!text)
972 return -ENOMEM;
973
974 while (size > 0) {
975 size_t n;
976 size_t skip;
977
978 raw_spin_lock_irq(&logbuf_lock);
979 if (syslog_seq < log_first_seq) {
980 /* messages are gone, move to first one */
981 syslog_seq = log_first_seq;
982 syslog_idx = log_first_idx;
983 syslog_prev = 0;
984 syslog_partial = 0;
985 }
986 if (syslog_seq == log_next_seq) {
987 raw_spin_unlock_irq(&logbuf_lock);
988 break;
989 }
990
991 skip = syslog_partial;
992 msg = log_from_idx(syslog_idx);
993 n = msg_print_text(msg, syslog_prev, true, text,
994 LOG_LINE_MAX + PREFIX_MAX);
995 if (n - syslog_partial <= size) {
996 /* message fits into buffer, move forward */
997 syslog_idx = log_next(syslog_idx);
998 syslog_seq++;
999 syslog_prev = msg->flags;
1000 n -= syslog_partial;
1001 syslog_partial = 0;
1002 } else if (!len){
1003 /* partial read(), remember position */
1004 n = size;
1005 syslog_partial += n;
1006 } else
1007 n = 0;
1008 raw_spin_unlock_irq(&logbuf_lock);
1009
1010 if (!n)
1011 break;
1012
1013 if (copy_to_user(buf, text + skip, n)) {
1014 if (!len)
1015 len = -EFAULT;
1016 break;
1017 }
1018
1019 len += n;
1020 size -= n;
1021 buf += n;
1022 }
1023
1024 kfree(text);
1025 return len;
1026}
1027
1028static int syslog_print_all(char __user *buf, int size, bool clear)
1029{
1030 char *text;
1031 int len = 0;
1032
1033 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
1034 if (!text)
1035 return -ENOMEM;
1036
1037 raw_spin_lock_irq(&logbuf_lock);
1038 if (buf) {
1039 u64 next_seq;
1040 u64 seq;
1041 u32 idx;
1042 enum log_flags prev;
1043
1044 if (clear_seq < log_first_seq) {
1045 /* messages are gone, move to first available one */
1046 clear_seq = log_first_seq;
1047 clear_idx = log_first_idx;
1048 }
1049
1050 /*
1051 * Find first record that fits, including all following records,
1052 * into the user-provided buffer for this dump.
1053 */
1054 seq = clear_seq;
1055 idx = clear_idx;
1056 prev = 0;
1057 while (seq < log_next_seq) {
1058 struct log *msg = log_from_idx(idx);
1059
1060 len += msg_print_text(msg, prev, true, NULL, 0);
1061 prev = msg->flags;
1062 idx = log_next(idx);
1063 seq++;
1064 }
1065
1066 /* move first record forward until length fits into the buffer */
1067 seq = clear_seq;
1068 idx = clear_idx;
1069 prev = 0;
1070 while (len > size && seq < log_next_seq) {
1071 struct log *msg = log_from_idx(idx);
1072
1073 len -= msg_print_text(msg, prev, true, NULL, 0);
1074 prev = msg->flags;
1075 idx = log_next(idx);
1076 seq++;
1077 }
1078
1079 /* last message fitting into this dump */
1080 next_seq = log_next_seq;
1081
1082 len = 0;
1083 prev = 0;
1084 while (len >= 0 && seq < next_seq) {
1085 struct log *msg = log_from_idx(idx);
1086 int textlen;
1087
1088 textlen = msg_print_text(msg, prev, true, text,
1089 LOG_LINE_MAX + PREFIX_MAX);
1090 if (textlen < 0) {
1091 len = textlen;
1092 break;
1093 }
1094 idx = log_next(idx);
1095 seq++;
1096 prev = msg->flags;
1097
1098 raw_spin_unlock_irq(&logbuf_lock);
1099 if (copy_to_user(buf + len, text, textlen))
1100 len = -EFAULT;
1101 else
1102 len += textlen;
1103 raw_spin_lock_irq(&logbuf_lock);
1104
1105 if (seq < log_first_seq) {
1106 /* messages are gone, move to next one */
1107 seq = log_first_seq;
1108 idx = log_first_idx;
1109 prev = 0;
1110 }
1111 }
1112 }
1113
1114 if (clear) {
1115 clear_seq = log_next_seq;
1116 clear_idx = log_next_idx;
1117 }
1118 raw_spin_unlock_irq(&logbuf_lock);
1119
1120 kfree(text);
1121 return len;
1122}
1123
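The sizing logic in syslog_print_all() above is a two-pass fit: total up every record, then drop records from the front until what remains fits the user buffer. A compact userspace sketch of the same idea; the record lengths and buffer size are illustrative:

#include <stdio.h>

int main(void)
{
	unsigned int rec_len[] = { 40, 80, 25, 300, 60 };  /* per-record lengths */
	unsigned int nrec = sizeof(rec_len) / sizeof(rec_len[0]);
	unsigned int size = 400;                           /* user buffer size */
	unsigned int len = 0, first = 0, i;

	for (i = 0; i < nrec; i++)          /* pass 1: total length of the dump */
		len += rec_len[i];
	while (len > size && first < nrec)  /* pass 2: drop the oldest records */
		len -= rec_len[first++];
	printf("dump records %u..%u, %u bytes\n", first, nrec - 1, len);
	return 0;
}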
1124int do_syslog(int type, char __user *buf, int len, bool from_file) 383int do_syslog(int type, char __user *buf, int len, bool from_file)
1125{ 384{
1126 bool clear = false; 385 unsigned i, j, limit, count;
1127 static int saved_console_loglevel = -1; 386 int do_clear = 0;
387 char c;
1128 int error; 388 int error;
1129 389
1130 error = check_syslog_permissions(type, from_file); 390 error = check_syslog_permissions(type, from_file);
@@ -1152,14 +412,28 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1152 goto out; 412 goto out;
1153 } 413 }
1154 error = wait_event_interruptible(log_wait, 414 error = wait_event_interruptible(log_wait,
1155 syslog_seq != log_next_seq); 415 (log_start - log_end));
1156 if (error) 416 if (error)
1157 goto out; 417 goto out;
1158 error = syslog_print(buf, len); 418 i = 0;
419 spin_lock_irq(&logbuf_lock);
420 while (!error && (log_start != log_end) && i < len) {
421 c = LOG_BUF(log_start);
422 log_start++;
423 spin_unlock_irq(&logbuf_lock);
424 error = __put_user(c,buf);
425 buf++;
426 i++;
427 cond_resched();
428 spin_lock_irq(&logbuf_lock);
429 }
430 spin_unlock_irq(&logbuf_lock);
431 if (!error)
432 error = i;
1159 break; 433 break;
1160 /* Read/clear last kernel messages */ 434 /* Read/clear last kernel messages */
1161 case SYSLOG_ACTION_READ_CLEAR: 435 case SYSLOG_ACTION_READ_CLEAR:
1162 clear = true; 436 do_clear = 1;
1163 /* FALL THRU */ 437 /* FALL THRU */
1164 /* Read last kernel messages */ 438 /* Read last kernel messages */
1165 case SYSLOG_ACTION_READ_ALL: 439 case SYSLOG_ACTION_READ_ALL:
@@ -1173,11 +447,51 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1173 error = -EFAULT; 447 error = -EFAULT;
1174 goto out; 448 goto out;
1175 } 449 }
1176 error = syslog_print_all(buf, len, clear); 450 count = len;
451 if (count > log_buf_len)
452 count = log_buf_len;
453 spin_lock_irq(&logbuf_lock);
454 if (count > logged_chars)
455 count = logged_chars;
456 if (do_clear)
457 logged_chars = 0;
458 limit = log_end;
459 /*
460 * __put_user() could sleep, and while we sleep
461 * printk() could overwrite the messages
462 * we try to copy to user space. Therefore
463 * the messages are copied in reverse. <manfreds>
464 */
465 for (i = 0; i < count && !error; i++) {
466 j = limit-1-i;
467 if (j + log_buf_len < log_end)
468 break;
469 c = LOG_BUF(j);
470 spin_unlock_irq(&logbuf_lock);
471 error = __put_user(c,&buf[count-1-i]);
472 cond_resched();
473 spin_lock_irq(&logbuf_lock);
474 }
475 spin_unlock_irq(&logbuf_lock);
476 if (error)
477 break;
478 error = i;
479 if (i != count) {
480 int offset = count-error;
481 /* buffer overflow during copy, correct user buffer. */
482 for (i = 0; i < error; i++) {
483 if (__get_user(c,&buf[i+offset]) ||
484 __put_user(c,&buf[i])) {
485 error = -EFAULT;
486 break;
487 }
488 cond_resched();
489 }
490 }
1177 break; 491 break;
1178 /* Clear ring buffer */ 492 /* Clear ring buffer */
1179 case SYSLOG_ACTION_CLEAR: 493 case SYSLOG_ACTION_CLEAR:
1180 syslog_print_all(NULL, 0, true); 494 logged_chars = 0;
1181 break; 495 break;
1182 /* Disable logging to console */ 496 /* Disable logging to console */
1183 case SYSLOG_ACTION_CONSOLE_OFF: 497 case SYSLOG_ACTION_CONSOLE_OFF:
@@ -1206,38 +520,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1206 break; 520 break;
1207 /* Number of chars in the log buffer */ 521 /* Number of chars in the log buffer */
1208 case SYSLOG_ACTION_SIZE_UNREAD: 522 case SYSLOG_ACTION_SIZE_UNREAD:
1209 raw_spin_lock_irq(&logbuf_lock); 523 error = log_end - log_start;
1210 if (syslog_seq < log_first_seq) {
1211 /* messages are gone, move to first one */
1212 syslog_seq = log_first_seq;
1213 syslog_idx = log_first_idx;
1214 syslog_prev = 0;
1215 syslog_partial = 0;
1216 }
1217 if (from_file) {
1218 /*
1219 * Short-cut for poll(/"proc/kmsg") which simply checks
1220 * for pending data, not the size; return the count of
1221 * records, not the length.
1222 */
1223 error = log_next_idx - syslog_idx;
1224 } else {
1225 u64 seq = syslog_seq;
1226 u32 idx = syslog_idx;
1227 enum log_flags prev = syslog_prev;
1228
1229 error = 0;
1230 while (seq < log_next_seq) {
1231 struct log *msg = log_from_idx(idx);
1232
1233 error += msg_print_text(msg, prev, true, NULL, 0);
1234 idx = log_next(idx);
1235 seq++;
1236 prev = msg->flags;
1237 }
1238 error -= syslog_partial;
1239 }
1240 raw_spin_unlock_irq(&logbuf_lock);
1241 break; 524 break;
1242 /* Size of the log buffer */ 525 /* Size of the log buffer */
1243 case SYSLOG_ACTION_SIZE_BUFFER: 526 case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1256,34 +539,189 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1256 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 539 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1257} 540}
1258 541
542#ifdef CONFIG_KGDB_KDB
543/* kdb dmesg command needs access to the syslog buffer. do_syslog()
544 * uses locks so it cannot be used during debugging. Just tell kdb
545 * where the start and end of the physical and logical logs are. This
546 * is equivalent to do_syslog(3).
547 */
548void kdb_syslog_data(char *syslog_data[4])
549{
550 syslog_data[0] = log_buf;
551 syslog_data[1] = log_buf + log_buf_len;
552 syslog_data[2] = log_buf + log_end -
553 (logged_chars < log_buf_len ? logged_chars : log_buf_len);
554 syslog_data[3] = log_buf + log_end;
555}
556#endif /* CONFIG_KGDB_KDB */
557
1259/* 558/*
1260 * Call the console drivers, asking them to write out 559 * Call the console drivers on a range of log_buf
1261 * log_buf[start] to log_buf[end - 1].
1262 * The console_lock must be held.
1263 */ 560 */
1264static void call_console_drivers(int level, const char *text, size_t len) 561static void __call_console_drivers(unsigned start, unsigned end)
1265{ 562{
1266 struct console *con; 563 struct console *con;
1267 564
1268 trace_console(text, 0, len, len);
1269
1270 if (level >= console_loglevel && !ignore_loglevel)
1271 return;
1272 if (!console_drivers)
1273 return;
1274
1275 for_each_console(con) { 565 for_each_console(con) {
1276 if (exclusive_console && con != exclusive_console) 566 if (exclusive_console && con != exclusive_console)
1277 continue; 567 continue;
1278 if (!(con->flags & CON_ENABLED)) 568 if ((con->flags & CON_ENABLED) && con->write &&
1279 continue; 569 (cpu_online(smp_processor_id()) ||
1280 if (!con->write) 570 (con->flags & CON_ANYTIME)))
1281 continue; 571 con->write(con, &LOG_BUF(start), end - start);
1282 if (!cpu_online(smp_processor_id()) && 572 }
1283 !(con->flags & CON_ANYTIME)) 573}
1284 continue; 574
1285 con->write(con, text, len); 575static int __read_mostly ignore_loglevel;
576
577static int __init ignore_loglevel_setup(char *str)
578{
579 ignore_loglevel = 1;
580 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
581
582 return 0;
583}
584
585early_param("ignore_loglevel", ignore_loglevel_setup);
586
587/*
588 * Write out chars from start to end - 1 inclusive
589 */
590static void _call_console_drivers(unsigned start,
591 unsigned end, int msg_log_level)
592{
593 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
594 console_drivers && start != end) {
595 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
596 /* wrapped write */
597 __call_console_drivers(start & LOG_BUF_MASK,
598 log_buf_len);
599 __call_console_drivers(0, end & LOG_BUF_MASK);
600 } else {
601 __call_console_drivers(start, end);
602 }
603 }
604}
605
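A worked example of the wrap test in _call_console_drivers() above: with a power-of-two buffer, a range whose masked start exceeds its masked end has wrapped, so it is written out as two pieces. The buffer size and indices below are illustrative:

#include <stdio.h>

#define BUF_LEN  16
#define BUF_MASK (BUF_LEN - 1)

int main(void)
{
	unsigned int start = 14, end = 19;	/* unmasked; the range crosses the end */

	if ((start & BUF_MASK) > (end & BUF_MASK))
		/* wrapped: emit [14..16) and then [0..3) */
		printf("write [%u..%u) and [0..%u)\n",
		       start & BUF_MASK, BUF_LEN, end & BUF_MASK);
	else
		printf("write [%u..%u)\n", start, end);
	return 0;
}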
606/*
 607 * Parse the syslog header <[0-9]*>. The decimal value is a 32-bit number; the
 608 * lower 3 bits are the log level, the rest are the log facility. In case
609 * userspace passes usual userspace syslog messages to /dev/kmsg or
610 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
611 * to extract the correct log level for in-kernel processing, and not mangle
612 * the original value.
613 *
614 * If a prefix is found, the length of the prefix is returned. If 'level' is
615 * passed, it will be filled in with the log level without a possible facility
616 * value. If 'special' is passed, the special printk prefix chars are accepted
617 * and returned. If no valid header is found, 0 is returned and the passed
618 * variables are not touched.
619 */
620static size_t log_prefix(const char *p, unsigned int *level, char *special)
621{
622 unsigned int lev = 0;
623 char sp = '\0';
624 size_t len;
625
626 if (p[0] != '<' || !p[1])
627 return 0;
628 if (p[2] == '>') {
629 /* usual single digit level number or special char */
630 switch (p[1]) {
631 case '0' ... '7':
632 lev = p[1] - '0';
633 break;
634 case 'c': /* KERN_CONT */
635 case 'd': /* KERN_DEFAULT */
636 sp = p[1];
637 break;
638 default:
639 return 0;
640 }
641 len = 3;
642 } else {
643 /* multi digit including the level and facility number */
644 char *endp = NULL;
645
646 if (p[1] < '0' && p[1] > '9')
647 return 0;
648
649 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
650 if (endp == NULL || endp[0] != '>')
651 return 0;
652 len = (endp + 1) - p;
653 }
654
655 /* do not accept special char if not asked for */
656 if (sp && !special)
657 return 0;
658
659 if (special) {
660 *special = sp;
661 /* return special char, do not touch level */
662 if (sp)
663 return len;
664 }
665
666 if (level)
667 *level = lev;
668 return len;
669}
670
671/*
672 * Call the console drivers, asking them to write out
673 * log_buf[start] to log_buf[end - 1].
674 * The console_lock must be held.
675 */
676static void call_console_drivers(unsigned start, unsigned end)
677{
678 unsigned cur_index, start_print;
679 static int msg_level = -1;
680
681 BUG_ON(((int)(start - end)) > 0);
682
683 cur_index = start;
684 start_print = start;
685 while (cur_index != end) {
686 if (msg_level < 0 && ((end - cur_index) > 2)) {
687 /* strip log prefix */
688 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
689 start_print = cur_index;
690 }
691 while (cur_index != end) {
692 char c = LOG_BUF(cur_index);
693
694 cur_index++;
695 if (c == '\n') {
696 if (msg_level < 0) {
697 /*
698 * printk() has already given us loglevel tags in
699 * the buffer. This code is here in case the
700 * log buffer has wrapped right round and scribbled
701 * on those tags
702 */
703 msg_level = default_message_loglevel;
704 }
705 _call_console_drivers(start_print, cur_index, msg_level);
706 msg_level = -1;
707 start_print = cur_index;
708 break;
709 }
710 }
1286 } 711 }
712 _call_console_drivers(start_print, end, msg_level);
713}
714
715static void emit_log_char(char c)
716{
717 LOG_BUF(log_end) = c;
718 log_end++;
719 if (log_end - log_start > log_buf_len)
720 log_start = log_end - log_buf_len;
721 if (log_end - con_start > log_buf_len)
722 con_start = log_end - log_buf_len;
723 if (logged_chars < log_buf_len)
724 logged_chars++;
1287} 725}
1288 726
1289/* 727/*
@@ -1301,13 +739,19 @@ static void zap_locks(void)
1301 739
1302 oops_timestamp = jiffies; 740 oops_timestamp = jiffies;
1303 741
1304 debug_locks_off();
1305 /* If a crash is occurring, make sure we can't deadlock */ 742 /* If a crash is occurring, make sure we can't deadlock */
1306 raw_spin_lock_init(&logbuf_lock); 743 spin_lock_init(&logbuf_lock);
1307 /* And make sure that we print immediately */ 744 /* And make sure that we print immediately */
1308 sema_init(&console_sem, 1); 745 sema_init(&console_sem, 1);
1309} 746}
1310 747
748#if defined(CONFIG_PRINTK_TIME)
749static int printk_time = 1;
750#else
751static int printk_time = 0;
752#endif
753module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
754
1311/* Check if we have any console registered that can be called early in boot. */ 755/* Check if we have any console registered that can be called early in boot. */
1312static int have_callable_console(void) 756static int have_callable_console(void)
1313{ 757{
@@ -1320,6 +764,51 @@ static int have_callable_console(void)
1320 return 0; 764 return 0;
1321} 765}
1322 766
767/**
768 * printk - print a kernel message
769 * @fmt: format string
770 *
771 * This is printk(). It can be called from any context. We want it to work.
772 *
773 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
774 * call the console drivers. If we fail to get the semaphore we place the output
775 * into the log buffer and return. The current holder of the console_sem will
776 * notice the new output in console_unlock(); and will send it to the
777 * consoles before releasing the lock.
778 *
779 * One effect of this deferred printing is that code which calls printk() and
780 * then changes console_loglevel may break. This is because console_loglevel
781 * is inspected when the actual printing occurs.
782 *
783 * See also:
784 * printf(3)
785 *
786 * See the vsnprintf() documentation for format string extensions over C99.
787 */
788
789asmlinkage int printk(const char *fmt, ...)
790{
791 va_list args;
792 int r;
793
794#ifdef CONFIG_KGDB_KDB
795 if (unlikely(kdb_trap_printk)) {
796 va_start(args, fmt);
797 r = vkdb_printf(fmt, args);
798 va_end(args);
799 return r;
800 }
801#endif
802 va_start(args, fmt);
803 r = vprintk(fmt, args);
804 va_end(args);
805
806 return r;
807}
808
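For orientation, the call-site convention that feeds this path: the KERN_* macros of this kernel generation expand to a "<N>" string which the prefix parsing above strips back out. A hedged fragment, not a complete module; the message text is arbitrary:

	/* KERN_INFO is "<6>" here, so the line is logged at level 6 and the
	 * console path later compares that against console_loglevel. */
	printk(KERN_INFO "demo: initialized %d channels\n", 4);

	/* Without a prefix, default_message_loglevel is used instead. */
	printk("demo: no explicit loglevel on this one\n");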
809/* cpu currently holding logbuf_lock */
810static volatile unsigned int printk_cpu = UINT_MAX;
811
1323/* 812/*
1324 * Can we actually use the console at this time on this cpu? 813 * Can we actually use the console at this time on this cpu?
1325 * 814 *
@@ -1363,12 +852,17 @@ static int console_trylock_for_printk(unsigned int cpu)
1363 retval = 0; 852 retval = 0;
1364 } 853 }
1365 } 854 }
1366 logbuf_cpu = UINT_MAX; 855 printk_cpu = UINT_MAX;
856 spin_unlock(&logbuf_lock);
1367 if (wake) 857 if (wake)
1368 up(&console_sem); 858 up(&console_sem);
1369 raw_spin_unlock(&logbuf_lock);
1370 return retval; 859 return retval;
1371} 860}
861static const char recursion_bug_msg [] =
862 KERN_CRIT "BUG: recent printk recursion!\n";
863static int recursion_bug;
864static int new_text_line = 1;
865static char printk_buf[1024];
1372 866
1373int printk_delay_msec __read_mostly; 867int printk_delay_msec __read_mostly;
1374 868
@@ -1384,134 +878,28 @@ static inline void printk_delay(void)
1384 } 878 }
1385} 879}
1386 880
1387/* 881asmlinkage int vprintk(const char *fmt, va_list args)
1388 * Continuation lines are buffered, and not committed to the record buffer
1389 * until the line is complete, or a race forces it. The line fragments
1390 * though, are printed immediately to the consoles to ensure everything has
1391 * reached the console in case of a kernel crash.
1392 */
1393static struct cont {
1394 char buf[LOG_LINE_MAX];
1395 size_t len; /* length == 0 means unused buffer */
1396 size_t cons; /* bytes written to console */
1397 struct task_struct *owner; /* task of first print*/
1398 u64 ts_nsec; /* time of first print */
1399 u8 level; /* log level of first message */
1400 u8 facility; /* log level of first message */
1401 enum log_flags flags; /* prefix, newline flags */
1402 bool flushed:1; /* buffer sealed and committed */
1403} cont;
1404
1405static void cont_flush(enum log_flags flags)
1406{
1407 if (cont.flushed)
1408 return;
1409 if (cont.len == 0)
1410 return;
1411
1412 if (cont.cons) {
1413 /*
1414 * If a fragment of this line was directly flushed to the
1415 * console; wait for the console to pick up the rest of the
1416 * line. LOG_NOCONS suppresses a duplicated output.
1417 */
1418 log_store(cont.facility, cont.level, flags | LOG_NOCONS,
1419 cont.ts_nsec, NULL, 0, cont.buf, cont.len);
1420 cont.flags = flags;
1421 cont.flushed = true;
1422 } else {
1423 /*
1424 * If no fragment of this line ever reached the console,
1425 * just submit it to the store and free the buffer.
1426 */
1427 log_store(cont.facility, cont.level, flags, 0,
1428 NULL, 0, cont.buf, cont.len);
1429 cont.len = 0;
1430 }
1431}
1432
1433static bool cont_add(int facility, int level, const char *text, size_t len)
1434{
1435 if (cont.len && cont.flushed)
1436 return false;
1437
1438 if (cont.len + len > sizeof(cont.buf)) {
1439 /* the line gets too long, split it up in separate records */
1440 cont_flush(LOG_CONT);
1441 return false;
1442 }
1443
1444 if (!cont.len) {
1445 cont.facility = facility;
1446 cont.level = level;
1447 cont.owner = current;
1448 cont.ts_nsec = local_clock();
1449 cont.flags = 0;
1450 cont.cons = 0;
1451 cont.flushed = false;
1452 }
1453
1454 memcpy(cont.buf + cont.len, text, len);
1455 cont.len += len;
1456
1457 if (cont.len > (sizeof(cont.buf) * 80) / 100)
1458 cont_flush(LOG_CONT);
1459
1460 return true;
1461}
1462
1463static size_t cont_print_text(char *text, size_t size)
1464{
1465 size_t textlen = 0;
1466 size_t len;
1467
1468 if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
1469 textlen += print_time(cont.ts_nsec, text);
1470 size -= textlen;
1471 }
1472
1473 len = cont.len - cont.cons;
1474 if (len > 0) {
1475 if (len+1 > size)
1476 len = size-1;
1477 memcpy(text + textlen, cont.buf + cont.cons, len);
1478 textlen += len;
1479 cont.cons = cont.len;
1480 }
1481
1482 if (cont.flushed) {
1483 if (cont.flags & LOG_NEWLINE)
1484 text[textlen++] = '\n';
1485 /* got everything, release buffer */
1486 cont.len = 0;
1487 }
1488 return textlen;
1489}
1490
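The continuation buffer above is what lets call sites split one logical line across several printk() calls and still end up with a single record. A hedged call-site fragment; the messages are arbitrary:

	/* The first call opens the continuation buffer, KERN_CONT appends to
	 * it, and the trailing '\n' makes cont_flush() commit the merged text
	 * as one record. */
	printk(KERN_INFO "demo: probing lanes:");
	printk(KERN_CONT " lane0 ok");
	printk(KERN_CONT " lane1 ok\n");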
1491asmlinkage int vprintk_emit(int facility, int level,
1492 const char *dict, size_t dictlen,
1493 const char *fmt, va_list args)
1494{ 882{
1495 static int recursion_bug; 883 int printed_len = 0;
1496 static char textbuf[LOG_LINE_MAX]; 884 int current_log_level = default_message_loglevel;
1497 char *text = textbuf;
1498 size_t text_len;
1499 enum log_flags lflags = 0;
1500 unsigned long flags; 885 unsigned long flags;
1501 int this_cpu; 886 int this_cpu;
1502 int printed_len = 0; 887 char *p;
888 size_t plen;
889 char special;
1503 890
1504 boot_delay_msec(level); 891 boot_delay_msec();
1505 printk_delay(); 892 printk_delay();
1506 893
894 preempt_disable();
1507 /* This stops the holder of console_sem just where we want him */ 895 /* This stops the holder of console_sem just where we want him */
1508 local_irq_save(flags); 896 raw_local_irq_save(flags);
1509 this_cpu = smp_processor_id(); 897 this_cpu = smp_processor_id();
1510 898
1511 /* 899 /*
1512 * Ouch, printk recursed into itself! 900 * Ouch, printk recursed into itself!
1513 */ 901 */
1514 if (unlikely(logbuf_cpu == this_cpu)) { 902 if (unlikely(printk_cpu == this_cpu)) {
1515 /* 903 /*
1516 * If a crash is occurring during printk() on this CPU, 904 * If a crash is occurring during printk() on this CPU,
1517 * then try to get the crash message out but make sure 905 * then try to get the crash message out but make sure
@@ -1519,7 +907,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1519 * recursion and return - but flag the recursion so that 907 * recursion and return - but flag the recursion so that
1520 * it can be printed at the next appropriate moment: 908 * it can be printed at the next appropriate moment:
1521 */ 909 */
1522 if (!oops_in_progress && !lockdep_recursing(current)) { 910 if (!oops_in_progress) {
1523 recursion_bug = 1; 911 recursion_bug = 1;
1524 goto out_restore_irqs; 912 goto out_restore_irqs;
1525 } 913 }
@@ -1527,201 +915,123 @@ asmlinkage int vprintk_emit(int facility, int level,
1527 } 915 }
1528 916
1529 lockdep_off(); 917 lockdep_off();
1530 raw_spin_lock(&logbuf_lock); 918 spin_lock(&logbuf_lock);
1531 logbuf_cpu = this_cpu; 919 printk_cpu = this_cpu;
1532 920
1533 if (recursion_bug) { 921 if (recursion_bug) {
1534 static const char recursion_msg[] =
1535 "BUG: recent printk recursion!";
1536
1537 recursion_bug = 0; 922 recursion_bug = 0;
1538 printed_len += strlen(recursion_msg); 923 strcpy(printk_buf, recursion_bug_msg);
1539 /* emit KERN_CRIT message */ 924 printed_len = strlen(recursion_bug_msg);
1540 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1541 NULL, 0, recursion_msg, printed_len);
1542 } 925 }
926 /* Emit the output into the temporary buffer */
927 printed_len += vscnprintf(printk_buf + printed_len,
928 sizeof(printk_buf) - printed_len, fmt, args);
1543 929
1544 /* 930#ifdef CONFIG_DEBUG_LL
1545 * The printf needs to come first; we need the syslog 931 printascii(printk_buf);
1546 * prefix which might be passed-in as a parameter. 932#endif
1547 */
1548 text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
1549 933
1550 /* mark and strip a trailing newline */ 934 p = printk_buf;
1551 if (text_len && text[text_len-1] == '\n') {
1552 text_len--;
1553 lflags |= LOG_NEWLINE;
1554 }
1555 935
1556 /* strip kernel syslog prefix and extract log level or control flags */ 936 /* Read log level and handle special printk prefix */
1557 if (facility == 0) { 937 plen = log_prefix(p, &current_log_level, &special);
1558 int kern_level = printk_get_level(text); 938 if (plen) {
1559 939 p += plen;
1560 if (kern_level) { 940
1561 const char *end_of_header = printk_skip_level(text); 941 switch (special) {
1562 switch (kern_level) { 942 case 'c': /* Strip <c> KERN_CONT, continue line */
1563 case '0' ... '7': 943 plen = 0;
1564 if (level == -1) 944 break;
1565 level = kern_level - '0'; 945 case 'd': /* Strip <d> KERN_DEFAULT, start new line */
1566 case 'd': /* KERN_DEFAULT */ 946 plen = 0;
1567 lflags |= LOG_PREFIX; 947 default:
1568 case 'c': /* KERN_CONT */ 948 if (!new_text_line) {
1569 break; 949 emit_log_char('\n');
950 new_text_line = 1;
1570 } 951 }
1571 text_len -= end_of_header - text;
1572 text = (char *)end_of_header;
1573 } 952 }
1574 } 953 }
1575 954
1576 if (level == -1) 955 /*
1577 level = default_message_loglevel; 956 * Copy the output into log_buf. If the caller didn't provide
1578 957 * the appropriate log prefix, we insert them here
1579 if (dict) 958 */
1580 lflags |= LOG_PREFIX|LOG_NEWLINE; 959 for (; *p; p++) {
1581 960 if (new_text_line) {
1582 if (!(lflags & LOG_NEWLINE)) { 961 new_text_line = 0;
1583 /* 962
1584 * Flush the conflicting buffer. An earlier newline was missing, 963 if (plen) {
1585 * or another task also prints continuation lines. 964 /* Copy original log prefix */
1586 */ 965 int i;
1587 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) 966
1588 cont_flush(LOG_NEWLINE); 967 for (i = 0; i < plen; i++)
968 emit_log_char(printk_buf[i]);
969 printed_len += plen;
970 } else {
971 /* Add log prefix */
972 emit_log_char('<');
973 emit_log_char(current_log_level + '0');
974 emit_log_char('>');
975 printed_len += 3;
976 }
1589 977
1590 /* buffer line if possible, otherwise store it right away */ 978 if (printk_time) {
1591 if (!cont_add(facility, level, text, text_len)) 979 /* Add the current time stamp */
1592 log_store(facility, level, lflags | LOG_CONT, 0, 980 char tbuf[50], *tp;
1593 dict, dictlen, text, text_len); 981 unsigned tlen;
1594 } else { 982 unsigned long long t;
1595 bool stored = false; 983 unsigned long nanosec_rem;
984
985 t = cpu_clock(printk_cpu);
986 nanosec_rem = do_div(t, 1000000000);
987 tlen = sprintf(tbuf, "[%5lu.%06lu] ",
988 (unsigned long) t,
989 nanosec_rem / 1000);
990
991 for (tp = tbuf; tp < tbuf + tlen; tp++)
992 emit_log_char(*tp);
993 printed_len += tlen;
994 }
1596 995
1597 /* 996 if (!*p)
1598 * If an earlier newline was missing and it was the same task, 997 break;
1599 * either merge it with the current buffer and flush, or if
1600 * there was a race with interrupts (prefix == true) then just
1601 * flush it out and store this line separately.
1602 */
1603 if (cont.len && cont.owner == current) {
1604 if (!(lflags & LOG_PREFIX))
1605 stored = cont_add(facility, level, text, text_len);
1606 cont_flush(LOG_NEWLINE);
1607 } 998 }
1608 999
1609 if (!stored) 1000 emit_log_char(*p);
1610 log_store(facility, level, lflags, 0, 1001 if (*p == '\n')
1611 dict, dictlen, text, text_len); 1002 new_text_line = 1;
1612 } 1003 }
1613 printed_len += text_len;
1614 1004
1615 /* 1005 /*
1616 * Try to acquire and then immediately release the console semaphore. 1006 * Try to acquire and then immediately release the
1617 * The release will print out buffers and wake up /dev/kmsg and syslog() 1007 * console semaphore. The release will do all the
1618 * users. 1008 * actual magic (print out buffers, wake up klogd,
1009 * etc).
1619 * 1010 *
1620 * The console_trylock_for_printk() function will release 'logbuf_lock' 1011 * The console_trylock_for_printk() function
1621 * regardless of whether it actually gets the console semaphore or not. 1012 * will release 'logbuf_lock' regardless of whether it
1013 * actually gets the semaphore or not.
1622 */ 1014 */
1623 if (console_trylock_for_printk(this_cpu)) 1015 if (console_trylock_for_printk(this_cpu))
1624 console_unlock(); 1016 console_unlock();
1625 1017
1626 lockdep_on(); 1018 lockdep_on();
1627out_restore_irqs: 1019out_restore_irqs:
1628 local_irq_restore(flags); 1020 raw_local_irq_restore(flags);
1629 1021
1022 preempt_enable();
1630 return printed_len; 1023 return printed_len;
1631} 1024}
1632EXPORT_SYMBOL(vprintk_emit); 1025EXPORT_SYMBOL(printk);
1633
1634asmlinkage int vprintk(const char *fmt, va_list args)
1635{
1636 return vprintk_emit(0, -1, NULL, 0, fmt, args);
1637}
1638EXPORT_SYMBOL(vprintk); 1026EXPORT_SYMBOL(vprintk);
1639 1027
1640asmlinkage int printk_emit(int facility, int level, 1028#else
1641 const char *dict, size_t dictlen,
1642 const char *fmt, ...)
1643{
1644 va_list args;
1645 int r;
1646
1647 va_start(args, fmt);
1648 r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
1649 va_end(args);
1650
1651 return r;
1652}
1653EXPORT_SYMBOL(printk_emit);
1654 1029
1655/** 1030static void call_console_drivers(unsigned start, unsigned end)
1656 * printk - print a kernel message
1657 * @fmt: format string
1658 *
1659 * This is printk(). It can be called from any context. We want it to work.
1660 *
1661 * We try to grab the console_lock. If we succeed, it's easy - we log the
1662 * output and call the console drivers. If we fail to get the semaphore, we
1663 * place the output into the log buffer and return. The current holder of
1664 * the console_sem will notice the new output in console_unlock(); and will
1665 * send it to the consoles before releasing the lock.
1666 *
1667 * One effect of this deferred printing is that code which calls printk() and
1668 * then changes console_loglevel may break. This is because console_loglevel
1669 * is inspected when the actual printing occurs.
1670 *
1671 * See also:
1672 * printf(3)
1673 *
1674 * See the vsnprintf() documentation for format string extensions over C99.
1675 */
1676asmlinkage int printk(const char *fmt, ...)
1677{ 1031{
1678 va_list args;
1679 int r;
1680
1681#ifdef CONFIG_KGDB_KDB
1682 if (unlikely(kdb_trap_printk)) {
1683 va_start(args, fmt);
1684 r = vkdb_printf(fmt, args);
1685 va_end(args);
1686 return r;
1687 }
1688#endif
1689 va_start(args, fmt);
1690 r = vprintk_emit(0, -1, NULL, 0, fmt, args);
1691 va_end(args);
1692
1693 return r;
1694} 1032}
1695EXPORT_SYMBOL(printk);
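A minimal usage sketch of the printk() interface documented above (illustrative only, not part of this patch), assuming a trivial module context:

#include <linux/kernel.h>
#include <linux/module.h>

static int __init printk_demo_init(void)
{
        /* The KERN_* marker travels inside the format string. */
        printk(KERN_INFO "printk_demo: loaded (HZ=%d)\n", HZ);
        /* No marker: the default message loglevel is used instead. */
        printk("printk_demo: second line\n");
        return 0;
}
module_init(printk_demo_init);
MODULE_LICENSE("GPL");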
1696 1033
1697#else /* CONFIG_PRINTK */ 1034#endif
1698
1699#define LOG_LINE_MAX 0
1700#define PREFIX_MAX 0
1701#define LOG_LINE_MAX 0
1702static u64 syslog_seq;
1703static u32 syslog_idx;
1704static u64 console_seq;
1705static u32 console_idx;
1706static enum log_flags syslog_prev;
1707static u64 log_first_seq;
1708static u32 log_first_idx;
1709static u64 log_next_seq;
1710static enum log_flags console_prev;
1711static struct cont {
1712 size_t len;
1713 size_t cons;
1714 u8 level;
1715 bool flushed:1;
1716} cont;
1717static struct log *log_from_idx(u32 idx) { return NULL; }
1718static u32 log_next(u32 idx) { return 0; }
1719static void call_console_drivers(int level, const char *text, size_t len) {}
1720static size_t msg_print_text(const struct log *msg, enum log_flags prev,
1721 bool syslog, char *buf, size_t size) { return 0; }
1722static size_t cont_print_text(char *text, size_t size) { return 0; }
1723
1724#endif /* CONFIG_PRINTK */
1725 1035
1726static int __add_preferred_console(char *name, int idx, char *options, 1036static int __add_preferred_console(char *name, int idx, char *options,
1727 char *brl_options) 1037 char *brl_options)
@@ -1844,7 +1154,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1844 return -1; 1154 return -1;
1845} 1155}
1846 1156
1847bool console_suspend_enabled = 1; 1157int console_suspend_enabled = 1;
1848EXPORT_SYMBOL(console_suspend_enabled); 1158EXPORT_SYMBOL(console_suspend_enabled);
1849 1159
1850static int __init console_suspend_disable(char *str) 1160static int __init console_suspend_disable(char *str)
@@ -1853,10 +1163,6 @@ static int __init console_suspend_disable(char *str)
1853 return 1; 1163 return 1;
1854} 1164}
1855__setup("no_console_suspend", console_suspend_disable); 1165__setup("no_console_suspend", console_suspend_disable);
1856module_param_named(console_suspend, console_suspend_enabled,
1857 bool, S_IRUGO | S_IWUSR);
1858MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
1859 " and hibernate operations");
1860 1166
1861/** 1167/**
1862 * suspend_console - suspend the console subsystem 1168 * suspend_console - suspend the console subsystem
@@ -1917,14 +1223,12 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1917 */ 1223 */
1918void console_lock(void) 1224void console_lock(void)
1919{ 1225{
1920 might_sleep(); 1226 BUG_ON(in_interrupt());
1921
1922 down(&console_sem); 1227 down(&console_sem);
1923 if (console_suspended) 1228 if (console_suspended)
1924 return; 1229 return;
1925 console_locked = 1; 1230 console_locked = 1;
1926 console_may_schedule = 1; 1231 console_may_schedule = 1;
1927 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1928} 1232}
1929EXPORT_SYMBOL(console_lock); 1233EXPORT_SYMBOL(console_lock);
1930 1234
@@ -1946,7 +1250,6 @@ int console_trylock(void)
1946 } 1250 }
1947 console_locked = 1; 1251 console_locked = 1;
1948 console_may_schedule = 0; 1252 console_may_schedule = 0;
1949 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1950 return 1; 1253 return 1;
1951} 1254}
1952EXPORT_SYMBOL(console_trylock); 1255EXPORT_SYMBOL(console_trylock);
@@ -1956,27 +1259,13 @@ int is_console_locked(void)
1956 return console_locked; 1259 return console_locked;
1957} 1260}
1958 1261
1959/*
1960 * Delayed printk version, for scheduler-internal messages:
1961 */
1962#define PRINTK_BUF_SIZE 512
1963
1964#define PRINTK_PENDING_WAKEUP 0x01
1965#define PRINTK_PENDING_SCHED 0x02
1966
1967static DEFINE_PER_CPU(int, printk_pending); 1262static DEFINE_PER_CPU(int, printk_pending);
1968static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1969 1263
1970void printk_tick(void) 1264void printk_tick(void)
1971{ 1265{
1972 if (__this_cpu_read(printk_pending)) { 1266 if (__this_cpu_read(printk_pending)) {
1973 int pending = __this_cpu_xchg(printk_pending, 0); 1267 __this_cpu_write(printk_pending, 0);
1974 if (pending & PRINTK_PENDING_SCHED) { 1268 wake_up_interruptible(&log_wait);
1975 char *buf = __get_cpu_var(printk_sched_buf);
1976 printk(KERN_WARNING "[sched_delayed] %s", buf);
1977 }
1978 if (pending & PRINTK_PENDING_WAKEUP)
1979 wake_up_interruptible(&log_wait);
1980 } 1269 }
1981} 1270}
1982 1271
@@ -1990,36 +1279,7 @@ int printk_needs_cpu(int cpu)
1990void wake_up_klogd(void) 1279void wake_up_klogd(void)
1991{ 1280{
1992 if (waitqueue_active(&log_wait)) 1281 if (waitqueue_active(&log_wait))
1993 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1282 this_cpu_write(printk_pending, 1);
1994}
1995
1996static void console_cont_flush(char *text, size_t size)
1997{
1998 unsigned long flags;
1999 size_t len;
2000
2001 raw_spin_lock_irqsave(&logbuf_lock, flags);
2002
2003 if (!cont.len)
2004 goto out;
2005
2006 /*
2007 * We still queue earlier records, likely because the console was
2008 * busy. The earlier ones need to be printed before this one, we
2009 * did not flush any fragment so far, so just let it queue up.
2010 */
2011 if (console_seq < log_next_seq && !cont.cons)
2012 goto out;
2013
2014 len = cont_print_text(text, size);
2015 raw_spin_unlock(&logbuf_lock);
2016 stop_critical_timings();
2017 call_console_drivers(cont.level, text, len);
2018 start_critical_timings();
2019 local_irq_restore(flags);
2020 return;
2021out:
2022 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2023} 1283}
2024 1284
2025/** 1285/**
@@ -2032,17 +1292,15 @@ out:
2032 * by printk(). If this is the case, console_unlock(); emits 1292 * by printk(). If this is the case, console_unlock(); emits
2033 * the output prior to releasing the lock. 1293 * the output prior to releasing the lock.
2034 * 1294 *
2035 * If there is output waiting, we wake /dev/kmsg and syslog() users. 1295 * If there is output waiting for klogd, we wake it up.
2036 * 1296 *
2037 * console_unlock(); may be called from any context. 1297 * console_unlock(); may be called from any context.
2038 */ 1298 */
2039void console_unlock(void) 1299void console_unlock(void)
2040{ 1300{
2041 static char text[LOG_LINE_MAX + PREFIX_MAX];
2042 static u64 seen_seq;
2043 unsigned long flags; 1301 unsigned long flags;
2044 bool wake_klogd = false; 1302 unsigned _con_start, _log_end;
2045 bool retry; 1303 unsigned wake_klogd = 0, retry = 0;
2046 1304
2047 if (console_suspended) { 1305 if (console_suspended) {
2048 up(&console_sem); 1306 up(&console_sem);
@@ -2051,69 +1309,28 @@ void console_unlock(void)
2051 1309
2052 console_may_schedule = 0; 1310 console_may_schedule = 0;
2053 1311
2054 /* flush buffered message fragment immediately to console */
2055 console_cont_flush(text, sizeof(text));
2056again: 1312again:
2057 for (;;) { 1313 for ( ; ; ) {
2058 struct log *msg; 1314 spin_lock_irqsave(&logbuf_lock, flags);
2059 size_t len; 1315 wake_klogd |= log_start - log_end;
2060 int level; 1316 if (con_start == log_end)
2061 1317 break; /* Nothing to print */
2062 raw_spin_lock_irqsave(&logbuf_lock, flags); 1318 _con_start = con_start;
2063 if (seen_seq != log_next_seq) { 1319 _log_end = log_end;
2064 wake_klogd = true; 1320 con_start = log_end; /* Flush */
2065 seen_seq = log_next_seq; 1321 spin_unlock(&logbuf_lock);
2066 }
2067
2068 if (console_seq < log_first_seq) {
2069 /* messages are gone, move to first one */
2070 console_seq = log_first_seq;
2071 console_idx = log_first_idx;
2072 console_prev = 0;
2073 }
2074skip:
2075 if (console_seq == log_next_seq)
2076 break;
2077
2078 msg = log_from_idx(console_idx);
2079 if (msg->flags & LOG_NOCONS) {
2080 /*
2081 * Skip record we have buffered and already printed
2082 * directly to the console when we received it.
2083 */
2084 console_idx = log_next(console_idx);
2085 console_seq++;
2086 /*
2087 * We will get here again when we register a new
2088 * CON_PRINTBUFFER console. Clear the flag so we
2089 * will properly dump everything later.
2090 */
2091 msg->flags &= ~LOG_NOCONS;
2092 console_prev = msg->flags;
2093 goto skip;
2094 }
2095
2096 level = msg->level;
2097 len = msg_print_text(msg, console_prev, false,
2098 text, sizeof(text));
2099 console_idx = log_next(console_idx);
2100 console_seq++;
2101 console_prev = msg->flags;
2102 raw_spin_unlock(&logbuf_lock);
2103
2104 stop_critical_timings(); /* don't trace print latency */ 1322 stop_critical_timings(); /* don't trace print latency */
2105 call_console_drivers(level, text, len); 1323 call_console_drivers(_con_start, _log_end);
2106 start_critical_timings(); 1324 start_critical_timings();
2107 local_irq_restore(flags); 1325 local_irq_restore(flags);
2108 } 1326 }
2109 console_locked = 0; 1327 console_locked = 0;
2110 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2111 1328
2112 /* Release the exclusive_console once it is used */ 1329 /* Release the exclusive_console once it is used */
2113 if (unlikely(exclusive_console)) 1330 if (unlikely(exclusive_console))
2114 exclusive_console = NULL; 1331 exclusive_console = NULL;
2115 1332
2116 raw_spin_unlock(&logbuf_lock); 1333 spin_unlock(&logbuf_lock);
2117 1334
2118 up(&console_sem); 1335 up(&console_sem);
2119 1336
@@ -2123,10 +1340,10 @@ skip:
2123 * there's a new owner and the console_unlock() from them will do the 1340 * there's a new owner and the console_unlock() from them will do the
2124 * flush, no worries. 1341 * flush, no worries.
2125 */ 1342 */
2126 raw_spin_lock(&logbuf_lock); 1343 spin_lock(&logbuf_lock);
2127 retry = console_seq != log_next_seq; 1344 if (con_start != log_end)
2128 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 1345 retry = 1;
2129 1346 spin_unlock_irqrestore(&logbuf_lock, flags);
2130 if (retry && console_trylock()) 1347 if (retry && console_trylock())
2131 goto again; 1348 goto again;
2132 1349
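A sketch of the caller-side pattern console_lock()/console_unlock() are built for: code that inspects or changes console state takes the semaphore, and the closing console_unlock() flushes whatever printk() queued in the meantime. Illustrative only; for_each_console() and CON_ENABLED come from <linux/console.h>.

#include <linux/console.h>

/* Disable one console under console_lock; messages logged while the
 * lock was held are printed by the final console_unlock(). */
static void demo_disable_console(struct console *target)
{
        struct console *con;

        console_lock();
        for_each_console(con) {
                if (con == target)
                        con->flags &= ~CON_ENABLED;
        }
        console_unlock();       /* flushes pending log output */
}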
@@ -2359,11 +1576,9 @@ void register_console(struct console *newcon)
2359 * console_unlock(); will print out the buffered messages 1576 * console_unlock(); will print out the buffered messages
2360 * for us. 1577 * for us.
2361 */ 1578 */
2362 raw_spin_lock_irqsave(&logbuf_lock, flags); 1579 spin_lock_irqsave(&logbuf_lock, flags);
2363 console_seq = syslog_seq; 1580 con_start = log_start;
2364 console_idx = syslog_idx; 1581 spin_unlock_irqrestore(&logbuf_lock, flags);
2365 console_prev = syslog_prev;
2366 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2367 /* 1582 /*
2368 * We're about to replay the log buffer. Only do this to the 1583 * We're about to replay the log buffer. Only do this to the
2369 * just-registered console to avoid excessive message spam to 1584 * just-registered console to avoid excessive message spam to
@@ -2456,26 +1671,6 @@ late_initcall(printk_late_init);
2456 1671
2457#if defined CONFIG_PRINTK 1672#if defined CONFIG_PRINTK
2458 1673
2459int printk_sched(const char *fmt, ...)
2460{
2461 unsigned long flags;
2462 va_list args;
2463 char *buf;
2464 int r;
2465
2466 local_irq_save(flags);
2467 buf = __get_cpu_var(printk_sched_buf);
2468
2469 va_start(args, fmt);
2470 r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args);
2471 va_end(args);
2472
2473 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
2474 local_irq_restore(flags);
2475
2476 return r;
2477}
2478
2479/* 1674/*
2480 * printk rate limiting, lifted from the networking subsystem. 1675 * printk rate limiting, lifted from the networking subsystem.
2481 * 1676 *
@@ -2571,263 +1766,47 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
2571} 1766}
2572EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1767EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
2573 1768
2574static bool always_kmsg_dump;
2575module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2576
2577/** 1769/**
2578 * kmsg_dump - dump kernel log to kernel message dumpers. 1770 * kmsg_dump - dump kernel log to kernel message dumpers.
2579 * @reason: the reason (oops, panic etc) for dumping 1771 * @reason: the reason (oops, panic etc) for dumping
2580 * 1772 *
2581 * Call each of the registered dumper's dump() callback, which can 1773 * Iterate through each of the dump devices and call the oops/panic
2582 * retrieve the kmsg records with kmsg_dump_get_line() or 1774 * callbacks with the log buffer.
2583 * kmsg_dump_get_buffer().
2584 */ 1775 */
2585void kmsg_dump(enum kmsg_dump_reason reason) 1776void kmsg_dump(enum kmsg_dump_reason reason)
2586{ 1777{
1778 unsigned long end;
1779 unsigned chars;
2587 struct kmsg_dumper *dumper; 1780 struct kmsg_dumper *dumper;
1781 const char *s1, *s2;
1782 unsigned long l1, l2;
2588 unsigned long flags; 1783 unsigned long flags;
2589 1784
2590 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) 1785 /* Theoretically, the log could move on after we do this, but
2591 return; 1786 there's not a lot we can do about that. The new messages
2592 1787 will overwrite the start of what we dump. */
2593 rcu_read_lock(); 1788 spin_lock_irqsave(&logbuf_lock, flags);
2594 list_for_each_entry_rcu(dumper, &dump_list, list) { 1789 end = log_end & LOG_BUF_MASK;
2595 if (dumper->max_reason && reason > dumper->max_reason) 1790 chars = logged_chars;
2596 continue; 1791 spin_unlock_irqrestore(&logbuf_lock, flags);
2597
2598 /* initialize iterator with data about the stored records */
2599 dumper->active = true;
2600 1792
2601 raw_spin_lock_irqsave(&logbuf_lock, flags); 1793 if (chars > end) {
2602 dumper->cur_seq = clear_seq; 1794 s1 = log_buf + log_buf_len - chars + end;
2603 dumper->cur_idx = clear_idx; 1795 l1 = chars - end;
2604 dumper->next_seq = log_next_seq;
2605 dumper->next_idx = log_next_idx;
2606 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2607
2608 /* invoke dumper which will iterate over records */
2609 dumper->dump(dumper, reason);
2610
2611 /* reset iterator */
2612 dumper->active = false;
2613 }
2614 rcu_read_unlock();
2615}
2616
2617/**
2618 * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
2619 * @dumper: registered kmsg dumper
2620 * @syslog: include the "<4>" prefixes
2621 * @line: buffer to copy the line to
2622 * @size: maximum size of the buffer
2623 * @len: length of line placed into buffer
2624 *
2625 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2626 * record, and copy one record into the provided buffer.
2627 *
2628 * Consecutive calls will return the next available record moving
2629 * towards the end of the buffer with the youngest messages.
2630 *
2631 * A return value of FALSE indicates that there are no more records to
2632 * read.
2633 *
2634 * The function is similar to kmsg_dump_get_line(), but grabs no locks.
2635 */
2636bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
2637 char *line, size_t size, size_t *len)
2638{
2639 struct log *msg;
2640 size_t l = 0;
2641 bool ret = false;
2642
2643 if (!dumper->active)
2644 goto out;
2645 1796
2646 if (dumper->cur_seq < log_first_seq) { 1797 s2 = log_buf;
2647 /* messages are gone, move to first available one */ 1798 l2 = end;
2648 dumper->cur_seq = log_first_seq; 1799 } else {
2649 dumper->cur_idx = log_first_idx; 1800 s1 = "";
2650 } 1801 l1 = 0;
2651
2652 /* last entry */
2653 if (dumper->cur_seq >= log_next_seq)
2654 goto out;
2655
2656 msg = log_from_idx(dumper->cur_idx);
2657 l = msg_print_text(msg, 0, syslog, line, size);
2658
2659 dumper->cur_idx = log_next(dumper->cur_idx);
2660 dumper->cur_seq++;
2661 ret = true;
2662out:
2663 if (len)
2664 *len = l;
2665 return ret;
2666}
2667
2668/**
2669 * kmsg_dump_get_line - retrieve one kmsg log line
2670 * @dumper: registered kmsg dumper
2671 * @syslog: include the "<4>" prefixes
2672 * @line: buffer to copy the line to
2673 * @size: maximum size of the buffer
2674 * @len: length of line placed into buffer
2675 *
2676 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2677 * record, and copy one record into the provided buffer.
2678 *
2679 * Consecutive calls will return the next available record moving
2680 * towards the end of the buffer with the youngest messages.
2681 *
2682 * A return value of FALSE indicates that there are no more records to
2683 * read.
2684 */
2685bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2686 char *line, size_t size, size_t *len)
2687{
2688 unsigned long flags;
2689 bool ret;
2690
2691 raw_spin_lock_irqsave(&logbuf_lock, flags);
2692 ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
2693 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2694
2695 return ret;
2696}
2697EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
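A sketch of a dumper written against the record-iterator interface on the removed (left-hand) side of this hunk; kmsg_dump_register() and struct kmsg_dumper are declared in <linux/kmsg_dump.h>. Illustrative only, not part of this patch:

#include <linux/kmsg_dump.h>
#include <linux/module.h>

static void demo_dump(struct kmsg_dumper *dumper,
                      enum kmsg_dump_reason reason)
{
        static char line[256];
        size_t len;

        /* Walk the log one record at a time, oldest first. */
        while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
                ;       /* push line[0..len) to a persistent store */
}

static struct kmsg_dumper demo_dumper = {
        .dump = demo_dump,
};

static int __init demo_dump_init(void)
{
        return kmsg_dump_register(&demo_dumper);
}
module_init(demo_dump_init);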
2698
2699/**
2700 * kmsg_dump_get_buffer - copy kmsg log lines
2701 * @dumper: registered kmsg dumper
2702 * @syslog: include the "<4>" prefixes
2703 * @buf: buffer to copy the line to
2704 * @size: maximum size of the buffer
2705 * @len: length of line placed into buffer
2706 *
2707 * Start at the end of the kmsg buffer and fill the provided buffer
 2708 * with as many of the *youngest* kmsg records as fit into it.
2709 * If the buffer is large enough, all available kmsg records will be
2710 * copied with a single call.
2711 *
2712 * Consecutive calls will fill the buffer with the next block of
2713 * available older records, not including the earlier retrieved ones.
2714 *
2715 * A return value of FALSE indicates that there are no more records to
2716 * read.
2717 */
2718bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2719 char *buf, size_t size, size_t *len)
2720{
2721 unsigned long flags;
2722 u64 seq;
2723 u32 idx;
2724 u64 next_seq;
2725 u32 next_idx;
2726 enum log_flags prev;
2727 size_t l = 0;
2728 bool ret = false;
2729
2730 if (!dumper->active)
2731 goto out;
2732
2733 raw_spin_lock_irqsave(&logbuf_lock, flags);
2734 if (dumper->cur_seq < log_first_seq) {
2735 /* messages are gone, move to first available one */
2736 dumper->cur_seq = log_first_seq;
2737 dumper->cur_idx = log_first_idx;
2738 }
2739
2740 /* last entry */
2741 if (dumper->cur_seq >= dumper->next_seq) {
2742 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2743 goto out;
2744 }
2745
2746 /* calculate length of entire buffer */
2747 seq = dumper->cur_seq;
2748 idx = dumper->cur_idx;
2749 prev = 0;
2750 while (seq < dumper->next_seq) {
2751 struct log *msg = log_from_idx(idx);
2752
2753 l += msg_print_text(msg, prev, true, NULL, 0);
2754 idx = log_next(idx);
2755 seq++;
2756 prev = msg->flags;
2757 }
2758
2759 /* move first record forward until length fits into the buffer */
2760 seq = dumper->cur_seq;
2761 idx = dumper->cur_idx;
2762 prev = 0;
2763 while (l > size && seq < dumper->next_seq) {
2764 struct log *msg = log_from_idx(idx);
2765
2766 l -= msg_print_text(msg, prev, true, NULL, 0);
2767 idx = log_next(idx);
2768 seq++;
2769 prev = msg->flags;
2770 }
2771
 2772 /* last message in next iteration */
2773 next_seq = seq;
2774 next_idx = idx;
2775
2776 l = 0;
2777 prev = 0;
2778 while (seq < dumper->next_seq) {
2779 struct log *msg = log_from_idx(idx);
2780 1802
2781 l += msg_print_text(msg, prev, syslog, buf + l, size - l); 1803 s2 = log_buf + end - chars;
2782 idx = log_next(idx); 1804 l2 = chars;
2783 seq++;
2784 prev = msg->flags;
2785 } 1805 }
2786 1806
2787 dumper->next_seq = next_seq; 1807 rcu_read_lock();
2788 dumper->next_idx = next_idx; 1808 list_for_each_entry_rcu(dumper, &dump_list, list)
2789 ret = true; 1809 dumper->dump(dumper, reason, s1, l1, s2, l2);
2790 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 1810 rcu_read_unlock();
2791out:
2792 if (len)
2793 *len = l;
2794 return ret;
2795}
2796EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
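The bulk variant, again against the left-hand interface documented above: one call copies as many of the youngest records as fit, and repeated calls step backwards through older blocks. Illustrative sketch only; the 4 KiB buffer size is an arbitrary assumption:

#include <linux/kmsg_dump.h>

static char demo_buf[4096];

static void demo_dump_buffer(struct kmsg_dumper *dumper,
                             enum kmsg_dump_reason reason)
{
        size_t len;

        /* Youngest records first; older blocks on subsequent calls. */
        while (kmsg_dump_get_buffer(dumper, false, demo_buf,
                                    sizeof(demo_buf), &len)) {
                /* hand demo_buf[0..len) to the platform's storage */
        }
}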
2797
2798/**
 2799 * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
2800 * @dumper: registered kmsg dumper
2801 *
2802 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2803 * kmsg_dump_get_buffer() can be called again and used multiple
2804 * times within the same dumper.dump() callback.
2805 *
2806 * The function is similar to kmsg_dump_rewind(), but grabs no locks.
2807 */
2808void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
2809{
2810 dumper->cur_seq = clear_seq;
2811 dumper->cur_idx = clear_idx;
2812 dumper->next_seq = log_next_seq;
2813 dumper->next_idx = log_next_idx;
2814}
2815
2816/**
 2817 * kmsg_dump_rewind - reset the iterator
2818 * @dumper: registered kmsg dumper
2819 *
2820 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2821 * kmsg_dump_get_buffer() can be called again and used multiple
2822 * times within the same dumper.dump() callback.
2823 */
2824void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2825{
2826 unsigned long flags;
2827
2828 raw_spin_lock_irqsave(&logbuf_lock, flags);
2829 kmsg_dump_rewind_nolock(dumper);
2830 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2831} 1811}
2832EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
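kmsg_dump_rewind() above lets a dump() callback make more than one pass over the same records; a minimal illustrative sketch:

#include <linux/kernel.h>
#include <linux/kmsg_dump.h>

static void demo_dump_twice(struct kmsg_dumper *dumper,
                            enum kmsg_dump_reason reason)
{
        static char line[256];
        size_t len, count = 0;

        /* First pass: size up the job. */
        while (kmsg_dump_get_line(dumper, false, line, sizeof(line), &len))
                count++;
        pr_info("demo: dumping %zu records\n", count);

        kmsg_dump_rewind(dumper);       /* back to the oldest record */

        /* Second pass: actually emit the records. */
        while (kmsg_dump_get_line(dumper, false, line, sizeof(line), &len))
                ;       /* write out line[0..len) */
}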
2833#endif 1812#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index 1f391819c42..961b389fe52 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -8,13 +8,12 @@
8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, 8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
9 * Red Hat, July 2004 9 * Red Hat, July 2004
10 * Consolidation of architecture support code for profiling, 10 * Consolidation of architecture support code for profiling,
11 * Nadia Yvette Chambers, Oracle, July 2004 11 * William Irwin, Oracle, July 2004
12 * Amortized hit count accounting via per-cpu open-addressed hashtables 12 * Amortized hit count accounting via per-cpu open-addressed hashtables
13 * to resolve timer interrupt livelocks, Nadia Yvette Chambers, 13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
14 * Oracle, 2004
15 */ 14 */
16 15
17#include <linux/export.h> 16#include <linux/module.h>
18#include <linux/profile.h> 17#include <linux/profile.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
20#include <linux/notifier.h> 19#include <linux/notifier.h>
@@ -257,7 +256,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook);
257 * pagetable hash functions, but uses a full hashtable full of finite 256 * pagetable hash functions, but uses a full hashtable full of finite
258 * collision chains, not just pairs of them. 257 * collision chains, not just pairs of them.
259 * 258 *
260 * -- nyc 259 * -- wli
261 */ 260 */
262static void __profile_flip_buffers(void *unused) 261static void __profile_flip_buffers(void *unused)
263{ 262{
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1599157336a..67d1fdd3c55 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -8,7 +8,7 @@
8 */ 8 */
9 9
10#include <linux/capability.h> 10#include <linux/capability.h>
11#include <linux/export.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
@@ -172,16 +172,7 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
172 return ret; 172 return ret;
173} 173}
174 174
175static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) 175int __ptrace_may_access(struct task_struct *task, unsigned int mode)
176{
177 if (mode & PTRACE_MODE_NOAUDIT)
178 return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
179 else
180 return has_ns_capability(current, ns, CAP_SYS_PTRACE);
181}
182
183/* Returns 0 on success, -errno on denial. */
184static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
185{ 176{
186 const struct cred *cred = current_cred(), *tcred; 177 const struct cred *cred = current_cred(), *tcred;
187 178
@@ -199,14 +190,15 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
199 return 0; 190 return 0;
200 rcu_read_lock(); 191 rcu_read_lock();
201 tcred = __task_cred(task); 192 tcred = __task_cred(task);
202 if (uid_eq(cred->uid, tcred->euid) && 193 if (cred->user->user_ns == tcred->user->user_ns &&
203 uid_eq(cred->uid, tcred->suid) && 194 (cred->uid == tcred->euid &&
204 uid_eq(cred->uid, tcred->uid) && 195 cred->uid == tcred->suid &&
205 gid_eq(cred->gid, tcred->egid) && 196 cred->uid == tcred->uid &&
206 gid_eq(cred->gid, tcred->sgid) && 197 cred->gid == tcred->egid &&
207 gid_eq(cred->gid, tcred->gid)) 198 cred->gid == tcred->sgid &&
199 cred->gid == tcred->gid))
208 goto ok; 200 goto ok;
209 if (ptrace_has_cap(tcred->user_ns, mode)) 201 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
210 goto ok; 202 goto ok;
211 rcu_read_unlock(); 203 rcu_read_unlock();
212 return -EPERM; 204 return -EPERM;
@@ -215,12 +207,8 @@ ok:
215 smp_rmb(); 207 smp_rmb();
216 if (task->mm) 208 if (task->mm)
217 dumpable = get_dumpable(task->mm); 209 dumpable = get_dumpable(task->mm);
218 rcu_read_lock(); 210 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
219 if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
220 rcu_read_unlock();
221 return -EPERM; 211 return -EPERM;
222 }
223 rcu_read_unlock();
224 212
225 return security_ptrace_access_check(task, mode); 213 return security_ptrace_access_check(task, mode);
226} 214}
@@ -235,22 +223,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
235} 223}
236 224
237static int ptrace_attach(struct task_struct *task, long request, 225static int ptrace_attach(struct task_struct *task, long request,
238 unsigned long addr,
239 unsigned long flags) 226 unsigned long flags)
240{ 227{
241 bool seize = (request == PTRACE_SEIZE); 228 bool seize = (request == PTRACE_SEIZE);
242 int retval; 229 int retval;
243 230
231 /*
232 * SEIZE will enable new ptrace behaviors which will be implemented
233 * gradually. SEIZE_DEVEL is used to prevent applications
234 * expecting full SEIZE behaviors trapping on kernel commits which
235 * are still in the process of implementing them.
236 *
237 * Only test programs for new ptrace behaviors being implemented
238 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
239 *
240 * Once SEIZE behaviors are completely implemented, this flag and
241 * the following test will be removed.
242 */
244 retval = -EIO; 243 retval = -EIO;
245 if (seize) { 244 if (seize && !(flags & PTRACE_SEIZE_DEVEL))
246 if (addr != 0) 245 goto out;
247 goto out;
248 if (flags & ~(unsigned long)PTRACE_O_MASK)
249 goto out;
250 flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
251 } else {
252 flags = PT_PTRACED;
253 }
254 246
255 audit_ptrace(task); 247 audit_ptrace(task);
256 248
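On the right-hand side of this hunk PTRACE_SEIZE is refused with -EIO unless userspace passes PTRACE_SEIZE_DEVEL in the data argument. A hedged userspace sketch of that call; the fallback constants are assumptions taken from the UAPI headers of this kernel generation and may be unnecessary with a newer libc:

#include <sys/ptrace.h>
#include <sys/types.h>
#include <stdio.h>

#ifndef PTRACE_SEIZE
#define PTRACE_SEIZE            0x4206
#endif
#ifndef PTRACE_SEIZE_DEVEL
#define PTRACE_SEIZE_DEVEL      0x80000000
#endif

static int demo_seize(pid_t pid)
{
        /* Without PTRACE_SEIZE_DEVEL in data the kernel above says -EIO. */
        if (ptrace(PTRACE_SEIZE, pid, NULL,
                   (void *)(unsigned long)PTRACE_SEIZE_DEVEL) == -1) {
                perror("PTRACE_SEIZE");
                return -1;
        }
        return 0;
}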
@@ -262,7 +254,7 @@ static int ptrace_attach(struct task_struct *task, long request,
262 254
263 /* 255 /*
264 * Protect exec's credential calculations against our interference; 256 * Protect exec's credential calculations against our interference;
265 * SUID, SGID and LSM creds get determined differently 257 * interference; SUID, SGID and LSM creds get determined differently
266 * under ptrace. 258 * under ptrace.
267 */ 259 */
268 retval = -ERESTARTNOINTR; 260 retval = -ERESTARTNOINTR;
@@ -282,13 +274,11 @@ static int ptrace_attach(struct task_struct *task, long request,
282 if (task->ptrace) 274 if (task->ptrace)
283 goto unlock_tasklist; 275 goto unlock_tasklist;
284 276
277 task->ptrace = PT_PTRACED;
285 if (seize) 278 if (seize)
286 flags |= PT_SEIZED; 279 task->ptrace |= PT_SEIZED;
287 rcu_read_lock(); 280 if (task_ns_capable(task, CAP_SYS_PTRACE))
288 if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) 281 task->ptrace |= PT_PTRACE_CAP;
289 flags |= PT_PTRACE_CAP;
290 rcu_read_unlock();
291 task->ptrace = flags;
292 282
293 __ptrace_link(task, current); 283 __ptrace_link(task, current);
294 284
@@ -463,9 +453,6 @@ void exit_ptrace(struct task_struct *tracer)
463 return; 453 return;
464 454
465 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 455 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
466 if (unlikely(p->ptrace & PT_EXITKILL))
467 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
468
469 if (__ptrace_detach(tracer, p)) 456 if (__ptrace_detach(tracer, p))
470 list_add(&p->ptrace_entry, &ptrace_dead); 457 list_add(&p->ptrace_entry, &ptrace_dead);
471 } 458 }
@@ -533,18 +520,30 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
533 520
534static int ptrace_setoptions(struct task_struct *child, unsigned long data) 521static int ptrace_setoptions(struct task_struct *child, unsigned long data)
535{ 522{
536 unsigned flags; 523 child->ptrace &= ~PT_TRACE_MASK;
537 524
538 if (data & ~(unsigned long)PTRACE_O_MASK) 525 if (data & PTRACE_O_TRACESYSGOOD)
539 return -EINVAL; 526 child->ptrace |= PT_TRACESYSGOOD;
540 527
541 /* Avoid intermediate state when all opts are cleared */ 528 if (data & PTRACE_O_TRACEFORK)
542 flags = child->ptrace; 529 child->ptrace |= PT_TRACE_FORK;
543 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
544 flags |= (data << PT_OPT_FLAG_SHIFT);
545 child->ptrace = flags;
546 530
547 return 0; 531 if (data & PTRACE_O_TRACEVFORK)
532 child->ptrace |= PT_TRACE_VFORK;
533
534 if (data & PTRACE_O_TRACECLONE)
535 child->ptrace |= PT_TRACE_CLONE;
536
537 if (data & PTRACE_O_TRACEEXEC)
538 child->ptrace |= PT_TRACE_EXEC;
539
540 if (data & PTRACE_O_TRACEVFORKDONE)
541 child->ptrace |= PT_TRACE_VFORK_DONE;
542
543 if (data & PTRACE_O_TRACEEXIT)
544 child->ptrace |= PT_TRACE_EXIT;
545
546 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
548} 547}
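The PTRACE_O_* mask consumed by ptrace_setoptions() above is normally set from userspace after an attach; a minimal illustrative sketch, not part of this patch:

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

static long demo_trace_forks(pid_t pid)
{
        if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1)
                return -1;
        waitpid(pid, NULL, 0);          /* wait for the attach stop */

        /* Any bit outside PTRACE_O_MASK makes the kernel return -EINVAL. */
        return ptrace(PTRACE_SETOPTIONS, pid, NULL,
                      (void *)(long)(PTRACE_O_TRACESYSGOOD |
                                     PTRACE_O_TRACEFORK |
                                     PTRACE_O_TRACEEXIT));
}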
549 548
550static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 549static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
@@ -884,7 +883,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
884 } 883 }
885 884
886 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 885 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
887 ret = ptrace_attach(child, request, addr, data); 886 ret = ptrace_attach(child, request, data);
888 /* 887 /*
889 * Some architectures need to do book-keeping after 888 * Some architectures need to do book-keeping after
890 * a ptrace attach. 889 * a ptrace attach.
@@ -1027,7 +1026,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1027 } 1026 }
1028 1027
1029 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 1028 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
1030 ret = ptrace_attach(child, request, addr, data); 1029 ret = ptrace_attach(child, request, data);
1031 /* 1030 /*
1032 * Some architectures need to do book-keeping after 1031 * Some architectures need to do book-keeping after
1033 * a ptrace attach. 1032 * a ptrace attach.
diff --git a/kernel/range.c b/kernel/range.c
index 9b8ae2d6ed6..37fa9b99ad5 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Range add and subtract 2 * Range add and subtract
3 */ 3 */
4#include <linux/kernel.h> 4#include <linux/module.h>
5#include <linux/init.h> 5#include <linux/init.h>
6#include <linux/sort.h> 6#include <linux/sort.h>
7 7
diff --git a/kernel/rcu.h b/kernel/rcu.h
deleted file mode 100644
index 20dfba576c2..00000000000
--- a/kernel/rcu.h
+++ /dev/null
@@ -1,114 +0,0 @@
1/*
2 * Read-Copy Update definitions shared among RCU implementations.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2011
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#ifndef __LINUX_RCU_H
24#define __LINUX_RCU_H
25
26#ifdef CONFIG_RCU_TRACE
27#define RCU_TRACE(stmt) stmt
28#else /* #ifdef CONFIG_RCU_TRACE */
29#define RCU_TRACE(stmt)
30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31
32/*
33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from
35 * process context.
36 *
37 * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH
38 * that counts the number of process-based reasons why RCU cannot
39 * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE
40 * is the value used to increment or decrement this field.
41 *
42 * The rest of the bits could in principle be used to count interrupts,
43 * but this would mean that a negative-one value in the interrupt
44 * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field.
45 * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK
46 * that is set to DYNTICK_TASK_FLAG upon initial exit from idle.
47 * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon
48 * initial exit from idle.
49 */
50#define DYNTICK_TASK_NEST_WIDTH 7
51#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
52#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
53#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
54#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
55#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
56 DYNTICK_TASK_FLAG)
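A worked example of the values above, assuming a 64-bit long long (LLONG_MAX == 2^63 - 1); illustrative only:

/*
 * DYNTICK_TASK_NEST_VALUE = ((2^63 - 1) >> 7) + 1 = 2^56
 * DYNTICK_TASK_NEST_MASK  = (2^63 - 1) - 2^56 + 1 = bits 56..62
 * DYNTICK_TASK_FLAG       = (2^56 / 8) * 2        = 2^54
 * DYNTICK_TASK_MASK       = (2^56 / 8) * 3        = 2^54 + 2^53
 * DYNTICK_TASK_EXIT_IDLE  = 2^56 + 2^54
 *
 * Process-level nesting is therefore counted in the 7-bit field at
 * bits 56..62, the guard bits sit at 53..54, and the remaining
 * low-order bits are free to count interrupt nesting.
 */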
57
58/*
59 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
60 * by call_rcu() and rcu callback execution, and are therefore not part of the
61 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
62 */
63
64#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
65# define STATE_RCU_HEAD_READY 0
66# define STATE_RCU_HEAD_QUEUED 1
67
68extern struct debug_obj_descr rcuhead_debug_descr;
69
70static inline void debug_rcu_head_queue(struct rcu_head *head)
71{
72 debug_object_activate(head, &rcuhead_debug_descr);
73 debug_object_active_state(head, &rcuhead_debug_descr,
74 STATE_RCU_HEAD_READY,
75 STATE_RCU_HEAD_QUEUED);
76}
77
78static inline void debug_rcu_head_unqueue(struct rcu_head *head)
79{
80 debug_object_active_state(head, &rcuhead_debug_descr,
81 STATE_RCU_HEAD_QUEUED,
82 STATE_RCU_HEAD_READY);
83 debug_object_deactivate(head, &rcuhead_debug_descr);
84}
85#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
86static inline void debug_rcu_head_queue(struct rcu_head *head)
87{
88}
89
90static inline void debug_rcu_head_unqueue(struct rcu_head *head)
91{
92}
93#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
94
95extern void kfree(const void *);
96
97static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
98{
99 unsigned long offset = (unsigned long)head->func;
100
101 if (__is_kfree_rcu_offset(offset)) {
102 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
103 kfree((void *)head - offset);
104 return 1;
105 } else {
106 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
107 head->func(head);
108 return 0;
109 }
110}
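The offset test in __rcu_reclaim() is what makes kfree_rcu() work: the "callback" stored in head->func is really the offset of the rcu_head inside the enclosing object, and the reclaim path turns it back into a kfree(). A caller-side sketch (illustrative only; struct demo_node is hypothetical):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_node {
        int value;
        struct rcu_head rcu;    /* its offset is what __rcu_reclaim() sees */
};

static void demo_release(struct demo_node *p)
{
        /* Defers kfree(p) until a grace period has elapsed. */
        kfree_rcu(p, rcu);
}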
111
112extern int rcu_expedited;
113
114#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a2cf76177b4..ddddb320be6 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -43,92 +43,8 @@
43#include <linux/notifier.h> 43#include <linux/notifier.h>
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/export.h>
47#include <linux/hardirq.h>
48#include <linux/delay.h>
49#include <linux/module.h> 46#include <linux/module.h>
50 47#include <linux/hardirq.h>
51#define CREATE_TRACE_POINTS
52#include <trace/events/rcu.h>
53
54#include "rcu.h"
55
56module_param(rcu_expedited, int, 0);
57
58#ifdef CONFIG_PREEMPT_RCU
59
60/*
61 * Preemptible RCU implementation for rcu_read_lock().
62 * Just increment ->rcu_read_lock_nesting, shared state will be updated
63 * if we block.
64 */
65void __rcu_read_lock(void)
66{
67 current->rcu_read_lock_nesting++;
68 barrier(); /* critical section after entry code. */
69}
70EXPORT_SYMBOL_GPL(__rcu_read_lock);
71
72/*
73 * Preemptible RCU implementation for rcu_read_unlock().
74 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
75 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
76 * invoke rcu_read_unlock_special() to clean up after a context switch
77 * in an RCU read-side critical section and other special cases.
78 */
79void __rcu_read_unlock(void)
80{
81 struct task_struct *t = current;
82
83 if (t->rcu_read_lock_nesting != 1) {
84 --t->rcu_read_lock_nesting;
85 } else {
86 barrier(); /* critical section before exit code. */
87 t->rcu_read_lock_nesting = INT_MIN;
88#ifdef CONFIG_PROVE_RCU_DELAY
89 udelay(10); /* Make preemption more probable. */
90#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
91 barrier(); /* assign before ->rcu_read_unlock_special load */
92 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
93 rcu_read_unlock_special(t);
94 barrier(); /* ->rcu_read_unlock_special load before assign */
95 t->rcu_read_lock_nesting = 0;
96 }
97#ifdef CONFIG_PROVE_LOCKING
98 {
99 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
100
101 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
102 }
103#endif /* #ifdef CONFIG_PROVE_LOCKING */
104}
105EXPORT_SYMBOL_GPL(__rcu_read_unlock);
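From a reader's point of view the two primitives above sit behind the usual rcu_read_lock()/rcu_dereference()/rcu_read_unlock() pattern; a minimal sketch (illustrative only, demo_ptr is a hypothetical RCU-protected pointer):

#include <linux/rcupdate.h>

struct demo_state {
        int value;
};

static struct demo_state __rcu *demo_ptr;

static int demo_read_value(void)
{
        struct demo_state *s;
        int val = -1;

        rcu_read_lock();        /* lands in __rcu_read_lock() above on
                                 * preemptible-RCU kernels */
        s = rcu_dereference(demo_ptr);
        if (s)
                val = s->value;
        rcu_read_unlock();
        return val;
}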
106
107/*
108 * Check for a task exiting while in a preemptible-RCU read-side
109 * critical section, clean up if so. No need to issue warnings,
110 * as debug_check_no_locks_held() already does this if lockdep
111 * is enabled.
112 */
113void exit_rcu(void)
114{
115 struct task_struct *t = current;
116
117 if (likely(list_empty(&current->rcu_node_entry)))
118 return;
119 t->rcu_read_lock_nesting = 1;
120 barrier();
121 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
122 __rcu_read_unlock();
123}
124
125#else /* #ifdef CONFIG_PREEMPT_RCU */
126
127void exit_rcu(void)
128{
129}
130
131#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
132 48
133#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
134static struct lock_class_key rcu_lock_key; 50static struct lock_class_key rcu_lock_key;
@@ -167,34 +83,22 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
167 * section. 83 * section.
168 * 84 *
169 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 85 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
170 *
171 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
172 * offline from an RCU perspective, so check for those as well.
173 */ 86 */
174int rcu_read_lock_bh_held(void) 87int rcu_read_lock_bh_held(void)
175{ 88{
176 if (!debug_lockdep_rcu_enabled()) 89 if (!debug_lockdep_rcu_enabled())
177 return 1; 90 return 1;
178 if (rcu_is_cpu_idle())
179 return 0;
180 if (!rcu_lockdep_current_cpu_online())
181 return 0;
182 return in_softirq() || irqs_disabled(); 91 return in_softirq() || irqs_disabled();
183} 92}
184EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
185 94
186#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 95#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
187 96
188struct rcu_synchronize {
189 struct rcu_head head;
190 struct completion completion;
191};
192
193/* 97/*
194 * Awaken the corresponding synchronize_rcu() instance now that a 98 * Awaken the corresponding synchronize_rcu() instance now that a
195 * grace period has elapsed. 99 * grace period has elapsed.
196 */ 100 */
197static void wakeme_after_rcu(struct rcu_head *head) 101void wakeme_after_rcu(struct rcu_head *head)
198{ 102{
199 struct rcu_synchronize *rcu; 103 struct rcu_synchronize *rcu;
200 104
@@ -202,20 +106,6 @@ static void wakeme_after_rcu(struct rcu_head *head)
202 complete(&rcu->completion); 106 complete(&rcu->completion);
203} 107}
204 108
205void wait_rcu_gp(call_rcu_func_t crf)
206{
207 struct rcu_synchronize rcu;
208
209 init_rcu_head_on_stack(&rcu.head);
210 init_completion(&rcu.completion);
211 /* Will wake me after RCU finished. */
212 crf(&rcu.head, wakeme_after_rcu);
213 /* Wait for it. */
214 wait_for_completion(&rcu.completion);
215 destroy_rcu_head_on_stack(&rcu.head);
216}
217EXPORT_SYMBOL_GPL(wait_rcu_gp);
218
219#ifdef CONFIG_PROVE_RCU 109#ifdef CONFIG_PROVE_RCU
220/* 110/*
221 * wrapper function to avoid #include problems. 111 * wrapper function to avoid #include problems.
@@ -402,13 +292,3 @@ struct debug_obj_descr rcuhead_debug_descr = {
402}; 292};
403EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 293EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
404#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 294#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
405
406#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
407void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
408{
409 trace_rcu_torture_read(rcutorturename, rhp);
410}
411EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
412#else
413#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
414#endif
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e7dce58f9c2..7bbac7d0f5a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -22,12 +22,13 @@
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 */ 24 */
25#include <linux/moduleparam.h>
25#include <linux/completion.h> 26#include <linux/completion.h>
26#include <linux/interrupt.h> 27#include <linux/interrupt.h>
27#include <linux/notifier.h> 28#include <linux/notifier.h>
28#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
29#include <linux/kernel.h> 30#include <linux/kernel.h>
30#include <linux/export.h> 31#include <linux/module.h>
31#include <linux/mutex.h> 32#include <linux/mutex.h>
32#include <linux/sched.h> 33#include <linux/sched.h>
33#include <linux/types.h> 34#include <linux/types.h>
@@ -36,167 +37,47 @@
36#include <linux/cpu.h> 37#include <linux/cpu.h>
37#include <linux/prefetch.h> 38#include <linux/prefetch.h>
38 39
39#ifdef CONFIG_RCU_TRACE 40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40#include <trace/events/rcu.h> 41static struct task_struct *rcu_kthread_task;
41#endif /* #else #ifdef CONFIG_RCU_TRACE */ 42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42 43static unsigned long have_rcu_kthread_work;
43#include "rcu.h"
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_callbacks(void); 47static void invoke_rcu_kthread(void);
48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static void rcu_process_callbacks(struct softirq_action *unused); 49static int rcu_kthread(void *arg);
50static void __call_rcu(struct rcu_head *head, 50static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 52 struct rcu_ctrlblk *rcp);
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 56#ifdef CONFIG_NO_HZ
57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long newval)
60{
61 if (newval) {
62 RCU_TRACE(trace_rcu_dyntick("--=",
63 rcu_dynticks_nesting, newval));
64 rcu_dynticks_nesting = newval;
65 return;
66 }
67 RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval));
68 if (!is_idle_task(current)) {
69 struct task_struct *idle = idle_task(smp_processor_id());
70
71 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
72 rcu_dynticks_nesting, newval));
73 ftrace_dump(DUMP_ALL);
74 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
75 current->pid, current->comm,
76 idle->pid, idle->comm); /* must be idle task! */
77 }
78 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
79 barrier();
80 rcu_dynticks_nesting = newval;
81}
82
83/*
84 * Enter idle, which is an extended quiescent state if we have fully
85 * entered that mode (i.e., if the new value of dynticks_nesting is zero).
86 */
87void rcu_idle_enter(void)
88{
89 unsigned long flags;
90 long long newval;
91
92 local_irq_save(flags);
93 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
94 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
95 DYNTICK_TASK_NEST_VALUE)
96 newval = 0;
97 else
98 newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
99 rcu_idle_enter_common(newval);
100 local_irq_restore(flags);
101}
102EXPORT_SYMBOL_GPL(rcu_idle_enter);
103
104/*
105 * Exit an interrupt handler towards idle.
106 */
107void rcu_irq_exit(void)
108{
109 unsigned long flags;
110 long long newval;
111
112 local_irq_save(flags);
113 newval = rcu_dynticks_nesting - 1;
114 WARN_ON_ONCE(newval < 0);
115 rcu_idle_enter_common(newval);
116 local_irq_restore(flags);
117}
118EXPORT_SYMBOL_GPL(rcu_irq_exit);
119 57
120/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ 58static long rcu_dynticks_nesting = 1;
121static void rcu_idle_exit_common(long long oldval)
122{
123 if (oldval) {
124 RCU_TRACE(trace_rcu_dyntick("++=",
125 oldval, rcu_dynticks_nesting));
126 return;
127 }
128 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
129 if (!is_idle_task(current)) {
130 struct task_struct *idle = idle_task(smp_processor_id());
131
132 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
133 oldval, rcu_dynticks_nesting));
134 ftrace_dump(DUMP_ALL);
135 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
136 current->pid, current->comm,
137 idle->pid, idle->comm); /* must be idle task! */
138 }
139}
140 59
141/* 60/*
142 * Exit idle, so that we are no longer in an extended quiescent state. 61 * Enter dynticks-idle mode, which is an extended quiescent state
62 * if we have fully entered that mode (i.e., if the new value of
63 * dynticks_nesting is zero).
143 */ 64 */
144void rcu_idle_exit(void) 65void rcu_enter_nohz(void)
145{ 66{
146 unsigned long flags; 67 if (--rcu_dynticks_nesting == 0)
147 long long oldval; 68 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
148
149 local_irq_save(flags);
150 oldval = rcu_dynticks_nesting;
151 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
152 if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
153 rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
154 else
155 rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
156 rcu_idle_exit_common(oldval);
157 local_irq_restore(flags);
158} 69}
159EXPORT_SYMBOL_GPL(rcu_idle_exit);
160 70
161/* 71/*
162 * Enter an interrupt handler, moving away from idle. 72 * Exit dynticks-idle mode, so that we are no longer in an extended
73 * quiescent state.
163 */ 74 */
164void rcu_irq_enter(void) 75void rcu_exit_nohz(void)
165{ 76{
166 unsigned long flags;
167 long long oldval;
168
169 local_irq_save(flags);
170 oldval = rcu_dynticks_nesting;
171 rcu_dynticks_nesting++; 77 rcu_dynticks_nesting++;
172 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
173 rcu_idle_exit_common(oldval);
174 local_irq_restore(flags);
175}
176EXPORT_SYMBOL_GPL(rcu_irq_enter);
177
178#ifdef CONFIG_DEBUG_LOCK_ALLOC
179
180/*
181 * Test whether RCU thinks that the current CPU is idle.
182 */
183int rcu_is_cpu_idle(void)
184{
185 return !rcu_dynticks_nesting;
186} 78}
187EXPORT_SYMBOL(rcu_is_cpu_idle);
188 79
189#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 80#endif /* #ifdef CONFIG_NO_HZ */
190
191/*
192 * Test whether the current CPU was interrupted from idle. Nested
193 * interrupts don't count, we must be running at the first interrupt
194 * level.
195 */
196int rcu_is_cpu_rrupt_from_idle(void)
197{
198 return rcu_dynticks_nesting <= 1;
199}
200 81
201/* 82/*
202 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 83 * Helper function for rcu_sched_qs() and rcu_bh_qs().
@@ -215,6 +96,16 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
215} 96}
216 97
217/* 98/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
218 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
219 * are at it, given that any rcu quiescent state is also an rcu_bh 110 * are at it, given that any rcu quiescent state is also an rcu_bh
220 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 111 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
@@ -226,7 +117,7 @@ void rcu_sched_qs(int cpu)
226 local_irq_save(flags); 117 local_irq_save(flags);
227 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
228 rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 rcu_qsctr_help(&rcu_bh_ctrlblk))
229 invoke_rcu_callbacks(); 120 invoke_rcu_kthread();
230 local_irq_restore(flags); 121 local_irq_restore(flags);
231} 122}
232 123
@@ -239,19 +130,20 @@ void rcu_bh_qs(int cpu)
239 130
240 local_irq_save(flags); 131 local_irq_save(flags);
241 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 132 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
242 invoke_rcu_callbacks(); 133 invoke_rcu_kthread();
243 local_irq_restore(flags); 134 local_irq_restore(flags);
244} 135}
245 136
246/* 137/*
247 * Check to see if the scheduling-clock interrupt came from an extended 138 * Check to see if the scheduling-clock interrupt came from an extended
248 * quiescent state, and, if so, tell RCU about it. This function must 139 * quiescent state, and, if so, tell RCU about it.
249 * be called from hardirq context. It is normally called from the
250 * scheduling-clock interrupt.
251 */ 140 */
252void rcu_check_callbacks(int cpu, int user) 141void rcu_check_callbacks(int cpu, int user)
253{ 142{
254 if (user || rcu_is_cpu_rrupt_from_idle()) 143 if (user ||
144 (idle_cpu(cpu) &&
145 !in_softirq() &&
146 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
255 rcu_sched_qs(cpu); 147 rcu_sched_qs(cpu);
256 else if (!in_softirq()) 148 else if (!in_softirq())
257 rcu_bh_qs(cpu); 149 rcu_bh_qs(cpu);
@@ -262,27 +154,18 @@ void rcu_check_callbacks(int cpu, int user)
 262 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure 154 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
263 * whose grace period has elapsed. 155 * whose grace period has elapsed.
264 */ 156 */
265static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 157static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
266{ 158{
267 char *rn = NULL;
268 struct rcu_head *next, *list; 159 struct rcu_head *next, *list;
269 unsigned long flags; 160 unsigned long flags;
270 RCU_TRACE(int cb_count = 0); 161 RCU_TRACE(int cb_count = 0);
271 162
272 /* If no RCU callbacks ready to invoke, just return. */ 163 /* If no RCU callbacks ready to invoke, just return. */
273 if (&rcp->rcucblist == rcp->donetail) { 164 if (&rcp->rcucblist == rcp->donetail)
274 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
275 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
276 ACCESS_ONCE(rcp->rcucblist),
277 need_resched(),
278 is_idle_task(current),
279 rcu_is_callbacks_kthread()));
280 return; 165 return;
281 }
282 166
283 /* Move the ready-to-invoke callbacks to a local list. */ 167 /* Move the ready-to-invoke callbacks to a local list. */
284 local_irq_save(flags); 168 local_irq_save(flags);
285 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
286 list = rcp->rcucblist; 169 list = rcp->rcucblist;
287 rcp->rcucblist = *rcp->donetail; 170 rcp->rcucblist = *rcp->donetail;
288 *rcp->donetail = NULL; 171 *rcp->donetail = NULL;
@@ -293,28 +176,49 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
293 local_irq_restore(flags); 176 local_irq_restore(flags);
294 177
295 /* Invoke the callbacks on the local list. */ 178 /* Invoke the callbacks on the local list. */
296 RCU_TRACE(rn = rcp->name);
297 while (list) { 179 while (list) {
298 next = list->next; 180 next = list->next;
299 prefetch(next); 181 prefetch(next);
300 debug_rcu_head_unqueue(list); 182 debug_rcu_head_unqueue(list);
301 local_bh_disable(); 183 local_bh_disable();
302 __rcu_reclaim(rn, list); 184 __rcu_reclaim(list);
303 local_bh_enable(); 185 local_bh_enable();
304 list = next; 186 list = next;
305 RCU_TRACE(cb_count++); 187 RCU_TRACE(cb_count++);
306 } 188 }
307 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 189 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
308 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
309 is_idle_task(current),
310 rcu_is_callbacks_kthread()));
311} 190}
312 191
313static void rcu_process_callbacks(struct softirq_action *unused) 192/*
193 * This kthread invokes RCU callbacks whose grace periods have
194 * elapsed. It is awakened as needed, and takes the place of the
195 * RCU_SOFTIRQ that was used previously for this purpose.
196 * This is a kthread, but it is never stopped, at least not until
197 * the system goes down.
198 */
199static int rcu_kthread(void *arg)
314{ 200{
315 __rcu_process_callbacks(&rcu_sched_ctrlblk); 201 unsigned long work;
316 __rcu_process_callbacks(&rcu_bh_ctrlblk); 202 unsigned long morework;
317 rcu_preempt_process_callbacks(); 203 unsigned long flags;
204
205 for (;;) {
206 wait_event_interruptible(rcu_kthread_wq,
207 have_rcu_kthread_work != 0);
208 morework = rcu_boost();
209 local_irq_save(flags);
210 work = have_rcu_kthread_work;
211 have_rcu_kthread_work = morework;
212 local_irq_restore(flags);
213 if (work) {
214 rcu_process_callbacks(&rcu_sched_ctrlblk);
215 rcu_process_callbacks(&rcu_bh_ctrlblk);
216 rcu_preempt_process_callbacks();
217 }
218 schedule_timeout_interruptible(1); /* Leave CPU for others. */
219 }
220
221 return 0; /* Not reached, but needed to shut gcc up. */
318} 222}
319 223
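
The rcu_kthread() loop above is a small producer/consumer handoff: invoke_rcu_kthread() sets have_rcu_kthread_work and wakes the wait queue, and the kthread samples and clears the flag with interrupts disabled so that a wakeup arriving between the test and the sleep is never lost (wait_event_interruptible() re-checks the condition after queueing). A stripped-down sketch of the same pattern, with illustrative names and a placeholder payload:

    #include <linux/wait.h>
    #include <linux/sched.h>
    #include <linux/interrupt.h>

    static DECLARE_WAIT_QUEUE_HEAD(work_wq);
    static unsigned long have_work;

    static void do_work(void)                 /* placeholder payload */
    {
    }

    static void post_work(void)               /* producer, may run in irq context */
    {
            have_work = 1;
            wake_up(&work_wq);
    }

    static int worker(void *unused)           /* consumer, started with kthread_run() */
    {
            unsigned long flags, work;

            for (;;) {
                    wait_event_interruptible(work_wq, have_work != 0);
                    local_irq_save(flags);    /* sample and clear atomically w.r.t. irqs */
                    work = have_work;
                    have_work = 0;
                    local_irq_restore(flags);
                    if (work)
                            do_work();
                    schedule_timeout_interruptible(1);  /* leave CPU for others */
            }
            return 0;                         /* not reached */
    }
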
320/* 224/*
@@ -332,10 +236,6 @@ static void rcu_process_callbacks(struct softirq_action *unused)
332 */ 236 */
333void synchronize_sched(void) 237void synchronize_sched(void)
334{ 238{
335 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
336 !lock_is_held(&rcu_lock_map) &&
337 !lock_is_held(&rcu_sched_lock_map),
338 "Illegal synchronize_sched() in RCU read-side critical section");
339 cond_resched(); 239 cond_resched();
340} 240}
341EXPORT_SYMBOL_GPL(synchronize_sched); 241EXPORT_SYMBOL_GPL(synchronize_sched);
@@ -380,3 +280,45 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
380 __call_rcu(head, func, &rcu_bh_ctrlblk); 280 __call_rcu(head, func, &rcu_bh_ctrlblk);
381} 281}
382EXPORT_SYMBOL_GPL(call_rcu_bh); 282EXPORT_SYMBOL_GPL(call_rcu_bh);
283
284void rcu_barrier_bh(void)
285{
286 struct rcu_synchronize rcu;
287
288 init_rcu_head_on_stack(&rcu.head);
289 init_completion(&rcu.completion);
290 /* Will wake me after RCU finished. */
291 call_rcu_bh(&rcu.head, wakeme_after_rcu);
292 /* Wait for it. */
293 wait_for_completion(&rcu.completion);
294 destroy_rcu_head_on_stack(&rcu.head);
295}
296EXPORT_SYMBOL_GPL(rcu_barrier_bh);
297
298void rcu_barrier_sched(void)
299{
300 struct rcu_synchronize rcu;
301
302 init_rcu_head_on_stack(&rcu.head);
303 init_completion(&rcu.completion);
304 /* Will wake me after RCU finished. */
305 call_rcu_sched(&rcu.head, wakeme_after_rcu);
306 /* Wait for it. */
307 wait_for_completion(&rcu.completion);
308 destroy_rcu_head_on_stack(&rcu.head);
309}
310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
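
On this single-CPU (tiny) implementation, rcu_barrier_bh() and rcu_barrier_sched() reduce to "post one callback and wait for it": callbacks run in posting order here, so waiting for the last-posted callback waits for all earlier ones, and the stock wakeme_after_rcu() helper simply completes the on-stack completion. The usual reason a caller needs such a barrier is module unload, where every callback posted by the module must have run before its code and data go away. A hypothetical exit path; the example_* names are assumptions, not from this patch:

    static struct kmem_cache *example_cache;      /* assumed: created at init time */
    static void example_disable_new_work(void);   /* assumed: stops posting callbacks */

    static void __exit example_exit(void)
    {
            /* Assumed: nothing posts further call_rcu_bh() callbacks past here. */
            example_disable_new_work();
            /* Wait until every already-posted callback has been invoked. */
            rcu_barrier_bh();
            /* Only now is it safe to tear down what the callbacks touch. */
            kmem_cache_destroy(example_cache);
    }
    module_exit(example_exit);
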
311
312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
316{
317 struct sched_param sp;
318
319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
323}
324early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f85016a2309..f259c676195 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -23,30 +23,32 @@
23 */ 23 */
24 24
25#include <linux/kthread.h> 25#include <linux/kthread.h>
26#include <linux/module.h>
27#include <linux/debugfs.h> 26#include <linux/debugfs.h>
28#include <linux/seq_file.h> 27#include <linux/seq_file.h>
29 28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
30/* Global control variables for rcupdate callback mechanism. */ 35/* Global control variables for rcupdate callback mechanism. */
31struct rcu_ctrlblk { 36struct rcu_ctrlblk {
32 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
34 struct rcu_head **curtail; /* ->next pointer of last CB. */ 39 struct rcu_head **curtail; /* ->next pointer of last CB. */
35 RCU_TRACE(long qlen); /* Number of pending CBs. */ 40 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(char *name); /* Name of RCU type. */
37}; 41};
38 42
39/* Definition for rcupdate control block. */ 43/* Definition for rcupdate control block. */
40static struct rcu_ctrlblk rcu_sched_ctrlblk = { 44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
41 .donetail = &rcu_sched_ctrlblk.rcucblist, 45 .donetail = &rcu_sched_ctrlblk.rcucblist,
42 .curtail = &rcu_sched_ctrlblk.rcucblist, 46 .curtail = &rcu_sched_ctrlblk.rcucblist,
43 RCU_TRACE(.name = "rcu_sched")
44}; 47};
45 48
46static struct rcu_ctrlblk rcu_bh_ctrlblk = { 49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
47 .donetail = &rcu_bh_ctrlblk.rcucblist, 50 .donetail = &rcu_bh_ctrlblk.rcucblist,
48 .curtail = &rcu_bh_ctrlblk.rcucblist, 51 .curtail = &rcu_bh_ctrlblk.rcucblist,
49 RCU_TRACE(.name = "rcu_bh")
50}; 52};
51 53
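
The three pointers in struct rcu_ctrlblk implement a single singly-linked callback list with two tail markers: new callbacks are appended at *->curtail, and ->donetail records how far into the list the already-elapsed grace period reaches, so "done" callbacks are always a prefix of the list. A sketch of the two core manipulations, with illustrative helper names (the real code does this inside __call_rcu() and rcu_qsctr_help()):

    /* Append a new callback at the tail of the combined list. */
    static void example_enqueue(struct rcu_ctrlblk *rcp, struct rcu_head *head)
    {
            head->next = NULL;
            *rcp->curtail = head;        /* link after the current last CB  */
            rcp->curtail = &head->next;  /* remember the new tail pointer   */
    }

    /* Grace period ended: everything currently queued becomes invokable. */
    static void example_note_grace_period_end(struct rcu_ctrlblk *rcp)
    {
            rcp->donetail = rcp->curtail;
    }
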
52#ifdef CONFIG_DEBUG_LOCK_ALLOC 54#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -129,7 +131,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
129 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, 131 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
130 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, 132 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
131 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), 133 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
132 RCU_TRACE(.rcb.name = "rcu_preempt")
133}; 134};
134 135
135static int rcu_preempted_readers_exp(void); 136static int rcu_preempted_readers_exp(void);
@@ -146,16 +147,6 @@ static int rcu_cpu_blocking_cur_gp(void)
146/* 147/*
147 * Check for a running RCU reader. Because there is only one CPU, 148 * Check for a running RCU reader. Because there is only one CPU,
148 * there can be but one running RCU reader at a time. ;-) 149 * there can be but one running RCU reader at a time. ;-)
149 *
150 * Returns zero if there are no running readers. Returns a positive
151 * number if there is at least one reader within its RCU read-side
152 * critical section. Returns a negative number if an outermost reader
153 * is in the midst of exiting from its RCU read-side critical section
154 *
155 * Returns zero if there are no running readers. Returns a positive
156 * number if there is at least one reader within its RCU read-side
157 * critical section. Returns a negative number if an outermost reader
158 * is in the midst of exiting from its RCU read-side critical section.
159 */ 150 */
160static int rcu_preempt_running_reader(void) 151static int rcu_preempt_running_reader(void)
161{ 152{
@@ -256,13 +247,6 @@ static void show_tiny_preempt_stats(struct seq_file *m)
256 247
257#include "rtmutex_common.h" 248#include "rtmutex_common.h"
258 249
259#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
260
261/* Controls for rcu_kthread() kthread. */
262static struct task_struct *rcu_kthread_task;
263static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
264static unsigned long have_rcu_kthread_work;
265
266/* 250/*
267 * Carry out RCU priority boosting on the task indicated by ->boost_tasks, 251 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
268 * and advance ->boost_tasks to the next task in the ->blkd_tasks list. 252 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
@@ -278,7 +262,7 @@ static int rcu_boost(void)
278 rcu_preempt_ctrlblk.exp_tasks == NULL) 262 rcu_preempt_ctrlblk.exp_tasks == NULL)
279 return 0; /* Nothing to boost. */ 263 return 0; /* Nothing to boost. */
280 264
281 local_irq_save(flags); 265 raw_local_irq_save(flags);
282 266
283 /* 267 /*
284 * Recheck with irqs disabled: all tasks in need of boosting 268 * Recheck with irqs disabled: all tasks in need of boosting
@@ -287,7 +271,7 @@ static int rcu_boost(void)
287 */ 271 */
288 if (rcu_preempt_ctrlblk.boost_tasks == NULL && 272 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
289 rcu_preempt_ctrlblk.exp_tasks == NULL) { 273 rcu_preempt_ctrlblk.exp_tasks == NULL) {
290 local_irq_restore(flags); 274 raw_local_irq_restore(flags);
291 return 0; 275 return 0;
292 } 276 }
293 277
@@ -317,12 +301,13 @@ static int rcu_boost(void)
317 t = container_of(tb, struct task_struct, rcu_node_entry); 301 t = container_of(tb, struct task_struct, rcu_node_entry);
318 rt_mutex_init_proxy_locked(&mtx, t); 302 rt_mutex_init_proxy_locked(&mtx, t);
319 t->rcu_boost_mutex = &mtx; 303 t->rcu_boost_mutex = &mtx;
320 local_irq_restore(flags); 304 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
305 raw_local_irq_restore(flags);
321 rt_mutex_lock(&mtx); 306 rt_mutex_lock(&mtx);
322 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
323 308
324 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || 309 return rcu_preempt_ctrlblk.boost_tasks != NULL ||
325 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; 310 rcu_preempt_ctrlblk.exp_tasks != NULL;
326} 311}
327 312
328/* 313/*
@@ -349,10 +334,9 @@ static int rcu_initiate_boost(void)
349 if (rcu_preempt_ctrlblk.exp_tasks == NULL) 334 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
350 rcu_preempt_ctrlblk.boost_tasks = 335 rcu_preempt_ctrlblk.boost_tasks =
351 rcu_preempt_ctrlblk.gp_tasks; 336 rcu_preempt_ctrlblk.gp_tasks;
352 invoke_rcu_callbacks(); 337 invoke_rcu_kthread();
353 } else { 338 } else
354 RCU_TRACE(rcu_initiate_boost_trace()); 339 RCU_TRACE(rcu_initiate_boost_trace());
355 }
356 return 1; 340 return 1;
357} 341}
358 342
@@ -369,6 +353,14 @@ static void rcu_preempt_boost_start_gp(void)
369#else /* #ifdef CONFIG_RCU_BOOST */ 353#else /* #ifdef CONFIG_RCU_BOOST */
370 354
371/* 355/*
356 * If there is no RCU priority boosting, we don't boost.
357 */
358static int rcu_boost(void)
359{
360 return 0;
361}
362
363/*
372 * If there is no RCU priority boosting, we don't initiate boosting, 364 * If there is no RCU priority boosting, we don't initiate boosting,
373 * but we do indicate whether there are blocked readers blocking the 365 * but we do indicate whether there are blocked readers blocking the
374 * current grace period. 366 * current grace period.
@@ -435,7 +427,7 @@ static void rcu_preempt_cpu_qs(void)
435 427
436 /* If there are done callbacks, cause them to be invoked. */ 428 /* If there are done callbacks, cause them to be invoked. */
437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 429 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
438 invoke_rcu_callbacks(); 430 invoke_rcu_kthread();
439} 431}
440 432
441/* 433/*
@@ -485,7 +477,7 @@ void rcu_preempt_note_context_switch(void)
485 unsigned long flags; 477 unsigned long flags;
486 478
487 local_irq_save(flags); /* must exclude scheduler_tick(). */ 479 local_irq_save(flags); /* must exclude scheduler_tick(). */
488 if (rcu_preempt_running_reader() > 0 && 480 if (rcu_preempt_running_reader() &&
489 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 481 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
490 482
491 /* Possibly blocking in an RCU read-side critical section. */ 483 /* Possibly blocking in an RCU read-side critical section. */
@@ -504,13 +496,6 @@ void rcu_preempt_note_context_switch(void)
504 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); 496 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
505 if (rcu_cpu_blocking_cur_gp()) 497 if (rcu_cpu_blocking_cur_gp())
506 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; 498 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
507 } else if (rcu_preempt_running_reader() < 0 &&
508 t->rcu_read_unlock_special) {
509 /*
510 * Complete exit from RCU read-side critical section on
511 * behalf of preempted instance of __rcu_read_unlock().
512 */
513 rcu_read_unlock_special(t);
514 } 499 }
515 500
516 /* 501 /*
@@ -527,19 +512,28 @@ void rcu_preempt_note_context_switch(void)
527} 512}
528 513
529/* 514/*
515 * Tiny-preemptible RCU implementation for rcu_read_lock().
516 * Just increment ->rcu_read_lock_nesting, shared state will be updated
517 * if we block.
518 */
519void __rcu_read_lock(void)
520{
521 current->rcu_read_lock_nesting++;
522 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
523}
524EXPORT_SYMBOL_GPL(__rcu_read_lock);
525
526/*
530 * Handle special cases during rcu_read_unlock(), such as needing to 527 * Handle special cases during rcu_read_unlock(), such as needing to
531 * notify RCU core processing or task having blocked during the RCU 528 * notify RCU core processing or task having blocked during the RCU
532 * read-side critical section. 529 * read-side critical section.
533 */ 530 */
534void rcu_read_unlock_special(struct task_struct *t) 531static void rcu_read_unlock_special(struct task_struct *t)
535{ 532{
536 int empty; 533 int empty;
537 int empty_exp; 534 int empty_exp;
538 unsigned long flags; 535 unsigned long flags;
539 struct list_head *np; 536 struct list_head *np;
540#ifdef CONFIG_RCU_BOOST
541 struct rt_mutex *rbmp = NULL;
542#endif /* #ifdef CONFIG_RCU_BOOST */
543 int special; 537 int special;
544 538
545 /* 539 /*
@@ -560,7 +554,7 @@ void rcu_read_unlock_special(struct task_struct *t)
560 rcu_preempt_cpu_qs(); 554 rcu_preempt_cpu_qs();
561 555
562 /* Hardware IRQ handlers cannot block. */ 556 /* Hardware IRQ handlers cannot block. */
563 if (in_irq() || in_serving_softirq()) { 557 if (in_irq()) {
564 local_irq_restore(flags); 558 local_irq_restore(flags);
565 return; 559 return;
566 } 560 }
@@ -605,16 +599,39 @@ void rcu_read_unlock_special(struct task_struct *t)
605 } 599 }
606#ifdef CONFIG_RCU_BOOST 600#ifdef CONFIG_RCU_BOOST
607 /* Unboost self if was boosted. */ 601 /* Unboost self if was boosted. */
608 if (t->rcu_boost_mutex != NULL) { 602 if (special & RCU_READ_UNLOCK_BOOSTED) {
609 rbmp = t->rcu_boost_mutex; 603 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
604 rt_mutex_unlock(t->rcu_boost_mutex);
610 t->rcu_boost_mutex = NULL; 605 t->rcu_boost_mutex = NULL;
611 rt_mutex_unlock(rbmp);
612 } 606 }
613#endif /* #ifdef CONFIG_RCU_BOOST */ 607#endif /* #ifdef CONFIG_RCU_BOOST */
614 local_irq_restore(flags); 608 local_irq_restore(flags);
615} 609}
616 610
617/* 611/*
612 * Tiny-preemptible RCU implementation for rcu_read_unlock().
613 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
614 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
615 * invoke rcu_read_unlock_special() to clean up after a context switch
616 * in an RCU read-side critical section and other special cases.
617 */
618void __rcu_read_unlock(void)
619{
620 struct task_struct *t = current;
621
622 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
623 --t->rcu_read_lock_nesting;
624 barrier(); /* decrement before load of ->rcu_read_unlock_special */
625 if (t->rcu_read_lock_nesting == 0 &&
626 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
627 rcu_read_unlock_special(t);
628#ifdef CONFIG_PROVE_LOCKING
629 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
630#endif /* #ifdef CONFIG_PROVE_LOCKING */
631}
632EXPORT_SYMBOL_GPL(__rcu_read_unlock);
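
The two primitives above are what the ordinary reader API expands to in the tiny-preemptible configuration: rcu_read_lock() bumps the per-task nesting counter, and the outermost rcu_read_unlock() funnels any deferred work (removal from the blocked-tasks list, unboosting, reporting a quiescent state) through rcu_read_unlock_special(). For reference, a minimal reader built on that API; struct example and example_ptr are illustrative, not part of this patch:

    struct example {
            int value;
    };
    static struct example __rcu *example_ptr;   /* updated elsewhere via rcu_assign_pointer() */

    static int example_read_value(void)
    {
            struct example *p;
            int val = -1;

            rcu_read_lock();                     /* ->rcu_read_lock_nesting++             */
            p = rcu_dereference(example_ptr);    /* safe only inside the read-side section */
            if (p)
                    val = p->value;
            rcu_read_unlock();                   /* may end in rcu_read_unlock_special()  */
            return val;
    }
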
633
634/*
618 * Check for a quiescent state from the current CPU. When a task blocks, 635 * Check for a quiescent state from the current CPU. When a task blocks,
619 * the task is recorded in the rcu_preempt_ctrlblk structure, which is 636 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
620 * checked elsewhere. This is called from the scheduling-clock interrupt. 637 * checked elsewhere. This is called from the scheduling-clock interrupt.
@@ -631,10 +648,10 @@ static void rcu_preempt_check_callbacks(void)
631 rcu_preempt_cpu_qs(); 648 rcu_preempt_cpu_qs();
632 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 649 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
633 rcu_preempt_ctrlblk.rcb.donetail) 650 rcu_preempt_ctrlblk.rcb.donetail)
634 invoke_rcu_callbacks(); 651 invoke_rcu_kthread();
635 if (rcu_preempt_gp_in_progress() && 652 if (rcu_preempt_gp_in_progress() &&
636 rcu_cpu_blocking_cur_gp() && 653 rcu_cpu_blocking_cur_gp() &&
637 rcu_preempt_running_reader() > 0) 654 rcu_preempt_running_reader())
638 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 655 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
639} 656}
640 657
@@ -657,7 +674,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
657 */ 674 */
658static void rcu_preempt_process_callbacks(void) 675static void rcu_preempt_process_callbacks(void)
659{ 676{
660 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 677 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
661} 678}
662 679
663/* 680/*
@@ -680,6 +697,20 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
680} 697}
681EXPORT_SYMBOL_GPL(call_rcu); 698EXPORT_SYMBOL_GPL(call_rcu);
682 699
700void rcu_barrier(void)
701{
702 struct rcu_synchronize rcu;
703
704 init_rcu_head_on_stack(&rcu.head);
705 init_completion(&rcu.completion);
706 /* Will wake me after RCU finished. */
707 call_rcu(&rcu.head, wakeme_after_rcu);
708 /* Wait for it. */
709 wait_for_completion(&rcu.completion);
710 destroy_rcu_head_on_stack(&rcu.head);
711}
712EXPORT_SYMBOL_GPL(rcu_barrier);
713
683/* 714/*
684 * synchronize_rcu - wait until a grace period has elapsed. 715 * synchronize_rcu - wait until a grace period has elapsed.
685 * 716 *
@@ -691,11 +722,6 @@ EXPORT_SYMBOL_GPL(call_rcu);
691 */ 722 */
692void synchronize_rcu(void) 723void synchronize_rcu(void)
693{ 724{
694 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
695 !lock_is_held(&rcu_lock_map) &&
696 !lock_is_held(&rcu_sched_lock_map),
697 "Illegal synchronize_rcu() in RCU read-side critical section");
698
699#ifdef CONFIG_DEBUG_LOCK_ALLOC 725#ifdef CONFIG_DEBUG_LOCK_ALLOC
700 if (!rcu_scheduler_active) 726 if (!rcu_scheduler_active)
701 return; 727 return;
@@ -706,10 +732,7 @@ void synchronize_rcu(void)
706 return; 732 return;
707 733
708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */ 734 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
709 if (rcu_expedited) 735 rcu_barrier();
710 synchronize_rcu_expedited();
711 else
712 rcu_barrier();
713} 736}
714EXPORT_SYMBOL_GPL(synchronize_rcu); 737EXPORT_SYMBOL_GPL(synchronize_rcu);
715 738
@@ -782,9 +805,9 @@ void synchronize_rcu_expedited(void)
782 rpcp->exp_tasks = NULL; 805 rpcp->exp_tasks = NULL;
783 806
784 /* Wait for tail of ->blkd_tasks list to drain. */ 807 /* Wait for tail of ->blkd_tasks list to drain. */
785 if (!rcu_preempted_readers_exp()) { 808 if (!rcu_preempted_readers_exp())
786 local_irq_restore(flags); 809 local_irq_restore(flags);
787 } else { 810 else {
788 rcu_initiate_boost(); 811 rcu_initiate_boost();
789 local_irq_restore(flags); 812 local_irq_restore(flags);
790 wait_event(sync_rcu_preempt_exp_wq, 813 wait_event(sync_rcu_preempt_exp_wq,
@@ -805,9 +828,27 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
805 */ 828 */
806int rcu_preempt_needs_cpu(void) 829int rcu_preempt_needs_cpu(void)
807{ 830{
831 if (!rcu_preempt_running_reader())
832 rcu_preempt_cpu_qs();
808 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 833 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
809} 834}
810 835
836/*
837 * Check for a task exiting while in a preemptible-RCU read-side
838 * critical section, clean up if so. No need to issue warnings,
839 * as debug_check_no_locks_held() already does this if lockdep
840 * is enabled.
841 */
842void exit_rcu(void)
843{
844 struct task_struct *t = current;
845
846 if (t->rcu_read_lock_nesting == 0)
847 return;
848 t->rcu_read_lock_nesting = 1;
849 __rcu_read_unlock();
850}
851
811#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 852#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
812 853
813#ifdef CONFIG_RCU_TRACE 854#ifdef CONFIG_RCU_TRACE
@@ -823,6 +864,15 @@ static void show_tiny_preempt_stats(struct seq_file *m)
823#endif /* #ifdef CONFIG_RCU_TRACE */ 864#endif /* #ifdef CONFIG_RCU_TRACE */
824 865
825/* 866/*
867 * Because preemptible RCU does not exist, it is never necessary to
868 * boost preempted RCU readers.
869 */
870static int rcu_boost(void)
871{
872 return 0;
873}
874
875/*
826 * Because preemptible RCU does not exist, it never has any callbacks 876 * Because preemptible RCU does not exist, it never has any callbacks
827 * to check. 877 * to check.
828 */ 878 */
@@ -848,112 +898,6 @@ static void rcu_preempt_process_callbacks(void)
848 898
849#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 899#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
850 900
851#ifdef CONFIG_RCU_BOOST
852
853/*
854 * Wake up rcu_kthread() to process callbacks now eligible for invocation
855 * or to boost readers.
856 */
857static void invoke_rcu_callbacks(void)
858{
859 have_rcu_kthread_work = 1;
860 if (rcu_kthread_task != NULL)
861 wake_up(&rcu_kthread_wq);
862}
863
864#ifdef CONFIG_RCU_TRACE
865
866/*
867 * Is the current CPU running the RCU-callbacks kthread?
868 * Caller must have preemption disabled.
869 */
870static bool rcu_is_callbacks_kthread(void)
871{
872 return rcu_kthread_task == current;
873}
874
875#endif /* #ifdef CONFIG_RCU_TRACE */
876
877/*
878 * This kthread invokes RCU callbacks whose grace periods have
879 * elapsed. It is awakened as needed, and takes the place of the
880 * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
881 * This is a kthread, but it is never stopped, at least not until
882 * the system goes down.
883 */
884static int rcu_kthread(void *arg)
885{
886 unsigned long work;
887 unsigned long morework;
888 unsigned long flags;
889
890 for (;;) {
891 wait_event_interruptible(rcu_kthread_wq,
892 have_rcu_kthread_work != 0);
893 morework = rcu_boost();
894 local_irq_save(flags);
895 work = have_rcu_kthread_work;
896 have_rcu_kthread_work = morework;
897 local_irq_restore(flags);
898 if (work)
899 rcu_process_callbacks(NULL);
900 schedule_timeout_interruptible(1); /* Leave CPU for others. */
901 }
902
903 return 0; /* Not reached, but needed to shut gcc up. */
904}
905
906/*
907 * Spawn the kthread that invokes RCU callbacks.
908 */
909static int __init rcu_spawn_kthreads(void)
910{
911 struct sched_param sp;
912
913 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
914 sp.sched_priority = RCU_BOOST_PRIO;
915 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
916 return 0;
917}
918early_initcall(rcu_spawn_kthreads);
919
920#else /* #ifdef CONFIG_RCU_BOOST */
921
922/* Hold off callback invocation until early_initcall() time. */
923static int rcu_scheduler_fully_active __read_mostly;
924
925/*
926 * Start up softirq processing of callbacks.
927 */
928void invoke_rcu_callbacks(void)
929{
930 if (rcu_scheduler_fully_active)
931 raise_softirq(RCU_SOFTIRQ);
932}
933
934#ifdef CONFIG_RCU_TRACE
935
936/*
937 * There is no callback kthread, so this thread is never it.
938 */
939static bool rcu_is_callbacks_kthread(void)
940{
941 return false;
942}
943
944#endif /* #ifdef CONFIG_RCU_TRACE */
945
946static int __init rcu_scheduler_really_started(void)
947{
948 rcu_scheduler_fully_active = 1;
949 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
950 raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */
951 return 0;
952}
953early_initcall(rcu_scheduler_really_started);
954
955#endif /* #else #ifdef CONFIG_RCU_BOOST */
956
957#ifdef CONFIG_DEBUG_LOCK_ALLOC 901#ifdef CONFIG_DEBUG_LOCK_ALLOC
958#include <linux/kernel_stat.h> 902#include <linux/kernel_stat.h>
959 903
@@ -969,6 +913,12 @@ void __init rcu_scheduler_starting(void)
969 913
970#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 914#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
971 915
916#ifdef CONFIG_RCU_BOOST
917#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
918#else /* #ifdef CONFIG_RCU_BOOST */
919#define RCU_BOOST_PRIO 1
920#endif /* #else #ifdef CONFIG_RCU_BOOST */
921
972#ifdef CONFIG_RCU_TRACE 922#ifdef CONFIG_RCU_TRACE
973 923
974#ifdef CONFIG_RCU_BOOST 924#ifdef CONFIG_RCU_BOOST
@@ -994,9 +944,9 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
994{ 944{
995 unsigned long flags; 945 unsigned long flags;
996 946
997 local_irq_save(flags); 947 raw_local_irq_save(flags);
998 rcp->qlen -= n; 948 rcp->qlen -= n;
999 local_irq_restore(flags); 949 raw_local_irq_restore(flags);
1000} 950}
1001 951
1002/* 952/*
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 31dea01c85f..98f51b13bb7 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -49,27 +49,21 @@
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
53 "Josh Triplett <josh@freedesktop.org>");
53 54
54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
55static int nfakewriters = 4; /* # fake writer threads */ 56static int nfakewriters = 4; /* # fake writer threads */
56static int stat_interval = 60; /* Interval between stats, in seconds. */ 57static int stat_interval; /* Interval between stats, in seconds. */
57 /* Zero means "only at end of test". */ 58 /* Defaults to "only at end of test". */
58static bool verbose; /* Print more debug info. */ 59static int verbose; /* Print more debug info. */
59static bool test_no_idle_hz = true; 60static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
60 /* Test RCU support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
68static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
69static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
70static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
71static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
72static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
73static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
74static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
75static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -79,7 +73,7 @@ module_param(nreaders, int, 0444);
79MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 73MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
80module_param(nfakewriters, int, 0444); 74module_param(nfakewriters, int, 0444);
81MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 75MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
82module_param(stat_interval, int, 0644); 76module_param(stat_interval, int, 0444);
83MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 77MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
84module_param(verbose, bool, 0444); 78module_param(verbose, bool, 0444);
85MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 79MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
@@ -97,18 +91,6 @@ module_param(fqs_holdoff, int, 0444);
97MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
98module_param(fqs_stutter, int, 0444); 92module_param(fqs_stutter, int, 0444);
99MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
100module_param(n_barrier_cbs, int, 0444);
101MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
102module_param(onoff_interval, int, 0444);
103MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
104module_param(onoff_holdoff, int, 0444);
105MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
106module_param(shutdown_secs, int, 0444);
107MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
108module_param(stall_cpu, int, 0444);
109MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
110module_param(stall_cpu_holdoff, int, 0444);
111MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
112module_param(test_boost, int, 0444); 94module_param(test_boost, int, 0444);
113MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
114module_param(test_boost_interval, int, 0444); 96module_param(test_boost_interval, int, 0444);
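
All of the knobs above are ordinary module parameters, so a typical interactive run sets them at load time, for example (values purely illustrative):

    modprobe rcutorture torture_type=rcu stat_interval=30 verbose=1 test_boost=2

When rcutorture is built into the kernel instead, the same parameters can be passed on the boot command line in the usual rcutorture.<param>=<value> form.
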
@@ -120,11 +102,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
120 102
121#define TORTURE_FLAG "-torture:" 103#define TORTURE_FLAG "-torture:"
122#define PRINTK_STRING(s) \ 104#define PRINTK_STRING(s) \
123 do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) 105 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
124#define VERBOSE_PRINTK_STRING(s) \ 106#define VERBOSE_PRINTK_STRING(s) \
125 do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) 107 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
126#define VERBOSE_PRINTK_ERRSTRING(s) \ 108#define VERBOSE_PRINTK_ERRSTRING(s) \
127 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) 109 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
128 110
129static char printk_buf[4096]; 111static char printk_buf[4096];
130 112
@@ -137,13 +119,6 @@ static struct task_struct *shuffler_task;
137static struct task_struct *stutter_task; 119static struct task_struct *stutter_task;
138static struct task_struct *fqs_task; 120static struct task_struct *fqs_task;
139static struct task_struct *boost_tasks[NR_CPUS]; 121static struct task_struct *boost_tasks[NR_CPUS];
140static struct task_struct *shutdown_task;
141#ifdef CONFIG_HOTPLUG_CPU
142static struct task_struct *onoff_task;
143#endif /* #ifdef CONFIG_HOTPLUG_CPU */
144static struct task_struct *stall_task;
145static struct task_struct **barrier_cbs_tasks;
146static struct task_struct *barrier_task;
147 122
148#define RCU_TORTURE_PIPE_LEN 10 123#define RCU_TORTURE_PIPE_LEN 10
149 124
@@ -169,24 +144,11 @@ static atomic_t n_rcu_torture_alloc_fail;
169static atomic_t n_rcu_torture_free; 144static atomic_t n_rcu_torture_free;
170static atomic_t n_rcu_torture_mberror; 145static atomic_t n_rcu_torture_mberror;
171static atomic_t n_rcu_torture_error; 146static atomic_t n_rcu_torture_error;
172static long n_rcu_torture_barrier_error;
173static long n_rcu_torture_boost_ktrerror; 147static long n_rcu_torture_boost_ktrerror;
174static long n_rcu_torture_boost_rterror; 148static long n_rcu_torture_boost_rterror;
175static long n_rcu_torture_boost_failure; 149static long n_rcu_torture_boost_failure;
176static long n_rcu_torture_boosts; 150static long n_rcu_torture_boosts;
177static long n_rcu_torture_timers; 151static long n_rcu_torture_timers;
178static long n_offline_attempts;
179static long n_offline_successes;
180static unsigned long sum_offline;
181static int min_offline = -1;
182static int max_offline;
183static long n_online_attempts;
184static long n_online_successes;
185static unsigned long sum_online;
186static int min_online = -1;
187static int max_online;
188static long n_barrier_attempts;
189static long n_barrier_successes;
190static struct list_head rcu_torture_removed; 152static struct list_head rcu_torture_removed;
191static cpumask_var_t shuffle_tmp_mask; 153static cpumask_var_t shuffle_tmp_mask;
192 154
@@ -198,8 +160,6 @@ static int stutter_pause_test;
198#define RCUTORTURE_RUNNABLE_INIT 0 160#define RCUTORTURE_RUNNABLE_INIT 0
199#endif 161#endif
200int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
201module_param(rcutorture_runnable, int, 0444);
202MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
203 163
204#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
205#define rcu_can_boost() 1 165#define rcu_can_boost() 1
@@ -207,15 +167,9 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
207#define rcu_can_boost() 0 167#define rcu_can_boost() 0
208#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
209 169
210static unsigned long shutdown_time; /* jiffies to system shutdown. */
211static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
212DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
213 /* and boost task create/destroy. */ 172 /* and boost task create/destroy. */
214static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
215static bool barrier_phase; /* Test phase. */
216static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
217static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
218static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
219 173
220/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 174/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
221 175
@@ -228,9 +182,6 @@ static int fullstop = FULLSTOP_RMMOD;
228 */ 182 */
229static DEFINE_MUTEX(fullstop_mutex); 183static DEFINE_MUTEX(fullstop_mutex);
230 184
231/* Forward reference. */
232static void rcu_torture_cleanup(void);
233
234/* 185/*
235 * Detect and respond to a system shutdown. 186 * Detect and respond to a system shutdown.
236 */ 187 */
@@ -242,7 +193,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
242 if (fullstop == FULLSTOP_DONTSTOP) 193 if (fullstop == FULLSTOP_DONTSTOP)
243 fullstop = FULLSTOP_SHUTDOWN; 194 fullstop = FULLSTOP_SHUTDOWN;
244 else 195 else
245 pr_warn(/* but going down anyway, so... */ 196 printk(KERN_WARNING /* but going down anyway, so... */
246 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 197 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
247 mutex_unlock(&fullstop_mutex); 198 mutex_unlock(&fullstop_mutex);
248 return NOTIFY_DONE; 199 return NOTIFY_DONE;
@@ -255,7 +206,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
255static void rcutorture_shutdown_absorb(char *title) 206static void rcutorture_shutdown_absorb(char *title)
256{ 207{
257 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 208 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
258 pr_notice( 209 printk(KERN_NOTICE
259 "rcutorture thread %s parking due to system shutdown\n", 210 "rcutorture thread %s parking due to system shutdown\n",
260 title); 211 title);
261 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); 212 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -339,13 +290,13 @@ rcu_stutter_wait(char *title)
339 290
340struct rcu_torture_ops { 291struct rcu_torture_ops {
341 void (*init)(void); 292 void (*init)(void);
293 void (*cleanup)(void);
342 int (*readlock)(void); 294 int (*readlock)(void);
343 void (*read_delay)(struct rcu_random_state *rrsp); 295 void (*read_delay)(struct rcu_random_state *rrsp);
344 void (*readunlock)(int idx); 296 void (*readunlock)(int idx);
345 int (*completed)(void); 297 int (*completed)(void);
346 void (*deferred_free)(struct rcu_torture *p); 298 void (*deferred_free)(struct rcu_torture *p);
347 void (*sync)(void); 299 void (*sync)(void);
348 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
349 void (*cb_barrier)(void); 300 void (*cb_barrier)(void);
350 void (*fqs)(void); 301 void (*fqs)(void);
351 int (*stats)(char *page); 302 int (*stats)(char *page);
@@ -413,9 +364,8 @@ rcu_torture_cb(struct rcu_head *p)
413 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 364 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
414 rp->rtort_mbtest = 0; 365 rp->rtort_mbtest = 0;
415 rcu_torture_free(rp); 366 rcu_torture_free(rp);
416 } else { 367 } else
417 cur_ops->deferred_free(rp); 368 cur_ops->deferred_free(rp);
418 }
419} 369}
420 370
421static int rcu_no_completed(void) 371static int rcu_no_completed(void)
@@ -430,13 +380,13 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
430 380
431static struct rcu_torture_ops rcu_ops = { 381static struct rcu_torture_ops rcu_ops = {
432 .init = NULL, 382 .init = NULL,
383 .cleanup = NULL,
433 .readlock = rcu_torture_read_lock, 384 .readlock = rcu_torture_read_lock,
434 .read_delay = rcu_read_delay, 385 .read_delay = rcu_read_delay,
435 .readunlock = rcu_torture_read_unlock, 386 .readunlock = rcu_torture_read_unlock,
436 .completed = rcu_torture_completed, 387 .completed = rcu_torture_completed,
437 .deferred_free = rcu_torture_deferred_free, 388 .deferred_free = rcu_torture_deferred_free,
438 .sync = synchronize_rcu, 389 .sync = synchronize_rcu,
439 .call = call_rcu,
440 .cb_barrier = rcu_barrier, 390 .cb_barrier = rcu_barrier,
441 .fqs = rcu_force_quiescent_state, 391 .fqs = rcu_force_quiescent_state,
442 .stats = NULL, 392 .stats = NULL,
@@ -473,13 +423,13 @@ static void rcu_sync_torture_init(void)
473 423
474static struct rcu_torture_ops rcu_sync_ops = { 424static struct rcu_torture_ops rcu_sync_ops = {
475 .init = rcu_sync_torture_init, 425 .init = rcu_sync_torture_init,
426 .cleanup = NULL,
476 .readlock = rcu_torture_read_lock, 427 .readlock = rcu_torture_read_lock,
477 .read_delay = rcu_read_delay, 428 .read_delay = rcu_read_delay,
478 .readunlock = rcu_torture_read_unlock, 429 .readunlock = rcu_torture_read_unlock,
479 .completed = rcu_torture_completed, 430 .completed = rcu_torture_completed,
480 .deferred_free = rcu_sync_torture_deferred_free, 431 .deferred_free = rcu_sync_torture_deferred_free,
481 .sync = synchronize_rcu, 432 .sync = synchronize_rcu,
482 .call = NULL,
483 .cb_barrier = NULL, 433 .cb_barrier = NULL,
484 .fqs = rcu_force_quiescent_state, 434 .fqs = rcu_force_quiescent_state,
485 .stats = NULL, 435 .stats = NULL,
@@ -490,13 +440,13 @@ static struct rcu_torture_ops rcu_sync_ops = {
490 440
491static struct rcu_torture_ops rcu_expedited_ops = { 441static struct rcu_torture_ops rcu_expedited_ops = {
492 .init = rcu_sync_torture_init, 442 .init = rcu_sync_torture_init,
443 .cleanup = NULL,
493 .readlock = rcu_torture_read_lock, 444 .readlock = rcu_torture_read_lock,
494 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 445 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
495 .readunlock = rcu_torture_read_unlock, 446 .readunlock = rcu_torture_read_unlock,
496 .completed = rcu_no_completed, 447 .completed = rcu_no_completed,
497 .deferred_free = rcu_sync_torture_deferred_free, 448 .deferred_free = rcu_sync_torture_deferred_free,
498 .sync = synchronize_rcu_expedited, 449 .sync = synchronize_rcu_expedited,
499 .call = NULL,
500 .cb_barrier = NULL, 450 .cb_barrier = NULL,
501 .fqs = rcu_force_quiescent_state, 451 .fqs = rcu_force_quiescent_state,
502 .stats = NULL, 452 .stats = NULL,
@@ -530,15 +480,39 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
530 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
531} 481}
532 482
483struct rcu_bh_torture_synchronize {
484 struct rcu_head head;
485 struct completion completion;
486};
487
488static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
489{
490 struct rcu_bh_torture_synchronize *rcu;
491
492 rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
493 complete(&rcu->completion);
494}
495
496static void rcu_bh_torture_synchronize(void)
497{
498 struct rcu_bh_torture_synchronize rcu;
499
500 init_rcu_head_on_stack(&rcu.head);
501 init_completion(&rcu.completion);
502 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
503 wait_for_completion(&rcu.completion);
504 destroy_rcu_head_on_stack(&rcu.head);
505}
506
533static struct rcu_torture_ops rcu_bh_ops = { 507static struct rcu_torture_ops rcu_bh_ops = {
534 .init = NULL, 508 .init = NULL,
509 .cleanup = NULL,
535 .readlock = rcu_bh_torture_read_lock, 510 .readlock = rcu_bh_torture_read_lock,
536 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 511 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
537 .readunlock = rcu_bh_torture_read_unlock, 512 .readunlock = rcu_bh_torture_read_unlock,
538 .completed = rcu_bh_torture_completed, 513 .completed = rcu_bh_torture_completed,
539 .deferred_free = rcu_bh_torture_deferred_free, 514 .deferred_free = rcu_bh_torture_deferred_free,
540 .sync = synchronize_rcu_bh, 515 .sync = rcu_bh_torture_synchronize,
541 .call = call_rcu_bh,
542 .cb_barrier = rcu_barrier_bh, 516 .cb_barrier = rcu_barrier_bh,
543 .fqs = rcu_bh_force_quiescent_state, 517 .fqs = rcu_bh_force_quiescent_state,
544 .stats = NULL, 518 .stats = NULL,
@@ -548,13 +522,13 @@ static struct rcu_torture_ops rcu_bh_ops = {
548 522
549static struct rcu_torture_ops rcu_bh_sync_ops = { 523static struct rcu_torture_ops rcu_bh_sync_ops = {
550 .init = rcu_sync_torture_init, 524 .init = rcu_sync_torture_init,
525 .cleanup = NULL,
551 .readlock = rcu_bh_torture_read_lock, 526 .readlock = rcu_bh_torture_read_lock,
552 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 527 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
553 .readunlock = rcu_bh_torture_read_unlock, 528 .readunlock = rcu_bh_torture_read_unlock,
554 .completed = rcu_bh_torture_completed, 529 .completed = rcu_bh_torture_completed,
555 .deferred_free = rcu_sync_torture_deferred_free, 530 .deferred_free = rcu_sync_torture_deferred_free,
556 .sync = synchronize_rcu_bh, 531 .sync = rcu_bh_torture_synchronize,
557 .call = NULL,
558 .cb_barrier = NULL, 532 .cb_barrier = NULL,
559 .fqs = rcu_bh_force_quiescent_state, 533 .fqs = rcu_bh_force_quiescent_state,
560 .stats = NULL, 534 .stats = NULL,
@@ -562,27 +536,23 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
562 .name = "rcu_bh_sync" 536 .name = "rcu_bh_sync"
563}; 537};
564 538
565static struct rcu_torture_ops rcu_bh_expedited_ops = {
566 .init = rcu_sync_torture_init,
567 .readlock = rcu_bh_torture_read_lock,
568 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
569 .readunlock = rcu_bh_torture_read_unlock,
570 .completed = rcu_bh_torture_completed,
571 .deferred_free = rcu_sync_torture_deferred_free,
572 .sync = synchronize_rcu_bh_expedited,
573 .call = NULL,
574 .cb_barrier = NULL,
575 .fqs = rcu_bh_force_quiescent_state,
576 .stats = NULL,
577 .irq_capable = 1,
578 .name = "rcu_bh_expedited"
579};
580
581/* 539/*
582 * Definitions for srcu torture testing. 540 * Definitions for srcu torture testing.
583 */ 541 */
584 542
585DEFINE_STATIC_SRCU(srcu_ctl); 543static struct srcu_struct srcu_ctl;
544
545static void srcu_torture_init(void)
546{
547 init_srcu_struct(&srcu_ctl);
548 rcu_sync_torture_init();
549}
550
551static void srcu_torture_cleanup(void)
552{
553 synchronize_srcu(&srcu_ctl);
554 cleanup_srcu_struct(&srcu_ctl);
555}
586 556
587static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) 557static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
588{ 558{
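
The srcu_torture_init()/srcu_torture_cleanup() pair added above reflects the fact that, unlike the built-in RCU flavors, an srcu_struct carries per-CPU state that must be explicitly set up and torn down. A minimal usage sketch with illustrative names:

    static struct srcu_struct example_srcu;

    static int example_setup(void)
    {
            return init_srcu_struct(&example_srcu);   /* allocates per-CPU counters */
    }

    static void example_reader(void)
    {
            int idx = srcu_read_lock(&example_srcu);
            /* ... access SRCU-protected data here ... */
            srcu_read_unlock(&example_srcu, idx);
    }

    static void example_teardown(void)
    {
            synchronize_srcu(&example_srcu);     /* wait out current readers    */
            cleanup_srcu_struct(&example_srcu);  /* then free the per-CPU state */
    }
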
@@ -614,27 +584,11 @@ static int srcu_torture_completed(void)
614 return srcu_batches_completed(&srcu_ctl); 584 return srcu_batches_completed(&srcu_ctl);
615} 585}
616 586
617static void srcu_torture_deferred_free(struct rcu_torture *rp)
618{
619 call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
620}
621
622static void srcu_torture_synchronize(void) 587static void srcu_torture_synchronize(void)
623{ 588{
624 synchronize_srcu(&srcu_ctl); 589 synchronize_srcu(&srcu_ctl);
625} 590}
626 591
627static void srcu_torture_call(struct rcu_head *head,
628 void (*func)(struct rcu_head *head))
629{
630 call_srcu(&srcu_ctl, head, func);
631}
632
633static void srcu_torture_barrier(void)
634{
635 srcu_barrier(&srcu_ctl);
636}
637
638static int srcu_torture_stats(char *page) 592static int srcu_torture_stats(char *page)
639{ 593{
640 int cnt = 0; 594 int cnt = 0;
@@ -644,7 +598,7 @@ static int srcu_torture_stats(char *page)
644 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 598 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
645 torture_type, TORTURE_FLAG, idx); 599 torture_type, TORTURE_FLAG, idx);
646 for_each_possible_cpu(cpu) { 600 for_each_possible_cpu(cpu) {
647 cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, 601 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
648 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 602 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
649 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 603 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
650 } 604 }
@@ -653,69 +607,17 @@ static int srcu_torture_stats(char *page)
653} 607}
654 608
655static struct rcu_torture_ops srcu_ops = { 609static struct rcu_torture_ops srcu_ops = {
656 .init = rcu_sync_torture_init, 610 .init = srcu_torture_init,
611 .cleanup = srcu_torture_cleanup,
657 .readlock = srcu_torture_read_lock, 612 .readlock = srcu_torture_read_lock,
658 .read_delay = srcu_read_delay, 613 .read_delay = srcu_read_delay,
659 .readunlock = srcu_torture_read_unlock, 614 .readunlock = srcu_torture_read_unlock,
660 .completed = srcu_torture_completed, 615 .completed = srcu_torture_completed,
661 .deferred_free = srcu_torture_deferred_free,
662 .sync = srcu_torture_synchronize,
663 .call = srcu_torture_call,
664 .cb_barrier = srcu_torture_barrier,
665 .stats = srcu_torture_stats,
666 .name = "srcu"
667};
668
669static struct rcu_torture_ops srcu_sync_ops = {
670 .init = rcu_sync_torture_init,
671 .readlock = srcu_torture_read_lock,
672 .read_delay = srcu_read_delay,
673 .readunlock = srcu_torture_read_unlock,
674 .completed = srcu_torture_completed,
675 .deferred_free = rcu_sync_torture_deferred_free,
676 .sync = srcu_torture_synchronize,
677 .call = NULL,
678 .cb_barrier = NULL,
679 .stats = srcu_torture_stats,
680 .name = "srcu_sync"
681};
682
683static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
684{
685 return srcu_read_lock_raw(&srcu_ctl);
686}
687
688static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
689{
690 srcu_read_unlock_raw(&srcu_ctl, idx);
691}
692
693static struct rcu_torture_ops srcu_raw_ops = {
694 .init = rcu_sync_torture_init,
695 .readlock = srcu_torture_read_lock_raw,
696 .read_delay = srcu_read_delay,
697 .readunlock = srcu_torture_read_unlock_raw,
698 .completed = srcu_torture_completed,
699 .deferred_free = srcu_torture_deferred_free,
700 .sync = srcu_torture_synchronize,
701 .call = NULL,
702 .cb_barrier = NULL,
703 .stats = srcu_torture_stats,
704 .name = "srcu_raw"
705};
706
707static struct rcu_torture_ops srcu_raw_sync_ops = {
708 .init = rcu_sync_torture_init,
709 .readlock = srcu_torture_read_lock_raw,
710 .read_delay = srcu_read_delay,
711 .readunlock = srcu_torture_read_unlock_raw,
712 .completed = srcu_torture_completed,
713 .deferred_free = rcu_sync_torture_deferred_free, 616 .deferred_free = rcu_sync_torture_deferred_free,
714 .sync = srcu_torture_synchronize, 617 .sync = srcu_torture_synchronize,
715 .call = NULL,
716 .cb_barrier = NULL, 618 .cb_barrier = NULL,
717 .stats = srcu_torture_stats, 619 .stats = srcu_torture_stats,
718 .name = "srcu_raw_sync" 620 .name = "srcu"
719}; 621};
720 622
721static void srcu_torture_synchronize_expedited(void) 623static void srcu_torture_synchronize_expedited(void)
@@ -724,14 +626,14 @@ static void srcu_torture_synchronize_expedited(void)
724} 626}
725 627
726static struct rcu_torture_ops srcu_expedited_ops = { 628static struct rcu_torture_ops srcu_expedited_ops = {
727 .init = rcu_sync_torture_init, 629 .init = srcu_torture_init,
630 .cleanup = srcu_torture_cleanup,
728 .readlock = srcu_torture_read_lock, 631 .readlock = srcu_torture_read_lock,
729 .read_delay = srcu_read_delay, 632 .read_delay = srcu_read_delay,
730 .readunlock = srcu_torture_read_unlock, 633 .readunlock = srcu_torture_read_unlock,
731 .completed = srcu_torture_completed, 634 .completed = srcu_torture_completed,
732 .deferred_free = rcu_sync_torture_deferred_free, 635 .deferred_free = rcu_sync_torture_deferred_free,
733 .sync = srcu_torture_synchronize_expedited, 636 .sync = srcu_torture_synchronize_expedited,
734 .call = NULL,
735 .cb_barrier = NULL, 637 .cb_barrier = NULL,
736 .stats = srcu_torture_stats, 638 .stats = srcu_torture_stats,
737 .name = "srcu_expedited" 639 .name = "srcu_expedited"
@@ -757,14 +659,20 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
757 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 659 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
758} 660}
759 661
662static void sched_torture_synchronize(void)
663{
664 synchronize_sched();
665}
666
760static struct rcu_torture_ops sched_ops = { 667static struct rcu_torture_ops sched_ops = {
761 .init = rcu_sync_torture_init, 668 .init = rcu_sync_torture_init,
669 .cleanup = NULL,
762 .readlock = sched_torture_read_lock, 670 .readlock = sched_torture_read_lock,
763 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 671 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
764 .readunlock = sched_torture_read_unlock, 672 .readunlock = sched_torture_read_unlock,
765 .completed = rcu_no_completed, 673 .completed = rcu_no_completed,
766 .deferred_free = rcu_sched_torture_deferred_free, 674 .deferred_free = rcu_sched_torture_deferred_free,
767 .sync = synchronize_sched, 675 .sync = sched_torture_synchronize,
768 .cb_barrier = rcu_barrier_sched, 676 .cb_barrier = rcu_barrier_sched,
769 .fqs = rcu_sched_force_quiescent_state, 677 .fqs = rcu_sched_force_quiescent_state,
770 .stats = NULL, 678 .stats = NULL,
@@ -774,12 +682,13 @@ static struct rcu_torture_ops sched_ops = {
774 682
775static struct rcu_torture_ops sched_sync_ops = { 683static struct rcu_torture_ops sched_sync_ops = {
776 .init = rcu_sync_torture_init, 684 .init = rcu_sync_torture_init,
685 .cleanup = NULL,
777 .readlock = sched_torture_read_lock, 686 .readlock = sched_torture_read_lock,
778 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 687 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
779 .readunlock = sched_torture_read_unlock, 688 .readunlock = sched_torture_read_unlock,
780 .completed = rcu_no_completed, 689 .completed = rcu_no_completed,
781 .deferred_free = rcu_sync_torture_deferred_free, 690 .deferred_free = rcu_sync_torture_deferred_free,
782 .sync = synchronize_sched, 691 .sync = sched_torture_synchronize,
783 .cb_barrier = NULL, 692 .cb_barrier = NULL,
784 .fqs = rcu_sched_force_quiescent_state, 693 .fqs = rcu_sched_force_quiescent_state,
785 .stats = NULL, 694 .stats = NULL,
@@ -788,6 +697,7 @@ static struct rcu_torture_ops sched_sync_ops = {
788 697
789static struct rcu_torture_ops sched_expedited_ops = { 698static struct rcu_torture_ops sched_expedited_ops = {
790 .init = rcu_sync_torture_init, 699 .init = rcu_sync_torture_init,
700 .cleanup = NULL,
791 .readlock = sched_torture_read_lock, 701 .readlock = sched_torture_read_lock,
792 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 702 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
793 .readunlock = sched_torture_read_unlock, 703 .readunlock = sched_torture_read_unlock,
@@ -844,7 +754,7 @@ static int rcu_torture_boost(void *arg)
844 do { 754 do {
845 /* Wait for the next test interval. */ 755 /* Wait for the next test interval. */
846 oldstarttime = boost_starttime; 756 oldstarttime = boost_starttime;
847 while (ULONG_CMP_LT(jiffies, oldstarttime)) { 757 while (jiffies - oldstarttime > ULONG_MAX / 2) {
848 schedule_timeout_uninterruptible(1); 758 schedule_timeout_uninterruptible(1);
849 rcu_stutter_wait("rcu_torture_boost"); 759 rcu_stutter_wait("rcu_torture_boost");
850 if (kthread_should_stop() || 760 if (kthread_should_stop() ||
@@ -855,7 +765,7 @@ static int rcu_torture_boost(void *arg)
855 /* Do one boost-test interval. */ 765 /* Do one boost-test interval. */
856 endtime = oldstarttime + test_boost_duration * HZ; 766 endtime = oldstarttime + test_boost_duration * HZ;
857 call_rcu_time = jiffies; 767 call_rcu_time = jiffies;
858 while (ULONG_CMP_LT(jiffies, endtime)) { 768 while (jiffies - endtime > ULONG_MAX / 2) {
859 /* If we don't have a callback in flight, post one. */ 769 /* If we don't have a callback in flight, post one. */
860 if (!rbi.inflight) { 770 if (!rbi.inflight) {
861 smp_mb(); /* RCU core before ->inflight = 1. */ 771 smp_mb(); /* RCU core before ->inflight = 1. */
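
Both spellings of the loop condition in the two hunks above are wrap-safe "has jiffies reached the target yet?" tests: ULONG_CMP_LT(a, b) and the open-coded (a - b > ULONG_MAX / 2) treat the unsigned difference as a signed distance, so a jiffies wrap cannot turn a short wait into a nearly infinite one. The conventional way to write the same thing uses the helpers from <linux/jiffies.h>; a sketch with an illustrative name:

    #include <linux/jiffies.h>

    /* Keep waiting until "deadline" has been reached, wrap-safely. */
    static inline bool example_not_yet(unsigned long deadline)
    {
            /* Essentially the same test as ULONG_CMP_LT(jiffies, deadline). */
            return time_before(jiffies, deadline);
    }
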
@@ -882,8 +792,7 @@ static int rcu_torture_boost(void *arg)
882 * interval. Besides, we are running at RT priority, 792 * interval. Besides, we are running at RT priority,
883 * so delays should be relatively rare. 793 * so delays should be relatively rare.
884 */ 794 */
885 while (oldstarttime == boost_starttime && 795 while (oldstarttime == boost_starttime) {
886 !kthread_should_stop()) {
887 if (mutex_trylock(&boost_mutex)) { 796 if (mutex_trylock(&boost_mutex)) {
888 boost_starttime = jiffies + 797 boost_starttime = jiffies +
889 test_boost_interval * HZ; 798 test_boost_interval * HZ;
@@ -900,11 +809,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
900 809
901 /* Clean up and exit. */ 810 /* Clean up and exit. */
902 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
903 rcutorture_shutdown_absorb("rcu_torture_boost"); 813 rcutorture_shutdown_absorb("rcu_torture_boost");
904 while (!kthread_should_stop() || rbi.inflight) 814 while (!kthread_should_stop() || rbi.inflight)
905 schedule_timeout_uninterruptible(1); 815 schedule_timeout_uninterruptible(1);
906 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
907 destroy_rcu_head_on_stack(&rbi.rcu);
908 return 0; 817 return 0;
909} 818}
910 819
@@ -922,13 +831,11 @@ rcu_torture_fqs(void *arg)
922 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 831 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
923 do { 832 do {
924 fqs_resume_time = jiffies + fqs_stutter * HZ; 833 fqs_resume_time = jiffies + fqs_stutter * HZ;
925 while (ULONG_CMP_LT(jiffies, fqs_resume_time) && 834 while (jiffies - fqs_resume_time > LONG_MAX) {
926 !kthread_should_stop()) {
927 schedule_timeout_interruptible(1); 835 schedule_timeout_interruptible(1);
928 } 836 }
929 fqs_burst_remaining = fqs_duration; 837 fqs_burst_remaining = fqs_duration;
930 while (fqs_burst_remaining > 0 && 838 while (fqs_burst_remaining > 0) {
931 !kthread_should_stop()) {
932 cur_ops->fqs(); 839 cur_ops->fqs();
933 udelay(fqs_holdoff); 840 udelay(fqs_holdoff);
934 fqs_burst_remaining -= fqs_holdoff; 841 fqs_burst_remaining -= fqs_holdoff;
@@ -1005,11 +912,7 @@ rcu_torture_fakewriter(void *arg)
1005 do { 912 do {
1006 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 913 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
1007 udelay(rcu_random(&rand) & 0x3ff); 914 udelay(rcu_random(&rand) & 0x3ff);
1008 if (cur_ops->cb_barrier != NULL && 915 cur_ops->sync();
1009 rcu_random(&rand) % (nfakewriters * 8) == 0)
1010 cur_ops->cb_barrier();
1011 else
1012 cur_ops->sync();
1013 rcu_stutter_wait("rcu_torture_fakewriter"); 916 rcu_stutter_wait("rcu_torture_fakewriter");
1014 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 917 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1015 918
@@ -1020,18 +923,6 @@ rcu_torture_fakewriter(void *arg)
1020 return 0; 923 return 0;
1021} 924}
1022 925
1023void rcutorture_trace_dump(void)
1024{
1025 static atomic_t beenhere = ATOMIC_INIT(0);
1026
1027 if (atomic_read(&beenhere))
1028 return;
1029 if (atomic_xchg(&beenhere, 1) != 0)
1030 return;
1031 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
1032 ftrace_dump(DUMP_ALL);
1033}
1034
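The removed rcutorture_trace_dump() is a run-at-most-once helper: the plain atomic_read() is a cheap early exit and the atomic_xchg() settles any race between concurrent callers. Userspace sketch of the same guard, with printf standing in for ftrace_dump() (dump_once is an invented name):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int beenhere;

static void dump_once(void)
{
        if (atomic_load(&beenhere))
                return;                         /* cheap fast path */
        if (atomic_exchange(&beenhere, 1))
                return;                         /* another caller won the race */
        printf("dumping trace buffer\n");       /* stands in for ftrace_dump() */
}

int main(void)
{
        dump_once();
        dump_once();                            /* second call is a no-op */
        return 0;
}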
1035/* 926/*
1036 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 927 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
1037 * incrementing the corresponding element of the pipeline array. The 928 * incrementing the corresponding element of the pipeline array. The
@@ -1058,7 +949,6 @@ static void rcu_torture_timer(unsigned long unused)
1058 cur_ops->readunlock(idx); 949 cur_ops->readunlock(idx);
1059 return; 950 return;
1060 } 951 }
1061 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1062 if (p->rtort_mbtest == 0) 952 if (p->rtort_mbtest == 0)
1063 atomic_inc(&n_rcu_torture_mberror); 953 atomic_inc(&n_rcu_torture_mberror);
1064 spin_lock(&rand_lock); 954 spin_lock(&rand_lock);
@@ -1071,8 +961,6 @@ static void rcu_torture_timer(unsigned long unused)
1071 /* Should not happen, but... */ 961 /* Should not happen, but... */
1072 pipe_count = RCU_TORTURE_PIPE_LEN; 962 pipe_count = RCU_TORTURE_PIPE_LEN;
1073 } 963 }
1074 if (pipe_count > 1)
1075 rcutorture_trace_dump();
1076 __this_cpu_inc(rcu_torture_count[pipe_count]); 964 __this_cpu_inc(rcu_torture_count[pipe_count]);
1077 completed = cur_ops->completed() - completed; 965 completed = cur_ops->completed() - completed;
1078 if (completed > RCU_TORTURE_PIPE_LEN) { 966 if (completed > RCU_TORTURE_PIPE_LEN) {
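The pipe_count logic in this hunk (and in the reader hunk below) is a small histogram: each read is binned by how many grace periods the protected object has survived, and anything beyond RCU_TORTURE_PIPE_LEN is clamped into a should-not-happen bucket. A single-array userspace stand-in for the per-CPU rcu_torture_count[] (constant name reused for clarity, everything else invented):

#include <stdio.h>

#define RCU_TORTURE_PIPE_LEN 10

static unsigned long pipe_hist[RCU_TORTURE_PIPE_LEN + 1];

static void note_read(int age_in_grace_periods)
{
        if (age_in_grace_periods > RCU_TORTURE_PIPE_LEN)
                age_in_grace_periods = RCU_TORTURE_PIPE_LEN;    /* overflow bucket */
        pipe_hist[age_in_grace_periods]++;
}

int main(void)
{
        note_read(0);
        note_read(3);
        note_read(42);          /* clamped; would indicate a too-long grace period */
        for (int i = 0; i <= RCU_TORTURE_PIPE_LEN; i++)
                printf("%lu ", pipe_hist[i]);
        printf("\n");
        return 0;
}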
@@ -1122,7 +1010,6 @@ rcu_torture_reader(void *arg)
1122 schedule_timeout_interruptible(HZ); 1010 schedule_timeout_interruptible(HZ);
1123 continue; 1011 continue;
1124 } 1012 }
1125 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1126 if (p->rtort_mbtest == 0) 1013 if (p->rtort_mbtest == 0)
1127 atomic_inc(&n_rcu_torture_mberror); 1014 atomic_inc(&n_rcu_torture_mberror);
1128 cur_ops->read_delay(&rand); 1015 cur_ops->read_delay(&rand);
@@ -1132,8 +1019,6 @@ rcu_torture_reader(void *arg)
1132 /* Should not happen, but... */ 1019 /* Should not happen, but... */
1133 pipe_count = RCU_TORTURE_PIPE_LEN; 1020 pipe_count = RCU_TORTURE_PIPE_LEN;
1134 } 1021 }
1135 if (pipe_count > 1)
1136 rcutorture_trace_dump();
1137 __this_cpu_inc(rcu_torture_count[pipe_count]); 1022 __this_cpu_inc(rcu_torture_count[pipe_count]);
1138 completed = cur_ops->completed() - completed; 1023 completed = cur_ops->completed() - completed;
1139 if (completed > RCU_TORTURE_PIPE_LEN) { 1024 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1179,39 +1064,28 @@ rcu_torture_printk(char *page)
1179 } 1064 }
1180 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1065 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1181 cnt += sprintf(&page[cnt], 1066 cnt += sprintf(&page[cnt],
1182 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", 1067 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1068 "rtmbe: %d rtbke: %ld rtbre: %ld "
1069 "rtbf: %ld rtb: %ld nt: %ld",
1183 rcu_torture_current, 1070 rcu_torture_current,
1184 rcu_torture_current_version, 1071 rcu_torture_current_version,
1185 list_empty(&rcu_torture_freelist), 1072 list_empty(&rcu_torture_freelist),
1186 atomic_read(&n_rcu_torture_alloc), 1073 atomic_read(&n_rcu_torture_alloc),
1187 atomic_read(&n_rcu_torture_alloc_fail), 1074 atomic_read(&n_rcu_torture_alloc_fail),
1188 atomic_read(&n_rcu_torture_free)); 1075 atomic_read(&n_rcu_torture_free),
1189 cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ",
1190 atomic_read(&n_rcu_torture_mberror), 1076 atomic_read(&n_rcu_torture_mberror),
1191 n_rcu_torture_boost_ktrerror, 1077 n_rcu_torture_boost_ktrerror,
1192 n_rcu_torture_boost_rterror); 1078 n_rcu_torture_boost_rterror,
1193 cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ",
1194 n_rcu_torture_boost_failure, 1079 n_rcu_torture_boost_failure,
1195 n_rcu_torture_boosts, 1080 n_rcu_torture_boosts,
1196 n_rcu_torture_timers); 1081 n_rcu_torture_timers);
1197 cnt += sprintf(&page[cnt],
1198 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
1199 n_online_successes, n_online_attempts,
1200 n_offline_successes, n_offline_attempts,
1201 min_online, max_online,
1202 min_offline, max_offline,
1203 sum_online, sum_offline, HZ);
1204 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
1205 n_barrier_successes,
1206 n_barrier_attempts,
1207 n_rcu_torture_barrier_error);
1208 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1209 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1082 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1210 n_rcu_torture_barrier_error != 0 ||
1211 n_rcu_torture_boost_ktrerror != 0 || 1083 n_rcu_torture_boost_ktrerror != 0 ||
1212 n_rcu_torture_boost_rterror != 0 || 1084 n_rcu_torture_boost_rterror != 0 ||
1213 n_rcu_torture_boost_failure != 0 || 1085 n_rcu_torture_boost_failure != 0)
1214 i > 1) { 1086 cnt += sprintf(&page[cnt], " !!!");
1087 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1088 if (i > 1) {
1215 cnt += sprintf(&page[cnt], "!!! "); 1089 cnt += sprintf(&page[cnt], "!!! ");
1216 atomic_inc(&n_rcu_torture_error); 1090 atomic_inc(&n_rcu_torture_error);
1217 WARN_ON_ONCE(1); 1091 WARN_ON_ONCE(1);
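rcu_torture_printk() assembles its report with the usual cnt += sprintf(&page[cnt], ...) accumulation; this hunk only changes how many calls the text is split across and which counters are included. Trivial userspace sketch of the pattern (buffer size and values made up):

#include <stdio.h>

int main(void)
{
        char page[256];
        int cnt = 0;

        cnt += sprintf(&page[cnt], "rtc: %p ver: %lu ", (void *)0, 42UL);
        cnt += sprintf(&page[cnt], "tfle: %d rta: %d", 1, 7);
        printf("%d bytes: %s\n", cnt, page);
        return 0;
}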
@@ -1249,7 +1123,7 @@ rcu_torture_stats_print(void)
1249 int cnt; 1123 int cnt;
1250 1124
1251 cnt = rcu_torture_printk(printk_buf); 1125 cnt = rcu_torture_printk(printk_buf);
1252 pr_alert("%s", printk_buf); 1126 printk(KERN_ALERT "%s", printk_buf);
1253} 1127}
1254 1128
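The pr_alert() versus printk(KERN_ALERT ...) churn in these hunks is mechanical; pr_alert() is defined in <linux/printk.h> essentially as:

#define pr_alert(fmt, ...) printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)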
1255/* 1129/*
@@ -1362,24 +1236,18 @@ rcu_torture_stutter(void *arg)
1362static inline void 1236static inline void
1363rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1237rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1364{ 1238{
1365 pr_alert("%s" TORTURE_FLAG 1239 printk(KERN_ALERT "%s" TORTURE_FLAG
1366 "--- %s: nreaders=%d nfakewriters=%d " 1240 "--- %s: nreaders=%d nfakewriters=%d "
1367 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1241 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1368 "shuffle_interval=%d stutter=%d irqreader=%d " 1242 "shuffle_interval=%d stutter=%d irqreader=%d "
1369 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1243 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1370 "test_boost=%d/%d test_boost_interval=%d " 1244 "test_boost=%d/%d test_boost_interval=%d "
1371 "test_boost_duration=%d shutdown_secs=%d " 1245 "test_boost_duration=%d\n",
1372 "stall_cpu=%d stall_cpu_holdoff=%d " 1246 torture_type, tag, nrealreaders, nfakewriters,
1373 "n_barrier_cbs=%d " 1247 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1374 "onoff_interval=%d onoff_holdoff=%d\n", 1248 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1375 torture_type, tag, nrealreaders, nfakewriters, 1249 test_boost, cur_ops->can_boost,
1376 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1250 test_boost_interval, test_boost_duration);
1377 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1378 test_boost, cur_ops->can_boost,
1379 test_boost_interval, test_boost_duration, shutdown_secs,
1380 stall_cpu, stall_cpu_holdoff,
1381 n_barrier_cbs,
1382 onoff_interval, onoff_holdoff);
1383} 1251}
1384 1252
1385static struct notifier_block rcutorture_shutdown_nb = { 1253static struct notifier_block rcutorture_shutdown_nb = {
@@ -1400,7 +1268,6 @@ static void rcutorture_booster_cleanup(int cpu)
1400 1268
1401 /* This must be outside of the mutex, otherwise deadlock! */ 1269 /* This must be outside of the mutex, otherwise deadlock! */
1402 kthread_stop(t); 1270 kthread_stop(t);
1403 boost_tasks[cpu] = NULL;
1404} 1271}
1405 1272
1406static int rcutorture_booster_init(int cpu) 1273static int rcutorture_booster_init(int cpu)
@@ -1413,9 +1280,8 @@ static int rcutorture_booster_init(int cpu)
1413 /* Don't allow time recalculation while creating a new task. */ 1280 /* Don't allow time recalculation while creating a new task. */
1414 mutex_lock(&boost_mutex); 1281 mutex_lock(&boost_mutex);
1415 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1282 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1416 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, 1283 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1417 cpu_to_node(cpu), 1284 "rcu_torture_boost");
1418 "rcu_torture_boost");
1419 if (IS_ERR(boost_tasks[cpu])) { 1285 if (IS_ERR(boost_tasks[cpu])) {
1420 retval = PTR_ERR(boost_tasks[cpu]); 1286 retval = PTR_ERR(boost_tasks[cpu]);
1421 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1287 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
@@ -1430,376 +1296,6 @@ static int rcutorture_booster_init(int cpu)
1430 return 0; 1296 return 0;
1431} 1297}
1432 1298
1433/*
1434 * Cause the rcutorture test to shutdown the system after the test has
1435 * run for the time specified by the shutdown_secs module parameter.
1436 */
1437static int
1438rcu_torture_shutdown(void *arg)
1439{
1440 long delta;
1441 unsigned long jiffies_snap;
1442
1443 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1444 jiffies_snap = ACCESS_ONCE(jiffies);
1445 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1446 !kthread_should_stop()) {
1447 delta = shutdown_time - jiffies_snap;
1448 if (verbose)
1449 pr_alert("%s" TORTURE_FLAG
1450 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1451 torture_type, delta);
1452 schedule_timeout_interruptible(delta);
1453 jiffies_snap = ACCESS_ONCE(jiffies);
1454 }
1455 if (kthread_should_stop()) {
1456 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1457 return 0;
1458 }
1459
1460 /* OK, shut down the system. */
1461
1462 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1463 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1464 rcu_torture_cleanup(); /* Get the success/failure message. */
1465 kernel_power_off(); /* Shut down the system. */
1466 return 0;
1467}
1468
1469#ifdef CONFIG_HOTPLUG_CPU
1470
1471/*
1472 * Execute random CPU-hotplug operations at the interval specified
1473 * by the onoff_interval.
1474 */
1475static int __cpuinit
1476rcu_torture_onoff(void *arg)
1477{
1478 int cpu;
1479 unsigned long delta;
1480 int maxcpu = -1;
1481 DEFINE_RCU_RANDOM(rand);
1482 int ret;
1483 unsigned long starttime;
1484
1485 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1486 for_each_online_cpu(cpu)
1487 maxcpu = cpu;
1488 WARN_ON(maxcpu < 0);
1489 if (onoff_holdoff > 0) {
1490 VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
1491 schedule_timeout_interruptible(onoff_holdoff * HZ);
1492 VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
1493 }
1494 while (!kthread_should_stop()) {
1495 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1496 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1497 if (verbose)
1498 pr_alert("%s" TORTURE_FLAG
1499 "rcu_torture_onoff task: offlining %d\n",
1500 torture_type, cpu);
1501 starttime = jiffies;
1502 n_offline_attempts++;
1503 ret = cpu_down(cpu);
1504 if (ret) {
1505 if (verbose)
1506 pr_alert("%s" TORTURE_FLAG
1507 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1508 torture_type, cpu, ret);
1509 } else {
1510 if (verbose)
1511 pr_alert("%s" TORTURE_FLAG
1512 "rcu_torture_onoff task: offlined %d\n",
1513 torture_type, cpu);
1514 n_offline_successes++;
1515 delta = jiffies - starttime;
1516 sum_offline += delta;
1517 if (min_offline < 0) {
1518 min_offline = delta;
1519 max_offline = delta;
1520 }
1521 if (min_offline > delta)
1522 min_offline = delta;
1523 if (max_offline < delta)
1524 max_offline = delta;
1525 }
1526 } else if (cpu_is_hotpluggable(cpu)) {
1527 if (verbose)
1528 pr_alert("%s" TORTURE_FLAG
1529 "rcu_torture_onoff task: onlining %d\n",
1530 torture_type, cpu);
1531 starttime = jiffies;
1532 n_online_attempts++;
1533 if (cpu_up(cpu) == 0) {
1534 if (verbose)
1535 pr_alert("%s" TORTURE_FLAG
1536 "rcu_torture_onoff task: onlined %d\n",
1537 torture_type, cpu);
1538 n_online_successes++;
1539 delta = jiffies - starttime;
1540 sum_online += delta;
1541 if (min_online < 0) {
1542 min_online = delta;
1543 max_online = delta;
1544 }
1545 if (min_online > delta)
1546 min_online = delta;
1547 if (max_online < delta)
1548 max_online = delta;
1549 }
1550 }
1551 schedule_timeout_interruptible(onoff_interval * HZ);
1552 }
1553 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1554 return 0;
1555}
1556
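The duration bookkeeping in the removed rcu_torture_onoff() seeds the minimum negative so that the first sample initializes both minimum and maximum before the usual comparisons run. Compact userspace stand-in for that logic (names shortened, values made up):

#include <stdio.h>

static long min_off = -1, max_off, sum_off, n_off;

static void record(long delta)
{
        n_off++;
        sum_off += delta;
        if (min_off < 0)
                min_off = max_off = delta;      /* first sample */
        if (delta < min_off)
                min_off = delta;
        if (delta > max_off)
                max_off = delta;
}

int main(void)
{
        record(30);
        record(12);
        record(55);
        printf("min %ld max %ld avg %ld\n", min_off, max_off, sum_off / n_off);
        return 0;
}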
1557static int __cpuinit
1558rcu_torture_onoff_init(void)
1559{
1560 int ret;
1561
1562 if (onoff_interval <= 0)
1563 return 0;
1564 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1565 if (IS_ERR(onoff_task)) {
1566 ret = PTR_ERR(onoff_task);
1567 onoff_task = NULL;
1568 return ret;
1569 }
1570 return 0;
1571}
1572
1573static void rcu_torture_onoff_cleanup(void)
1574{
1575 if (onoff_task == NULL)
1576 return;
1577 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1578 kthread_stop(onoff_task);
1579 onoff_task = NULL;
1580}
1581
1582#else /* #ifdef CONFIG_HOTPLUG_CPU */
1583
1584static int
1585rcu_torture_onoff_init(void)
1586{
1587 return 0;
1588}
1589
1590static void rcu_torture_onoff_cleanup(void)
1591{
1592}
1593
1594#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1595
1596/*
1597 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1598 * induces a CPU stall for the time specified by stall_cpu.
1599 */
1600static int __cpuinit rcu_torture_stall(void *args)
1601{
1602 unsigned long stop_at;
1603
1604 VERBOSE_PRINTK_STRING("rcu_torture_stall task started");
1605 if (stall_cpu_holdoff > 0) {
1606 VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff");
1607 schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
1608 VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff");
1609 }
1610 if (!kthread_should_stop()) {
1611 stop_at = get_seconds() + stall_cpu;
1612 /* RCU CPU stall is expected behavior in following code. */
1613 pr_alert("rcu_torture_stall start.\n");
1614 rcu_read_lock();
1615 preempt_disable();
1616 while (ULONG_CMP_LT(get_seconds(), stop_at))
1617 continue; /* Induce RCU CPU stall warning. */
1618 preempt_enable();
1619 rcu_read_unlock();
1620 pr_alert("rcu_torture_stall end.\n");
1621 }
1622 rcutorture_shutdown_absorb("rcu_torture_stall");
1623 while (!kthread_should_stop())
1624 schedule_timeout_interruptible(10 * HZ);
1625 return 0;
1626}
1627
1628/* Spawn CPU-stall kthread, if stall_cpu specified. */
1629static int __init rcu_torture_stall_init(void)
1630{
1631 int ret;
1632
1633 if (stall_cpu <= 0)
1634 return 0;
1635 stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall");
1636 if (IS_ERR(stall_task)) {
1637 ret = PTR_ERR(stall_task);
1638 stall_task = NULL;
1639 return ret;
1640 }
1641 return 0;
1642}
1643
1644/* Clean up after the CPU-stall kthread, if one was spawned. */
1645static void rcu_torture_stall_cleanup(void)
1646{
1647 if (stall_task == NULL)
1648 return;
1649 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1650 kthread_stop(stall_task);
1651 stall_task = NULL;
1652}
1653
1654/* Callback function for RCU barrier testing. */
1655void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1656{
1657 atomic_inc(&barrier_cbs_invoked);
1658}
1659
1660/* kthread function to register callbacks used to test RCU barriers. */
1661static int rcu_torture_barrier_cbs(void *arg)
1662{
1663 long myid = (long)arg;
1664 bool lastphase = 0;
1665 struct rcu_head rcu;
1666
1667 init_rcu_head_on_stack(&rcu);
1668 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
1669 set_user_nice(current, 19);
1670 do {
1671 wait_event(barrier_cbs_wq[myid],
1672 barrier_phase != lastphase ||
1673 kthread_should_stop() ||
1674 fullstop != FULLSTOP_DONTSTOP);
1675 lastphase = barrier_phase;
1676 smp_mb(); /* ensure barrier_phase load before ->call(). */
1677 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1678 break;
1679 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
1680 if (atomic_dec_and_test(&barrier_cbs_count))
1681 wake_up(&barrier_wq);
1682 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1683 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
1684 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1685 while (!kthread_should_stop())
1686 schedule_timeout_interruptible(1);
1687 cur_ops->cb_barrier();
1688 destroy_rcu_head_on_stack(&rcu);
1689 return 0;
1690}
1691
1692/* kthread function to drive and coordinate RCU barrier testing. */
1693static int rcu_torture_barrier(void *arg)
1694{
1695 int i;
1696
1697 VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
1698 do {
1699 atomic_set(&barrier_cbs_invoked, 0);
1700 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1701 smp_mb(); /* Ensure barrier_phase after prior assignments. */
1702 barrier_phase = !barrier_phase;
1703 for (i = 0; i < n_barrier_cbs; i++)
1704 wake_up(&barrier_cbs_wq[i]);
1705 wait_event(barrier_wq,
1706 atomic_read(&barrier_cbs_count) == 0 ||
1707 kthread_should_stop() ||
1708 fullstop != FULLSTOP_DONTSTOP);
1709 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1710 break;
1711 n_barrier_attempts++;
1712 cur_ops->cb_barrier();
1713 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1714 n_rcu_torture_barrier_error++;
1715 WARN_ON_ONCE(1);
1716 }
1717 n_barrier_successes++;
1718 schedule_timeout_interruptible(HZ / 10);
1719 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1720 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1721 rcutorture_shutdown_absorb("rcu_torture_barrier");
1722 while (!kthread_should_stop())
1723 schedule_timeout_interruptible(1);
1724 return 0;
1725}
1726
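The removed barrier test posts exactly one callback from each rcu_torture_barrier_cbs kthread, calls cur_ops->cb_barrier() (rcu_barrier() for the rcu flavor), and then checks that every posted callback has already been invoked. A toy single-threaded userspace analogue of that invariant (toy_call_rcu/toy_rcu_barrier are invented names; the real rcu_barrier() waits for callbacks on all CPUs):

#include <assert.h>
#include <stdio.h>

#define MAX_CBS 16

typedef void (*cb_t)(void);
static cb_t queue[MAX_CBS];
static int queued, invoked;

static void toy_call_rcu(cb_t cb)
{
        queue[queued++] = cb;
}

static void toy_rcu_barrier(void)
{
        for (int i = 0; i < queued; i++)        /* drain the single queue */
                queue[i]();
}

static void counting_cb(void)
{
        invoked++;
}

int main(void)
{
        int n_barrier_cbs = 4;

        for (int i = 0; i < n_barrier_cbs; i++)
                toy_call_rcu(counting_cb);
        toy_rcu_barrier();
        assert(invoked == n_barrier_cbs);       /* the invariant being tested */
        printf("invoked %d of %d callbacks\n", invoked, n_barrier_cbs);
        return 0;
}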
1727/* Initialize RCU barrier testing. */
1728static int rcu_torture_barrier_init(void)
1729{
1730 int i;
1731 int ret;
1732
1733 if (n_barrier_cbs == 0)
1734 return 0;
1735 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1736 pr_alert("%s" TORTURE_FLAG
1737 " Call or barrier ops missing for %s,\n",
1738 torture_type, cur_ops->name);
1739 pr_alert("%s" TORTURE_FLAG
1740 " RCU barrier testing omitted from run.\n",
1741 torture_type);
1742 return 0;
1743 }
1744 atomic_set(&barrier_cbs_count, 0);
1745 atomic_set(&barrier_cbs_invoked, 0);
1746 barrier_cbs_tasks =
1747 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
1748 GFP_KERNEL);
1749 barrier_cbs_wq =
1750 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1751 GFP_KERNEL);
1752 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
1753 return -ENOMEM;
1754 for (i = 0; i < n_barrier_cbs; i++) {
1755 init_waitqueue_head(&barrier_cbs_wq[i]);
1756 barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
1757 (void *)(long)i,
1758 "rcu_torture_barrier_cbs");
1759 if (IS_ERR(barrier_cbs_tasks[i])) {
1760 ret = PTR_ERR(barrier_cbs_tasks[i]);
1761 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
1762 barrier_cbs_tasks[i] = NULL;
1763 return ret;
1764 }
1765 }
1766 barrier_task = kthread_run(rcu_torture_barrier, NULL,
1767 "rcu_torture_barrier");
1768 if (IS_ERR(barrier_task)) {
1769 ret = PTR_ERR(barrier_task);
1770 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
1771 barrier_task = NULL;
1772 }
1773 return 0;
1774}
1775
1776/* Clean up after RCU barrier testing. */
1777static void rcu_torture_barrier_cleanup(void)
1778{
1779 int i;
1780
1781 if (barrier_task != NULL) {
1782 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
1783 kthread_stop(barrier_task);
1784 barrier_task = NULL;
1785 }
1786 if (barrier_cbs_tasks != NULL) {
1787 for (i = 0; i < n_barrier_cbs; i++) {
1788 if (barrier_cbs_tasks[i] != NULL) {
1789 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
1790 kthread_stop(barrier_cbs_tasks[i]);
1791 barrier_cbs_tasks[i] = NULL;
1792 }
1793 }
1794 kfree(barrier_cbs_tasks);
1795 barrier_cbs_tasks = NULL;
1796 }
1797 if (barrier_cbs_wq != NULL) {
1798 kfree(barrier_cbs_wq);
1799 barrier_cbs_wq = NULL;
1800 }
1801}
1802
1803static int rcutorture_cpu_notify(struct notifier_block *self, 1299static int rcutorture_cpu_notify(struct notifier_block *self,
1804 unsigned long action, void *hcpu) 1300 unsigned long action, void *hcpu)
1805{ 1301{
@@ -1831,7 +1327,7 @@ rcu_torture_cleanup(void)
1831 mutex_lock(&fullstop_mutex); 1327 mutex_lock(&fullstop_mutex);
1832 rcutorture_record_test_transition(); 1328 rcutorture_record_test_transition();
1833 if (fullstop == FULLSTOP_SHUTDOWN) { 1329 if (fullstop == FULLSTOP_SHUTDOWN) {
1834 pr_warn(/* but going down anyway, so... */ 1330 printk(KERN_WARNING /* but going down anyway, so... */
1835 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1331 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
1836 mutex_unlock(&fullstop_mutex); 1332 mutex_unlock(&fullstop_mutex);
1837 schedule_timeout_uninterruptible(10); 1333 schedule_timeout_uninterruptible(10);
@@ -1842,8 +1338,6 @@ rcu_torture_cleanup(void)
1842 fullstop = FULLSTOP_RMMOD; 1338 fullstop = FULLSTOP_RMMOD;
1843 mutex_unlock(&fullstop_mutex); 1339 mutex_unlock(&fullstop_mutex);
1844 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1340 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1845 rcu_torture_barrier_cleanup();
1846 rcu_torture_stall_cleanup();
1847 if (stutter_task) { 1341 if (stutter_task) {
1848 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1342 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1849 kthread_stop(stutter_task); 1343 kthread_stop(stutter_task);
@@ -1906,12 +1400,6 @@ rcu_torture_cleanup(void)
1906 for_each_possible_cpu(i) 1400 for_each_possible_cpu(i)
1907 rcutorture_booster_cleanup(i); 1401 rcutorture_booster_cleanup(i);
1908 } 1402 }
1909 if (shutdown_task != NULL) {
1910 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1911 kthread_stop(shutdown_task);
1912 }
1913 shutdown_task = NULL;
1914 rcu_torture_onoff_cleanup();
1915 1403
1916 /* Wait for all RCU callbacks to fire. */ 1404 /* Wait for all RCU callbacks to fire. */
1917 1405
@@ -1920,12 +1408,10 @@ rcu_torture_cleanup(void)
1920 1408
1921 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 1409 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
1922 1410
1923 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1411 if (cur_ops->cleanup)
1412 cur_ops->cleanup();
1413 if (atomic_read(&n_rcu_torture_error))
1924 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1414 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1925 else if (n_online_successes != n_online_attempts ||
1926 n_offline_successes != n_offline_attempts)
1927 rcu_torture_print_module_parms(cur_ops,
1928 "End of test: RCU_HOTPLUG");
1929 else 1415 else
1930 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1416 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1931} 1417}
@@ -1936,12 +1422,10 @@ rcu_torture_init(void)
1936 int i; 1422 int i;
1937 int cpu; 1423 int cpu;
1938 int firsterr = 0; 1424 int firsterr = 0;
1939 int retval;
1940 static struct rcu_torture_ops *torture_ops[] = 1425 static struct rcu_torture_ops *torture_ops[] =
1941 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1426 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1942 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1427 &rcu_bh_ops, &rcu_bh_sync_ops,
1943 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, 1428 &srcu_ops, &srcu_expedited_ops,
1944 &srcu_raw_ops, &srcu_raw_sync_ops,
1945 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1429 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1946 1430
1947 mutex_lock(&fullstop_mutex); 1431 mutex_lock(&fullstop_mutex);
@@ -1953,17 +1437,18 @@ rcu_torture_init(void)
1953 break; 1437 break;
1954 } 1438 }
1955 if (i == ARRAY_SIZE(torture_ops)) { 1439 if (i == ARRAY_SIZE(torture_ops)) {
1956 pr_alert("rcu-torture: invalid torture type: \"%s\"\n", 1440 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1957 torture_type); 1441 torture_type);
1958 pr_alert("rcu-torture types:"); 1442 printk(KERN_ALERT "rcu-torture types:");
1959 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1443 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1960 pr_alert(" %s", torture_ops[i]->name); 1444 printk(KERN_ALERT " %s", torture_ops[i]->name);
1961 pr_alert("\n"); 1445 printk(KERN_ALERT "\n");
1962 mutex_unlock(&fullstop_mutex); 1446 mutex_unlock(&fullstop_mutex);
1963 return -EINVAL; 1447 return -EINVAL;
1964 } 1448 }
1965 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1449 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1966 pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); 1450 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1451 "fqs_duration, fqs disabled.\n");
1967 fqs_duration = 0; 1452 fqs_duration = 0;
1968 } 1453 }
1969 if (cur_ops->init) 1454 if (cur_ops->init)
@@ -1994,7 +1479,6 @@ rcu_torture_init(void)
1994 atomic_set(&n_rcu_torture_free, 0); 1479 atomic_set(&n_rcu_torture_free, 0);
1995 atomic_set(&n_rcu_torture_mberror, 0); 1480 atomic_set(&n_rcu_torture_mberror, 0);
1996 atomic_set(&n_rcu_torture_error, 0); 1481 atomic_set(&n_rcu_torture_error, 0);
1997 n_rcu_torture_barrier_error = 0;
1998 n_rcu_torture_boost_ktrerror = 0; 1482 n_rcu_torture_boost_ktrerror = 0;
1999 n_rcu_torture_boost_rterror = 0; 1483 n_rcu_torture_boost_rterror = 0;
2000 n_rcu_torture_boost_failure = 0; 1484 n_rcu_torture_boost_failure = 0;
@@ -2011,15 +1495,14 @@ rcu_torture_init(void)
2011 /* Start up the kthreads. */ 1495 /* Start up the kthreads. */
2012 1496
2013 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); 1497 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
2014 writer_task = kthread_create(rcu_torture_writer, NULL, 1498 writer_task = kthread_run(rcu_torture_writer, NULL,
2015 "rcu_torture_writer"); 1499 "rcu_torture_writer");
2016 if (IS_ERR(writer_task)) { 1500 if (IS_ERR(writer_task)) {
2017 firsterr = PTR_ERR(writer_task); 1501 firsterr = PTR_ERR(writer_task);
2018 VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); 1502 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
2019 writer_task = NULL; 1503 writer_task = NULL;
2020 goto unwind; 1504 goto unwind;
2021 } 1505 }
2022 wake_up_process(writer_task);
2023 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1506 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
2024 GFP_KERNEL); 1507 GFP_KERNEL);
2025 if (fakewriter_tasks == NULL) { 1508 if (fakewriter_tasks == NULL) {
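The left-hand column creates the writer task and then wakes it explicitly; the right-hand column uses kthread_run(), which the kernel defines as kthread_create() plus wake_up_process(), roughly:

#define kthread_run(threadfn, data, namefmt, ...)                           \
({                                                                          \
        struct task_struct *__k                                             \
                = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__);  \
        if (!IS_ERR(__k))                                                   \
                wake_up_process(__k);                                       \
        __k;                                                                \
})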
@@ -2119,6 +1602,7 @@ rcu_torture_init(void)
2119 test_boost_duration = 2; 1602 test_boost_duration = 2;
2120 if ((test_boost == 1 && cur_ops->can_boost) || 1603 if ((test_boost == 1 && cur_ops->can_boost) ||
2121 test_boost == 2) { 1604 test_boost == 2) {
1605 int retval;
2122 1606
2123 boost_starttime = jiffies + test_boost_interval * HZ; 1607 boost_starttime = jiffies + test_boost_interval * HZ;
2124 register_cpu_notifier(&rcutorture_cpu_nb); 1608 register_cpu_notifier(&rcutorture_cpu_nb);
@@ -2132,34 +1616,7 @@ rcu_torture_init(void)
2132 } 1616 }
2133 } 1617 }
2134 } 1618 }
2135 if (shutdown_secs > 0) {
2136 shutdown_time = jiffies + shutdown_secs * HZ;
2137 shutdown_task = kthread_create(rcu_torture_shutdown, NULL,
2138 "rcu_torture_shutdown");
2139 if (IS_ERR(shutdown_task)) {
2140 firsterr = PTR_ERR(shutdown_task);
2141 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
2142 shutdown_task = NULL;
2143 goto unwind;
2144 }
2145 wake_up_process(shutdown_task);
2146 }
2147 i = rcu_torture_onoff_init();
2148 if (i != 0) {
2149 firsterr = i;
2150 goto unwind;
2151 }
2152 register_reboot_notifier(&rcutorture_shutdown_nb); 1619 register_reboot_notifier(&rcutorture_shutdown_nb);
2153 i = rcu_torture_stall_init();
2154 if (i != 0) {
2155 firsterr = i;
2156 goto unwind;
2157 }
2158 retval = rcu_torture_barrier_init();
2159 if (retval != 0) {
2160 firsterr = retval;
2161 goto unwind;
2162 }
2163 rcutorture_record_test_transition(); 1620 rcutorture_record_test_transition();
2164 mutex_unlock(&fullstop_mutex); 1621 mutex_unlock(&fullstop_mutex);
2165 return 0; 1622 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e441b77b614..ba06207b1dd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -38,7 +38,7 @@
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <linux/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/export.h> 41#include <linux/module.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/percpu.h> 44#include <linux/percpu.h>
@@ -50,56 +50,39 @@
50#include <linux/wait.h> 50#include <linux/wait.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/delay.h>
54#include <linux/stop_machine.h>
55#include <linux/random.h>
56 53
57#include "rcutree.h" 54#include "rcutree.h"
58#include <trace/events/rcu.h>
59
60#include "rcu.h"
61 55
62/* Data structures. */ 56/* Data structures. */
63 57
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 58static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 59
66 60#define RCU_STATE_INITIALIZER(structname) { \
67#define RCU_STATE_INITIALIZER(sname, cr) { \ 61 .level = { &structname.node[0] }, \
68 .level = { &sname##_state.node[0] }, \ 62 .levelcnt = { \
69 .call = cr, \ 63 NUM_RCU_LVL_0, /* root of hierarchy. */ \
70 .fqs_state = RCU_GP_IDLE, \ 64 NUM_RCU_LVL_1, \
71 .gpnum = 0UL - 300UL, \ 65 NUM_RCU_LVL_2, \
72 .completed = 0UL - 300UL, \ 66 NUM_RCU_LVL_3, \
73 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ 67 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 68 }, \
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 69 .signaled = RCU_GP_IDLE, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 70 .gpnum = -300, \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 71 .completed = -300, \
78 .name = #sname, \ 72 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
79} 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
80 74 .n_force_qs = 0, \
81struct rcu_state rcu_sched_state = 75 .n_force_qs_ngp = 0, \
82 RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); 76 .name = #structname, \
77}
78
79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 80DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
84 81
85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
87 84
88static struct rcu_state *rcu_state; 85static struct rcu_state *rcu_state;
89LIST_HEAD(rcu_struct_flavors);
90
91/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
92static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
93module_param(rcu_fanout_leaf, int, 0444);
94int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
95static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
96 NUM_RCU_LVL_0,
97 NUM_RCU_LVL_1,
98 NUM_RCU_LVL_2,
99 NUM_RCU_LVL_3,
100 NUM_RCU_LVL_4,
101};
102int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
103 86
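The left-hand RCU_STATE_INITIALIZER(sname, cr) builds the per-flavor symbol by token pasting (sname##_state) and stringizes the flavor name with #sname, while the right-hand variant is simply handed the full structure name. A toy sketch of the pasting/stringizing pattern (struct and macro names invented):

#include <stdio.h>

struct state {
        const char *name;
        int level;
};

#define DEFINE_STATE(sname, lvl) \
        struct state sname##_state = { .name = #sname, .level = (lvl) }

DEFINE_STATE(rcu_sched, 0);     /* defines rcu_sched_state, .name = "rcu_sched" */
DEFINE_STATE(rcu_bh, 0);        /* defines rcu_bh_state,    .name = "rcu_bh"    */

int main(void)
{
        printf("%s %s\n", rcu_sched_state.name, rcu_bh_state.name);
        return 0;
}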
104/* 87/*
105 * The rcu_scheduler_active variable transitions from zero to one just 88 * The rcu_scheduler_active variable transitions from zero to one just
@@ -135,15 +118,18 @@ static int rcu_scheduler_fully_active __read_mostly;
135 */ 118 */
136static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); 119static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
137DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 120DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
121DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
138DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 122DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
139DEFINE_PER_CPU(char, rcu_cpu_has_work); 123DEFINE_PER_CPU(char, rcu_cpu_has_work);
140 124
141#endif /* #ifdef CONFIG_RCU_BOOST */ 125#endif /* #ifdef CONFIG_RCU_BOOST */
142 126
143static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 127static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
144static void invoke_rcu_core(void); 128static void invoke_rcu_core(void);
145static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
146 130
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
147/* 133/*
148 * Track the rcutorture test sequence number and the update version 134 * Track the rcutorture test sequence number and the update version
149 * number within a given test. The rcutorture_testseq is incremented 135 * number within a given test. The rcutorture_testseq is incremented
@@ -170,67 +156,55 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
170 * Note a quiescent state. Because we do not need to know 156 * Note a quiescent state. Because we do not need to know
171 * how many quiescent states passed, just if there was at least 157 * how many quiescent states passed, just if there was at least
172 * one since the start of the grace period, this just sets a flag. 158 * one since the start of the grace period, this just sets a flag.
173 * The caller must have disabled preemption.
174 */ 159 */
175void rcu_sched_qs(int cpu) 160void rcu_sched_qs(int cpu)
176{ 161{
177 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 162 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
178 163
179 if (rdp->passed_quiesce == 0) 164 rdp->passed_quiesc_completed = rdp->gpnum - 1;
180 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); 165 barrier();
181 rdp->passed_quiesce = 1; 166 rdp->passed_quiesc = 1;
182} 167}
183 168
184void rcu_bh_qs(int cpu) 169void rcu_bh_qs(int cpu)
185{ 170{
186 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 171 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
187 172
188 if (rdp->passed_quiesce == 0) 173 rdp->passed_quiesc_completed = rdp->gpnum - 1;
189 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); 174 barrier();
190 rdp->passed_quiesce = 1; 175 rdp->passed_quiesc = 1;
191} 176}
192 177
193/* 178/*
194 * Note a context switch. This is a quiescent state for RCU-sched, 179 * Note a context switch. This is a quiescent state for RCU-sched,
195 * and requires special handling for preemptible RCU. 180 * and requires special handling for preemptible RCU.
196 * The caller must have disabled preemption.
197 */ 181 */
198void rcu_note_context_switch(int cpu) 182void rcu_note_context_switch(int cpu)
199{ 183{
200 trace_rcu_utilization("Start context switch");
201 rcu_sched_qs(cpu); 184 rcu_sched_qs(cpu);
202 rcu_preempt_note_context_switch(cpu); 185 rcu_preempt_note_context_switch(cpu);
203 trace_rcu_utilization("End context switch");
204} 186}
205EXPORT_SYMBOL_GPL(rcu_note_context_switch); 187EXPORT_SYMBOL_GPL(rcu_note_context_switch);
206 188
189#ifdef CONFIG_NO_HZ
207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 190DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 191 .dynticks_nesting = 1,
209 .dynticks = ATOMIC_INIT(1), 192 .dynticks = ATOMIC_INIT(1),
210}; 193};
194#endif /* #ifdef CONFIG_NO_HZ */
211 195
212static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 196static int blimit = 10; /* Maximum callbacks per softirq. */
213static long qhimark = 10000; /* If this many pending, ignore blimit. */ 197static int qhimark = 10000; /* If this many pending, ignore blimit. */
214static long qlowmark = 100; /* Once only this many pending, use blimit. */ 198static int qlowmark = 100; /* Once only this many pending, use blimit. */
215
216module_param(blimit, long, 0444);
217module_param(qhimark, long, 0444);
218module_param(qlowmark, long, 0444);
219 199
220int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 200module_param(blimit, int, 0);
221int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 201module_param(qhimark, int, 0);
202module_param(qlowmark, int, 0);
222 203
204int rcu_cpu_stall_suppress __read_mostly;
223module_param(rcu_cpu_stall_suppress, int, 0644); 205module_param(rcu_cpu_stall_suppress, int, 0644);
224module_param(rcu_cpu_stall_timeout, int, 0644);
225 206
226static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; 207static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
227static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
228
229module_param(jiffies_till_first_fqs, ulong, 0644);
230module_param(jiffies_till_next_fqs, ulong, 0644);
231
232static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
233static void force_quiescent_state(struct rcu_state *rsp);
234static int rcu_pending(int cpu); 208static int rcu_pending(int cpu);
235 209
236/* 210/*
@@ -256,7 +230,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
256 */ 230 */
257void rcu_bh_force_quiescent_state(void) 231void rcu_bh_force_quiescent_state(void)
258{ 232{
259 force_quiescent_state(&rcu_bh_state); 233 force_quiescent_state(&rcu_bh_state, 0);
260} 234}
261EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 235EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
262 236
@@ -290,7 +264,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress);
290 */ 264 */
291void rcu_sched_force_quiescent_state(void) 265void rcu_sched_force_quiescent_state(void)
292{ 266{
293 force_quiescent_state(&rcu_sched_state); 267 force_quiescent_state(&rcu_sched_state, 0);
294} 268}
295EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); 269EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
296 270
@@ -300,8 +274,7 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
300static int 274static int
301cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) 275cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
302{ 276{
303 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && 277 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
304 rdp->nxttail[RCU_DONE_TAIL] != NULL;
305} 278}
306 279
307/* 280/*
@@ -310,12 +283,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
310static int 283static int
311cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 284cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
312{ 285{
313 struct rcu_head **ntp; 286 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
314
315 ntp = rdp->nxttail[RCU_DONE_TAIL +
316 (ACCESS_ONCE(rsp->completed) != rdp->completed)];
317 return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
318 !rcu_gp_in_progress(rsp);
319} 287}
320 288
321/* 289/*
@@ -326,294 +294,103 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
326 return &rsp->node[0]; 294 return &rsp->node[0];
327} 295}
328 296
329/* 297#ifdef CONFIG_SMP
330 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
331 *
332 * If the new value of the ->dynticks_nesting counter now is zero,
333 * we really have entered idle, and must do the appropriate accounting.
334 * The caller must have disabled interrupts.
335 */
336static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
337 bool user)
338{
339 trace_rcu_dyntick("Start", oldval, 0);
340 if (!user && !is_idle_task(current)) {
341 struct task_struct *idle = idle_task(smp_processor_id());
342
343 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
344 ftrace_dump(DUMP_ORIG);
345 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
346 current->pid, current->comm,
347 idle->pid, idle->comm); /* must be idle task! */
348 }
349 rcu_prepare_for_idle(smp_processor_id());
350 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
351 smp_mb__before_atomic_inc(); /* See above. */
352 atomic_inc(&rdtp->dynticks);
353 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
354 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
355
356 /*
357 * It is illegal to enter an extended quiescent state while
358 * in an RCU read-side critical section.
359 */
360 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
361 "Illegal idle entry in RCU read-side critical section.");
362 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
363 "Illegal idle entry in RCU-bh read-side critical section.");
364 rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
365 "Illegal idle entry in RCU-sched read-side critical section.");
366}
367 298
368/* 299/*
369 * Enter an RCU extended quiescent state, which can be either the 300 * If the specified CPU is offline, tell the caller that it is in
370 * idle loop or adaptive-tickless usermode execution. 301 * a quiescent state. Otherwise, whack it with a reschedule IPI.
302 * Grace periods can end up waiting on an offline CPU when that
303 * CPU is in the process of coming online -- it will be added to the
304 * rcu_node bitmasks before it actually makes it online. The same thing
305 * can happen while a CPU is in the process of coming online. Because this
306 * race is quite rare, we check for it after detecting that the grace
307 * period has been delayed rather than checking each and every CPU
308 * each and every time we start a new grace period.
371 */ 309 */
372static void rcu_eqs_enter(bool user) 310static int rcu_implicit_offline_qs(struct rcu_data *rdp)
373{ 311{
374 long long oldval; 312 /*
375 struct rcu_dynticks *rdtp; 313 * If the CPU is offline, it is in a quiescent state. We can
314 * trust its state not to change because interrupts are disabled.
315 */
316 if (cpu_is_offline(rdp->cpu)) {
317 rdp->offline_fqs++;
318 return 1;
319 }
376 320
377 rdtp = &__get_cpu_var(rcu_dynticks); 321 /* If preemptible RCU, no point in sending reschedule IPI. */
378 oldval = rdtp->dynticks_nesting; 322 if (rdp->preemptible)
379 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 323 return 0;
380 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 324
381 rdtp->dynticks_nesting = 0; 325 /* The CPU is online, so send it a reschedule IPI. */
326 if (rdp->cpu != smp_processor_id())
327 smp_send_reschedule(rdp->cpu);
382 else 328 else
383 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 329 set_need_resched();
384 rcu_eqs_enter_common(rdtp, oldval, user); 330 rdp->resched_ipi++;
331 return 0;
385} 332}
386 333
387/** 334#endif /* #ifdef CONFIG_SMP */
388 * rcu_idle_enter - inform RCU that current CPU is entering idle
389 *
390 * Enter idle mode, in other words, -leave- the mode in which RCU
391 * read-side critical sections can occur. (Though RCU read-side
392 * critical sections can occur in irq handlers in idle, a possibility
393 * handled by irq_enter() and irq_exit().)
394 *
395 * We crowbar the ->dynticks_nesting field to zero to allow for
396 * the possibility of usermode upcalls having messed up our count
397 * of interrupt nesting level during the prior busy period.
398 */
399void rcu_idle_enter(void)
400{
401 unsigned long flags;
402 335
403 local_irq_save(flags); 336#ifdef CONFIG_NO_HZ
404 rcu_eqs_enter(false);
405 local_irq_restore(flags);
406}
407EXPORT_SYMBOL_GPL(rcu_idle_enter);
408
409#ifdef CONFIG_RCU_USER_QS
410/**
411 * rcu_user_enter - inform RCU that we are resuming userspace.
412 *
413 * Enter RCU idle mode right before resuming userspace. No use of RCU
414 * is permitted between this call and rcu_user_exit(). This way the
415 * CPU doesn't need to maintain the tick for RCU maintenance purposes
416 * when the CPU runs in userspace.
417 */
418void rcu_user_enter(void)
419{
420 rcu_eqs_enter(1);
421}
422 337
423/** 338/**
424 * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace 339 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
425 * after the current irq returns.
426 * 340 *
427 * This is similar to rcu_user_enter() but in the context of a non-nesting 341 * Enter nohz mode, in other words, -leave- the mode in which RCU
428 * irq. After this call, RCU enters into idle mode when the interrupt 342 * read-side critical sections can occur. (Though RCU read-side
429 * returns. 343 * critical sections can occur in irq handlers in nohz mode, a possibility
344 * handled by rcu_irq_enter() and rcu_irq_exit()).
430 */ 345 */
431void rcu_user_enter_after_irq(void) 346void rcu_enter_nohz(void)
432{ 347{
433 unsigned long flags; 348 unsigned long flags;
434 struct rcu_dynticks *rdtp; 349 struct rcu_dynticks *rdtp;
435 350
436 local_irq_save(flags); 351 local_irq_save(flags);
437 rdtp = &__get_cpu_var(rcu_dynticks); 352 rdtp = &__get_cpu_var(rcu_dynticks);
438 /* Ensure this irq is interrupting a non-idle RCU state. */ 353 if (--rdtp->dynticks_nesting) {
439 WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); 354 local_irq_restore(flags);
440 rdtp->dynticks_nesting = 1; 355 return;
356 }
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
441 local_irq_restore(flags); 362 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
442} 370}
443#endif /* CONFIG_RCU_USER_QS */
444 371
445/** 372/*
446 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 373 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
447 *
448 * Exit from an interrupt handler, which might possibly result in entering
449 * idle mode, in other words, leaving the mode in which read-side critical
450 * sections can occur.
451 *
452 * This code assumes that the idle loop never does anything that might
453 * result in unbalanced calls to irq_enter() and irq_exit(). If your
454 * architecture violates this assumption, RCU will give you what you
455 * deserve, good and hard. But very infrequently and irreproducibly.
456 *
457 * Use things like work queues to work around this limitation.
458 * 374 *
459 * You have been warned. 375 * Exit nohz mode, in other words, -enter- the mode in which RCU
376 * read-side critical sections normally occur.
460 */ 377 */
461void rcu_irq_exit(void) 378void rcu_exit_nohz(void)
462{ 379{
463 unsigned long flags; 380 unsigned long flags;
464 long long oldval;
465 struct rcu_dynticks *rdtp; 381 struct rcu_dynticks *rdtp;
466 382
467 local_irq_save(flags); 383 local_irq_save(flags);
468 rdtp = &__get_cpu_var(rcu_dynticks); 384 rdtp = &__get_cpu_var(rcu_dynticks);
469 oldval = rdtp->dynticks_nesting; 385 if (rdtp->dynticks_nesting++) {
470 rdtp->dynticks_nesting--; 386 local_irq_restore(flags);
471 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 387 return;
472 if (rdtp->dynticks_nesting) 388 }
473 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
474 else
475 rcu_eqs_enter_common(rdtp, oldval, true);
476 local_irq_restore(flags);
477}
478
479/*
480 * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
481 *
482 * If the new value of the ->dynticks_nesting counter was previously zero,
483 * we really have exited idle, and must do the appropriate accounting.
484 * The caller must have disabled interrupts.
485 */
486static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
487 int user)
488{
489 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 389 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
490 atomic_inc(&rdtp->dynticks); 390 atomic_inc(&rdtp->dynticks);
491 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
492 smp_mb__after_atomic_inc(); /* See above. */ 392 smp_mb__after_atomic_inc(); /* See above. */
493 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
494 rcu_cleanup_after_idle(smp_processor_id());
495 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
496 if (!user && !is_idle_task(current)) {
497 struct task_struct *idle = idle_task(smp_processor_id());
498
499 trace_rcu_dyntick("Error on exit: not idle task",
500 oldval, rdtp->dynticks_nesting);
501 ftrace_dump(DUMP_ORIG);
502 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
503 current->pid, current->comm,
504 idle->pid, idle->comm); /* must be idle task! */
505 }
506}
507
508/*
509 * Exit an RCU extended quiescent state, which can be either the
510 * idle loop or adaptive-tickless usermode execution.
511 */
512static void rcu_eqs_exit(bool user)
513{
514 struct rcu_dynticks *rdtp;
515 long long oldval;
516
517 rdtp = &__get_cpu_var(rcu_dynticks);
518 oldval = rdtp->dynticks_nesting;
519 WARN_ON_ONCE(oldval < 0);
520 if (oldval & DYNTICK_TASK_NEST_MASK)
521 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
522 else
523 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
524 rcu_eqs_exit_common(rdtp, oldval, user);
525}
526
527/**
528 * rcu_idle_exit - inform RCU that current CPU is leaving idle
529 *
530 * Exit idle mode, in other words, -enter- the mode in which RCU
531 * read-side critical sections can occur.
532 *
533 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
534 * allow for the possibility of usermode upcalls messing up our count
535 * of interrupt nesting level during the busy period that is just
536 * now starting.
537 */
538void rcu_idle_exit(void)
539{
540 unsigned long flags;
541
542 local_irq_save(flags);
543 rcu_eqs_exit(false);
544 local_irq_restore(flags);
545}
546EXPORT_SYMBOL_GPL(rcu_idle_exit);
547
548#ifdef CONFIG_RCU_USER_QS
549/**
550 * rcu_user_exit - inform RCU that we are exiting userspace.
551 *
552 * Exit RCU idle mode while entering the kernel because it can
553 * run a RCU read side critical section anytime.
554 */
555void rcu_user_exit(void)
556{
557 rcu_eqs_exit(1);
558}
559
560/**
561 * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
562 * idle mode after the current non-nesting irq returns.
563 *
564 * This is similar to rcu_user_exit() but in the context of an irq.
565 * This is called when the irq has interrupted a userspace RCU idle mode
566 * context. When the current non-nesting interrupt returns after this call,
567 * the CPU won't restore the RCU idle mode.
568 */
569void rcu_user_exit_after_irq(void)
570{
571 unsigned long flags;
572 struct rcu_dynticks *rdtp;
573
574 local_irq_save(flags);
575 rdtp = &__get_cpu_var(rcu_dynticks);
576 /* Ensure we are interrupting an RCU idle mode. */
577 WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
578 rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
579 local_irq_restore(flags);
580}
581#endif /* CONFIG_RCU_USER_QS */
582
583/**
584 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
585 *
586 * Enter an interrupt handler, which might possibly result in exiting
587 * idle mode, in other words, entering the mode in which read-side critical
588 * sections can occur.
589 *
590 * Note that the Linux kernel is fully capable of entering an interrupt
591 * handler that it never exits, for example when doing upcalls to
592 * user mode! This code assumes that the idle loop never does upcalls to
593 * user mode. If your architecture does do upcalls from the idle loop (or
594 * does anything else that results in unbalanced calls to the irq_enter()
595 * and irq_exit() functions), RCU will give you what you deserve, good
596 * and hard. But very infrequently and irreproducibly.
597 *
598 * Use things like work queues to work around this limitation.
599 *
600 * You have been warned.
601 */
602void rcu_irq_enter(void)
603{
604 unsigned long flags;
605 struct rcu_dynticks *rdtp;
606 long long oldval;
607
608 local_irq_save(flags);
609 rdtp = &__get_cpu_var(rcu_dynticks);
610 oldval = rdtp->dynticks_nesting;
611 rdtp->dynticks_nesting++;
612 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
613 if (oldval)
614 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
615 else
616 rcu_eqs_exit_common(rdtp, oldval, true);
617 local_irq_restore(flags); 394 local_irq_restore(flags);
618} 395}
619 396
@@ -661,77 +438,30 @@ void rcu_nmi_exit(void)
661} 438}
662 439
663/** 440/**
664 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 441 * rcu_irq_enter - inform RCU of entry to hard irq context
665 *
666 * If the current CPU is in its idle loop and is neither in an interrupt
667 * or NMI handler, return true.
668 */
669int rcu_is_cpu_idle(void)
670{
671 int ret;
672
673 preempt_disable();
674 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
675 preempt_enable();
676 return ret;
677}
678EXPORT_SYMBOL(rcu_is_cpu_idle);
679
680#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
681
682/*
683 * Is the current CPU online? Disable preemption to avoid false positives
684 * that could otherwise happen due to the current CPU number being sampled,
685 * this task being preempted, its old CPU being taken offline, resuming
686 * on some other CPU, then determining that its old CPU is now offline.
687 * It is OK to use RCU on an offline processor during initial boot, hence
688 * the check for rcu_scheduler_fully_active. Note also that it is OK
689 * for a CPU coming online to use RCU for one jiffy prior to marking itself
690 * online in the cpu_online_mask. Similarly, it is OK for a CPU going
691 * offline to continue to use RCU for one jiffy after marking itself
692 * offline in the cpu_online_mask. This leniency is necessary given the
693 * non-atomic nature of the online and offline processing, for example,
694 * the fact that a CPU enters the scheduler after completing the CPU_DYING
695 * notifiers.
696 *
697 * This is also why RCU internally marks CPUs online during the
698 * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
699 * 442 *
700 * Disable checking if in an NMI handler because we cannot safely report 443 * If the CPU was idle with dynamic ticks active, this updates the
701 * errors from NMI handlers anyway. 444 * rdtp->dynticks to let the RCU handling know that the CPU is active.
702 */ 445 */
703bool rcu_lockdep_current_cpu_online(void) 446void rcu_irq_enter(void)
704{ 447{
705 struct rcu_data *rdp; 448 rcu_exit_nohz();
706 struct rcu_node *rnp;
707 bool ret;
708
709 if (in_nmi())
710 return 1;
711 preempt_disable();
712 rdp = &__get_cpu_var(rcu_sched_data);
713 rnp = rdp->mynode;
714 ret = (rdp->grpmask & rnp->qsmaskinit) ||
715 !rcu_scheduler_fully_active;
716 preempt_enable();
717 return ret;
718} 449}
719EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
720
721#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
722 450
723/** 451/**
724 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle 452 * rcu_irq_exit - inform RCU of exit from hard irq context
725 * 453 *
726 * If the current CPU is idle or running at a first-level (not nested) 454 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
727 * interrupt from idle, return true. The caller must have at least 455 * to let the RCU handling be aware that the CPU is going back to idle
728 * disabled preemption. 456 * with no ticks.
729 */ 457 */
730int rcu_is_cpu_rrupt_from_idle(void) 458void rcu_irq_exit(void)
731{ 459{
732 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 460 rcu_enter_nohz();
733} 461}
734 462
463#ifdef CONFIG_SMP
464
735/* 465/*
736 * Snapshot the specified CPU's dynticks counter so that we can later 466 * Snapshot the specified CPU's dynticks counter so that we can later
737 * credit them with an implicit quiescent state. Return 1 if this CPU 467 * credit them with an implicit quiescent state. Return 1 if this CPU
@@ -740,22 +470,22 @@ int rcu_is_cpu_rrupt_from_idle(void)
740static int dyntick_save_progress_counter(struct rcu_data *rdp) 470static int dyntick_save_progress_counter(struct rcu_data *rdp)
741{ 471{
742 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 472 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
743 return (rdp->dynticks_snap & 0x1) == 0; 473 return 0;
744} 474}
745 475
746/* 476/*
747 * Return true if the specified CPU has passed through a quiescent 477 * Return true if the specified CPU has passed through a quiescent
748 * state by virtue of being in or having passed through a dynticks 478 * state by virtue of being in or having passed through a dynticks
749 * idle state since the last call to dyntick_save_progress_counter() 479 * idle state since the last call to dyntick_save_progress_counter()
750 * for this same CPU, or by virtue of having been offline. 480 * for this same CPU.
751 */ 481 */
752static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
753{ 483{
754 unsigned int curr; 484 unsigned long curr;
755 unsigned int snap; 485 unsigned long snap;
756 486
757 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); 487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
758 snap = (unsigned int)rdp->dynticks_snap; 488 snap = (unsigned long)rdp->dynticks_snap;
759 489
760 /* 490 /*
761 * If the CPU passed through or entered a dynticks idle phase with 491 * If the CPU passed through or entered a dynticks idle phase with
@@ -765,79 +495,41 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
765 * read-side critical section that started before the beginning 495 * read-side critical section that started before the beginning
766 * of the current RCU grace period. 496 * of the current RCU grace period.
767 */ 497 */
768 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { 498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
769 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
770 rdp->dynticks_fqs++; 499 rdp->dynticks_fqs++;
771 return 1; 500 return 1;
772 } 501 }
773 502
774 /* 503 /* Go check for the CPU being offline. */
775 * Check for the CPU being offline, but only if the grace period 504 return rcu_implicit_offline_qs(rdp);
776 * is old enough. We don't need to worry about the CPU changing
777 * state: If we see it offline even once, it has been through a
778 * quiescent state.
779 *
780 * The reason for insisting that the grace period be at least
781 * one jiffy old is that CPUs that are not quite online and that
782 * have just gone offline can still execute RCU read-side critical
783 * sections.
784 */
785 if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
786 return 0; /* Grace period is not old enough. */
787 barrier();
788 if (cpu_is_offline(rdp->cpu)) {
789 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
790 rdp->offline_fqs++;
791 return 1;
792 }
793 return 0;
794} 505}
795 506
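The test above reads: credit the remote CPU with a quiescent state if its dynticks counter is currently even (it is idle right now) or has advanced by at least two since dyntick_save_progress_counter() sampled it (it passed through idle in the meantime). A self-contained user-space sketch of just that comparison follows; the wrap-safe macro is defined locally as a stand-in for the kernel's UINT_CMP_GE()/ULONG_CMP_GE() helpers.

#include <assert.h>
#include <limits.h>
#include <stdbool.h>

/* Wrap-safe "a >= b" for unsigned counters; local to this sketch. */
#define CMP_GE(a, b)    (UINT_MAX / 2 >= (unsigned int)((a) - (b)))

static bool dynticks_qs(unsigned int curr, unsigned int snap)
{
        return (curr & 0x1) == 0 || CMP_GE(curr, snap + 2);
}

int main(void)
{
        assert(dynticks_qs(4, 4));              /* even: idle right now */
        assert(!dynticks_qs(5, 5));             /* odd, unchanged: no QS yet */
        assert(dynticks_qs(7, 5));              /* advanced by 2: went idle */
        assert(dynticks_qs(1, UINT_MAX));       /* still correct across wrap */
        return 0;
}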
796static int jiffies_till_stall_check(void) 507#endif /* #ifdef CONFIG_SMP */
797{
798 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
799 508
800 /* 509#else /* #ifdef CONFIG_NO_HZ */
801 * Limit check must be consistent with the Kconfig limits
802 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
803 */
804 if (till_stall_check < 3) {
805 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
806 till_stall_check = 3;
807 } else if (till_stall_check > 300) {
808 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
809 till_stall_check = 300;
810 }
811 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
812}
813 510
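jiffies_till_stall_check() above clamps the module-settable timeout into the 3..300 second window implied by the CONFIG_RCU_CPU_STALL_TIMEOUT limits and then converts it to jiffies. A user-space sketch of the same clamping (it omits writing the clamped value back to the module parameter), with HZ and the fuzz delta as assumed example constants rather than the kernel's values:

#include <stdio.h>

#define HZ                      1000    /* assumed tick rate for the example */
#define STALL_DELAY_DELTA       0       /* stands in for RCU_STALL_DELAY_DELTA */

static int rcu_cpu_stall_timeout = 60;  /* seconds, as a module parameter */

static int jiffies_till_stall_check(void)
{
        int till_stall_check = rcu_cpu_stall_timeout;

        if (till_stall_check < 3)
                till_stall_check = 3;
        else if (till_stall_check > 300)
                till_stall_check = 300;
        return till_stall_check * HZ + STALL_DELAY_DELTA;
}

int main(void)
{
        printf("first stall check after %d jiffies\n",
               jiffies_till_stall_check());
        return 0;
}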
814static void record_gp_stall_check_time(struct rcu_state *rsp) 511#ifdef CONFIG_SMP
512
513static int dyntick_save_progress_counter(struct rcu_data *rdp)
815{ 514{
816 rsp->gp_start = jiffies; 515 return 0;
817 rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
818} 516}
819 517
820/* 518static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
821 * Dump stacks of all tasks running on stalled CPUs. This is a fallback
822 * for architectures that do not implement trigger_all_cpu_backtrace().
823 * The NMI-triggered stack traces are more accurate because they are
824 * printed by the target CPU.
825 */
826static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
827{ 519{
828 int cpu; 520 return rcu_implicit_offline_qs(rdp);
829 unsigned long flags; 521}
830 struct rcu_node *rnp;
831 522
832 rcu_for_each_leaf_node(rsp, rnp) { 523#endif /* #ifdef CONFIG_SMP */
833 raw_spin_lock_irqsave(&rnp->lock, flags); 524
834 if (rnp->qsmask != 0) { 525#endif /* #else #ifdef CONFIG_NO_HZ */
835 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 526
836 if (rnp->qsmask & (1UL << cpu)) 527int rcu_cpu_stall_suppress __read_mostly;
837 dump_cpu_task(rnp->grplo + cpu); 528
838 } 529static void record_gp_stall_check_time(struct rcu_state *rsp)
839 raw_spin_unlock_irqrestore(&rnp->lock, flags); 530{
840 } 531 rsp->gp_start = jiffies;
532 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
841} 533}
842 534
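Both rcu_dump_cpu_stacks() above and the stall printouts that follow scan the leaf rcu_node structures the same way: each leaf covers CPUs grplo..grphi, and bit (cpu - grplo) of ->qsmask stays set while that CPU still owes a quiescent state. A small user-space sketch of that scan, with the structure cut down to the two fields that matter and clearly not kernel code:

#include <stdio.h>

struct leaf_node {
        unsigned long qsmask;           /* outstanding quiescent states */
        int grplo, grphi;               /* CPU range covered by this leaf */
};

static void report_holdouts(const struct leaf_node *rnp)
{
        int cpu;

        if (!rnp->qsmask)
                return;
        for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
                if (rnp->qsmask & (1UL << cpu))
                        printf("CPU %d has not yet reported\n",
                               rnp->grplo + cpu);
}

int main(void)
{
        struct leaf_node rnp = { .qsmask = 0x5, .grplo = 8, .grphi = 15 };

        report_holdouts(&rnp);          /* reports CPUs 8 and 10 */
        return 0;
}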
843static void print_other_cpu_stall(struct rcu_state *rsp) 535static void print_other_cpu_stall(struct rcu_state *rsp)
@@ -845,9 +537,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
845 int cpu; 537 int cpu;
846 long delta; 538 long delta;
847 unsigned long flags; 539 unsigned long flags;
848 int ndetected = 0;
849 struct rcu_node *rnp = rcu_get_root(rsp); 540 struct rcu_node *rnp = rcu_get_root(rsp);
850 long totqlen = 0;
851 541
852 /* Only let one CPU complain about others per time interval. */ 542 /* Only let one CPU complain about others per time interval. */
853 543
@@ -857,7 +547,13 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
857 raw_spin_unlock_irqrestore(&rnp->lock, flags); 547 raw_spin_unlock_irqrestore(&rnp->lock, flags);
858 return; 548 return;
859 } 549 }
860 rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; 550 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
551
552 /*
553 * Now rat on any tasks that got kicked up to the root rcu_node
554 * due to CPU offlining.
555 */
556 rcu_print_task_stall(rnp);
861 raw_spin_unlock_irqrestore(&rnp->lock, flags); 557 raw_spin_unlock_irqrestore(&rnp->lock, flags);
862 558
863 /* 559 /*
@@ -865,77 +561,47 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
865 * See Documentation/RCU/stallwarn.txt for info on how to debug 561 * See Documentation/RCU/stallwarn.txt for info on how to debug
866 * RCU CPU stall warnings. 562 * RCU CPU stall warnings.
867 */ 563 */
868 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", 564 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
869 rsp->name); 565 rsp->name);
870 print_cpu_stall_info_begin();
871 rcu_for_each_leaf_node(rsp, rnp) { 566 rcu_for_each_leaf_node(rsp, rnp) {
872 raw_spin_lock_irqsave(&rnp->lock, flags); 567 raw_spin_lock_irqsave(&rnp->lock, flags);
873 ndetected += rcu_print_task_stall(rnp); 568 rcu_print_task_stall(rnp);
874 if (rnp->qsmask != 0) {
875 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
876 if (rnp->qsmask & (1UL << cpu)) {
877 print_cpu_stall_info(rsp,
878 rnp->grplo + cpu);
879 ndetected++;
880 }
881 }
882 raw_spin_unlock_irqrestore(&rnp->lock, flags); 569 raw_spin_unlock_irqrestore(&rnp->lock, flags);
570 if (rnp->qsmask == 0)
571 continue;
572 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
573 if (rnp->qsmask & (1UL << cpu))
574 printk(" %d", rnp->grplo + cpu);
883 } 575 }
576 printk("} (detected by %d, t=%ld jiffies)\n",
577 smp_processor_id(), (long)(jiffies - rsp->gp_start));
578 trigger_all_cpu_backtrace();
884 579
885 /* 580 /* If so configured, complain about tasks blocking the grace period. */
886 * Now rat on any tasks that got kicked up to the root rcu_node
887 * due to CPU offlining.
888 */
889 rnp = rcu_get_root(rsp);
890 raw_spin_lock_irqsave(&rnp->lock, flags);
891 ndetected += rcu_print_task_stall(rnp);
892 raw_spin_unlock_irqrestore(&rnp->lock, flags);
893
894 print_cpu_stall_info_end();
895 for_each_possible_cpu(cpu)
896 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
897 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
898 smp_processor_id(), (long)(jiffies - rsp->gp_start),
899 rsp->gpnum, rsp->completed, totqlen);
900 if (ndetected == 0)
901 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
902 else if (!trigger_all_cpu_backtrace())
903 rcu_dump_cpu_stacks(rsp);
904
905 /* Complain about tasks blocking the grace period. */
906 581
907 rcu_print_detail_task_stall(rsp); 582 rcu_print_detail_task_stall(rsp);
908 583
909 force_quiescent_state(rsp); /* Kick them all. */ 584 force_quiescent_state(rsp, 0); /* Kick them all. */
910} 585}
911 586
912static void print_cpu_stall(struct rcu_state *rsp) 587static void print_cpu_stall(struct rcu_state *rsp)
913{ 588{
914 int cpu;
915 unsigned long flags; 589 unsigned long flags;
916 struct rcu_node *rnp = rcu_get_root(rsp); 590 struct rcu_node *rnp = rcu_get_root(rsp);
917 long totqlen = 0;
918 591
919 /* 592 /*
920 * OK, time to rat on ourselves... 593 * OK, time to rat on ourselves...
921 * See Documentation/RCU/stallwarn.txt for info on how to debug 594 * See Documentation/RCU/stallwarn.txt for info on how to debug
922 * RCU CPU stall warnings. 595 * RCU CPU stall warnings.
923 */ 596 */
924 printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); 597 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
925 print_cpu_stall_info_begin(); 598 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
926 print_cpu_stall_info(rsp, smp_processor_id()); 599 trigger_all_cpu_backtrace();
927 print_cpu_stall_info_end();
928 for_each_possible_cpu(cpu)
929 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
930 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
931 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
932 if (!trigger_all_cpu_backtrace())
933 dump_stack();
934 600
935 raw_spin_lock_irqsave(&rnp->lock, flags); 601 raw_spin_lock_irqsave(&rnp->lock, flags);
936 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 602 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
937 rsp->jiffies_stall = jiffies + 603 rsp->jiffies_stall =
938 3 * jiffies_till_stall_check() + 3; 604 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
939 raw_spin_unlock_irqrestore(&rnp->lock, flags); 605 raw_spin_unlock_irqrestore(&rnp->lock, flags);
940 606
941 set_need_resched(); /* kick ourselves to get things going. */ 607 set_need_resched(); /* kick ourselves to get things going. */
@@ -952,8 +618,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
952 j = ACCESS_ONCE(jiffies); 618 j = ACCESS_ONCE(jiffies);
953 js = ACCESS_ONCE(rsp->jiffies_stall); 619 js = ACCESS_ONCE(rsp->jiffies_stall);
954 rnp = rdp->mynode; 620 rnp = rdp->mynode;
955 if (rcu_gp_in_progress(rsp) && 621 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
956 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
957 622
958 /* We haven't checked in, so go dump stack. */ 623 /* We haven't checked in, so go dump stack. */
959 print_cpu_stall(rsp); 624 print_cpu_stall(rsp);
@@ -983,10 +648,9 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
983 */ 648 */
984void rcu_cpu_stall_reset(void) 649void rcu_cpu_stall_reset(void)
985{ 650{
986 struct rcu_state *rsp; 651 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
987 652 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
988 for_each_rcu_flavor(rsp) 653 rcu_preempt_stall_reset();
989 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
990} 654}
991 655
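Setting ->jiffies_stall to jiffies + ULONG_MAX / 2, as rcu_cpu_stall_reset() does in both versions above, works because the stall checks use wraparound-safe comparisons: a deadline half the counter space away is the farthest representable future time, so stall warnings are effectively silenced until the deadline is rearmed. A quick user-space demonstration, with the comparison macro defined locally in the same spirit as the kernel's ULONG_CMP_GE():

#include <assert.h>
#include <limits.h>

/* Wrap-safe "a >= b" for unsigned long counters (local to this sketch). */
#define CMP_GE(a, b)    (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
        unsigned long now = 123456UL;                   /* pretend jiffies */
        unsigned long deadline = now + ULONG_MAX / 2;   /* "never", in effect */

        assert(!CMP_GE(now, deadline));                 /* not stalled now */
        assert(!CMP_GE(now + ULONG_MAX / 4, deadline)); /* nor much later */
        return 0;
}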
992static struct notifier_block rcu_panic_block = { 656static struct notifier_block rcu_panic_block = {
@@ -1014,10 +678,11 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
1014 * go looking for one. 678 * go looking for one.
1015 */ 679 */
1016 rdp->gpnum = rnp->gpnum; 680 rdp->gpnum = rnp->gpnum;
1017 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); 681 if (rnp->qsmask & rdp->grpmask) {
1018 rdp->passed_quiesce = 0; 682 rdp->qs_pending = 1;
1019 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 683 rdp->passed_quiesc = 0;
1020 zero_cpu_stall_ticks(rdp); 684 } else
685 rdp->qs_pending = 0;
1021 } 686 }
1022} 687}
1023 688
@@ -1058,19 +723,6 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
1058} 723}
1059 724
1060/* 725/*
1061 * Initialize the specified rcu_data structure's callback list to empty.
1062 */
1063static void init_callback_list(struct rcu_data *rdp)
1064{
1065 int i;
1066
1067 rdp->nxtlist = NULL;
1068 for (i = 0; i < RCU_NEXT_SIZE; i++)
1069 rdp->nxttail[i] = &rdp->nxtlist;
1070 init_nocb_callback_list(rdp);
1071}
1072
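init_callback_list() above resets the segmented callback list: ->nxtlist is one singly linked list, and ->nxttail[] stores, for each segment (DONE, WAIT, NEXT_READY, NEXT), a pointer to the pointer that terminates that segment, so an empty list has every tail aimed back at &->nxtlist. A self-contained sketch of that layout and of enqueueing at the NEXT segment, with trimmed-down illustrative types:

#include <assert.h>
#include <stddef.h>

enum { RCU_DONE_TAIL, RCU_WAIT_TAIL, RCU_NEXT_READY_TAIL, RCU_NEXT_TAIL,
       RCU_NEXT_SIZE };

struct cb {
        struct cb *next;
};

struct cb_list {
        struct cb *nxtlist;
        struct cb **nxttail[RCU_NEXT_SIZE];
};

static void init_callback_list(struct cb_list *l)
{
        int i;

        l->nxtlist = NULL;
        for (i = 0; i < RCU_NEXT_SIZE; i++)
                l->nxttail[i] = &l->nxtlist;
}

/* Enqueue a new callback at the very end (the NEXT segment). */
static void enqueue(struct cb_list *l, struct cb *head)
{
        head->next = NULL;
        *l->nxttail[RCU_NEXT_TAIL] = head;
        l->nxttail[RCU_NEXT_TAIL] = &head->next;
}

int main(void)
{
        struct cb_list l;
        struct cb a, b;

        init_callback_list(&l);
        enqueue(&l, &a);
        enqueue(&l, &b);
        assert(l.nxtlist == &a && a.next == &b && b.next == NULL);
        assert(l.nxttail[RCU_DONE_TAIL] == &l.nxtlist);   /* nothing done yet */
        return 0;
}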
1073/*
1074 * Advance this CPU's callbacks, but only if the current grace period 726 * Advance this CPU's callbacks, but only if the current grace period
1075 * has ended. This may be called only from the CPU to whom the rdp 727 * has ended. This may be called only from the CPU to whom the rdp
1076 * belongs. In addition, the corresponding leaf rcu_node structure's 728 * belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1089,7 +741,6 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1089 741
1090 /* Remember that we saw this grace-period completion. */ 742 /* Remember that we saw this grace-period completion. */
1091 rdp->completed = rnp->completed; 743 rdp->completed = rnp->completed;
1092 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
1093 744
1094 /* 745 /*
1095 * If we were in an extended quiescent state, we may have 746 * If we were in an extended quiescent state, we may have
@@ -1097,13 +748,10 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1097 * our behalf. Catch up with this state to avoid noting 748 * our behalf. Catch up with this state to avoid noting
1098 * spurious new grace periods. If another grace period 749 * spurious new grace periods. If another grace period
1099 * has started, then rnp->gpnum will have advanced, so 750 * has started, then rnp->gpnum will have advanced, so
1100 * we will detect this later on. Of course, any quiescent 751 * we will detect this later on.
1101 * states we found for the old GP are now invalid.
1102 */ 752 */
1103 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { 753 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
1104 rdp->gpnum = rdp->completed; 754 rdp->gpnum = rdp->completed;
1105 rdp->passed_quiesce = 0;
1106 }
1107 755
1108 /* 756 /*
1109 * If RCU does not need a quiescent state from this CPU, 757 * If RCU does not need a quiescent state from this CPU,
@@ -1147,272 +795,120 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1147 /* Prior grace period ended, so advance callbacks for current CPU. */ 795 /* Prior grace period ended, so advance callbacks for current CPU. */
1148 __rcu_process_gp_end(rsp, rnp, rdp); 796 __rcu_process_gp_end(rsp, rnp, rdp);
1149 797
798 /*
799 * Because this CPU just now started the new grace period, we know
800 * that all of its callbacks will be covered by this upcoming grace
801 * period, even the ones that were registered arbitrarily recently.
802 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
803 *
804 * Other CPUs cannot be sure exactly when the grace period started.
805 * Therefore, their recently registered callbacks must pass through
806 * an additional RCU_NEXT_READY stage, so that they will be handled
807 * by the next RCU grace period.
808 */
809 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
810 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
811
1150 /* Set state so that this CPU will detect the next quiescent state. */ 812 /* Set state so that this CPU will detect the next quiescent state. */
1151 __note_new_gpnum(rsp, rnp, rdp); 813 __note_new_gpnum(rsp, rnp, rdp);
1152} 814}
1153 815
1154/* 816/*
1155 * Initialize a new grace period. 817 * Start a new RCU grace period if warranted, re-initializing the hierarchy
818 * in preparation for detecting the next grace period. The caller must hold
819 * the root node's ->lock, which is released before return. Hard irqs must
820 * be disabled.
1156 */ 821 */
1157static int rcu_gp_init(struct rcu_state *rsp) 822static void
823rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
824 __releases(rcu_get_root(rsp)->lock)
1158{ 825{
1159 struct rcu_data *rdp; 826 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1160 struct rcu_node *rnp = rcu_get_root(rsp); 827 struct rcu_node *rnp = rcu_get_root(rsp);
1161 828
1162 raw_spin_lock_irq(&rnp->lock); 829 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
1163 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 830 if (cpu_needs_another_gp(rsp, rdp))
831 rsp->fqs_need_gp = 1;
832 if (rnp->completed == rsp->completed) {
833 raw_spin_unlock_irqrestore(&rnp->lock, flags);
834 return;
835 }
836 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1164 837
1165 if (rcu_gp_in_progress(rsp)) { 838 /*
1166 /* Grace period already in progress, don't start another. */ 839 * Propagate new ->completed value to rcu_node structures
1167 raw_spin_unlock_irq(&rnp->lock); 840 * so that other CPUs don't have to wait until the start
1168 return 0; 841 * of the next grace period to process their callbacks.
842 */
843 rcu_for_each_node_breadth_first(rsp, rnp) {
844 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
845 rnp->completed = rsp->completed;
846 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
847 }
848 local_irq_restore(flags);
849 return;
1169 } 850 }
1170 851
1171 /* Advance to a new grace period and initialize state. */ 852 /* Advance to a new grace period and initialize state. */
1172 rsp->gpnum++; 853 rsp->gpnum++;
1173 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 854 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
855 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
856 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1174 record_gp_stall_check_time(rsp); 857 record_gp_stall_check_time(rsp);
1175 raw_spin_unlock_irq(&rnp->lock); 858
859 /* Special-case the common single-level case. */
860 if (NUM_RCU_NODES == 1) {
861 rcu_preempt_check_blocked_tasks(rnp);
862 rnp->qsmask = rnp->qsmaskinit;
863 rnp->gpnum = rsp->gpnum;
864 rnp->completed = rsp->completed;
865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
866 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp);
868 raw_spin_unlock_irqrestore(&rnp->lock, flags);
869 return;
870 }
871
872 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
873
1176 874
1177 /* Exclude any concurrent CPU-hotplug operations. */ 875 /* Exclude any concurrent CPU-hotplug operations. */
1178 mutex_lock(&rsp->onoff_mutex); 876 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1179 877
1180 /* 878 /*
1181 * Set the quiescent-state-needed bits in all the rcu_node 879 * Set the quiescent-state-needed bits in all the rcu_node
1182 * structures for all currently online CPUs in breadth-first order, 880 * structures for all currently online CPUs in breadth-first
1183 * starting from the root rcu_node structure, relying on the layout 881 * order, starting from the root rcu_node structure. This
1184 * of the tree within the rsp->node[] array. Note that other CPUs 882 * operation relies on the layout of the hierarchy within the
1185 * will access only the leaves of the hierarchy, thus seeing that no 883 * rsp->node[] array. Note that other CPUs will access only
884 * the leaves of the hierarchy, which still indicate that no
1186 * grace period is in progress, at least until the corresponding 885 * grace period is in progress, at least until the corresponding
1187 * leaf node has been initialized. In addition, we have excluded 886 * leaf node has been initialized. In addition, we have excluded
1188 * CPU-hotplug operations. 887 * CPU-hotplug operations.
1189 * 888 *
1190 * The grace period cannot complete until the initialization 889 * Note that the grace period cannot complete until we finish
1191 * process finishes, because this kthread handles both. 890 * the initialization process, as there will be at least one
891 * qsmask bit set in the root node until that time, namely the
892 * one corresponding to this CPU, due to the fact that we have
893 * irqs disabled.
1192 */ 894 */
1193 rcu_for_each_node_breadth_first(rsp, rnp) { 895 rcu_for_each_node_breadth_first(rsp, rnp) {
1194 raw_spin_lock_irq(&rnp->lock); 896 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1195 rdp = this_cpu_ptr(rsp->rda);
1196 rcu_preempt_check_blocked_tasks(rnp); 897 rcu_preempt_check_blocked_tasks(rnp);
1197 rnp->qsmask = rnp->qsmaskinit; 898 rnp->qsmask = rnp->qsmaskinit;
1198 rnp->gpnum = rsp->gpnum; 899 rnp->gpnum = rsp->gpnum;
1199 WARN_ON_ONCE(rnp->completed != rsp->completed);
1200 rnp->completed = rsp->completed; 900 rnp->completed = rsp->completed;
1201 if (rnp == rdp->mynode) 901 if (rnp == rdp->mynode)
1202 rcu_start_gp_per_cpu(rsp, rnp, rdp); 902 rcu_start_gp_per_cpu(rsp, rnp, rdp);
1203 rcu_preempt_boost_start_gp(rnp); 903 rcu_preempt_boost_start_gp(rnp);
1204 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1205 rnp->level, rnp->grplo,
1206 rnp->grphi, rnp->qsmask);
1207 raw_spin_unlock_irq(&rnp->lock);
1208#ifdef CONFIG_PROVE_RCU_DELAY
1209 if ((random32() % (rcu_num_nodes * 8)) == 0)
1210 schedule_timeout_uninterruptible(2);
1211#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1212 cond_resched();
1213 } 905 }
1214 906
1215 mutex_unlock(&rsp->onoff_mutex);
1216 return 1;
1217}
1218
1219/*
1220 * Do one round of quiescent-state forcing.
1221 */
1222int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1223{
1224 int fqs_state = fqs_state_in;
1225 struct rcu_node *rnp = rcu_get_root(rsp);
1226
1227 rsp->n_force_qs++;
1228 if (fqs_state == RCU_SAVE_DYNTICK) {
1229 /* Collect dyntick-idle snapshots. */
1230 force_qs_rnp(rsp, dyntick_save_progress_counter);
1231 fqs_state = RCU_FORCE_QS;
1232 } else {
1233 /* Handle dyntick-idle and offline CPUs. */
1234 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1235 }
1236 /* Clear flag to prevent immediate re-entry. */
1237 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1238 raw_spin_lock_irq(&rnp->lock);
1239 rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
1240 raw_spin_unlock_irq(&rnp->lock);
1241 }
1242 return fqs_state;
1243}
1244
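Quiescent-state forcing is two-phased in both versions shown in this diff: the first pass snapshots each holdout CPU's dynticks counter (dyntick_save_progress_counter), and later passes compare against those snapshots and handle offline CPUs (rcu_implicit_dynticks_qs). In rcu_gp_fqs() above that phase lives in the fqs_state variable; in the force_quiescent_state() variant later in the diff it is the RCU_SAVE_DYNTICK/RCU_FORCE_QS switch on ->signaled. A toy state machine showing just the phase handling, with made-up helper names:

#include <stdio.h>

enum fqs_state { SAVE_DYNTICK, FORCE_QS };

static void snapshot_holdouts(void) { puts("collect dyntick snapshots"); }
static void recheck_holdouts(void)  { puts("compare against the snapshots"); }

static enum fqs_state do_fqs(enum fqs_state state)
{
        if (state == SAVE_DYNTICK) {
                snapshot_holdouts();
                return FORCE_QS;        /* next round compares */
        }
        recheck_holdouts();
        return FORCE_QS;                /* keep comparing until the GP ends */
}

int main(void)
{
        enum fqs_state state = SAVE_DYNTICK;

        state = do_fqs(state);          /* first forcing pass */
        do_fqs(state);                  /* any later pass */
        return 0;
}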
1245/*
1246 * Clean up after the old grace period.
1247 */
1248static void rcu_gp_cleanup(struct rcu_state *rsp)
1249{
1250 unsigned long gp_duration;
1251 struct rcu_data *rdp;
1252 struct rcu_node *rnp = rcu_get_root(rsp);
1253
1254 raw_spin_lock_irq(&rnp->lock);
1255 gp_duration = jiffies - rsp->gp_start;
1256 if (gp_duration > rsp->gp_max)
1257 rsp->gp_max = gp_duration;
1258
1259 /*
1260 * We know the grace period is complete, but to everyone else
1261 * it appears to still be ongoing. But it is also the case
1262 * that to everyone else it looks like there is nothing that
1263 * they can do to advance the grace period. It is therefore
1264 * safe for us to drop the lock in order to mark the grace
1265 * period as completed in all of the rcu_node structures.
1266 */
1267 raw_spin_unlock_irq(&rnp->lock);
1268
1269 /*
1270 * Propagate new ->completed value to rcu_node structures so
1271 * that other CPUs don't have to wait until the start of the next
1272 * grace period to process their callbacks. This also avoids
1273 * some nasty RCU grace-period initialization races by forcing
1274 * the end of the current grace period to be completely recorded in
1275 * all of the rcu_node structures before the beginning of the next
1276 * grace period is recorded in any of the rcu_node structures.
1277 */
1278 rcu_for_each_node_breadth_first(rsp, rnp) {
1279 raw_spin_lock_irq(&rnp->lock);
1280 rnp->completed = rsp->gpnum;
1281 raw_spin_unlock_irq(&rnp->lock);
1282 cond_resched();
1283 }
1284 rnp = rcu_get_root(rsp); 907 rnp = rcu_get_root(rsp);
1285 raw_spin_lock_irq(&rnp->lock); 908 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1286 909 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
1287 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 910 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1288 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 911 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1289 rsp->fqs_state = RCU_GP_IDLE;
1290 rdp = this_cpu_ptr(rsp->rda);
1291 if (cpu_needs_another_gp(rsp, rdp))
1292 rsp->gp_flags = 1;
1293 raw_spin_unlock_irq(&rnp->lock);
1294}
1295
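The rcu_for_each_node_breadth_first() walks used here rely on the rcu_node tree being laid out breadth-first inside a single array, so "visit every node, root before leaves" is simply a linear scan of that array. A trivial user-space sketch of propagating ->completed that way; the node count and grace-period number are made up:

#include <stdio.h>

#define NUM_NODES 5                     /* say, 1 root plus 4 leaves */

struct rnode {
        unsigned long completed;
};

int main(void)
{
        struct rnode node[NUM_NODES] = { { 0 } };
        unsigned long gpnum = 42;       /* grace period being retired */
        int i;

        /* rcu_for_each_node_breadth_first() is this loop in disguise. */
        for (i = 0; i < NUM_NODES; i++)
                node[i].completed = gpnum;

        printf("all %d nodes now show completed=%lu\n",
               NUM_NODES, node[0].completed);
        return 0;
}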
1296/*
1297 * Body of kthread that handles grace periods.
1298 */
1299static int __noreturn rcu_gp_kthread(void *arg)
1300{
1301 int fqs_state;
1302 unsigned long j;
1303 int ret;
1304 struct rcu_state *rsp = arg;
1305 struct rcu_node *rnp = rcu_get_root(rsp);
1306
1307 for (;;) {
1308
1309 /* Handle grace-period start. */
1310 for (;;) {
1311 wait_event_interruptible(rsp->gp_wq,
1312 rsp->gp_flags &
1313 RCU_GP_FLAG_INIT);
1314 if ((rsp->gp_flags & RCU_GP_FLAG_INIT) &&
1315 rcu_gp_init(rsp))
1316 break;
1317 cond_resched();
1318 flush_signals(current);
1319 }
1320
1321 /* Handle quiescent-state forcing. */
1322 fqs_state = RCU_SAVE_DYNTICK;
1323 j = jiffies_till_first_fqs;
1324 if (j > HZ) {
1325 j = HZ;
1326 jiffies_till_first_fqs = HZ;
1327 }
1328 for (;;) {
1329 rsp->jiffies_force_qs = jiffies + j;
1330 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1331 (rsp->gp_flags & RCU_GP_FLAG_FQS) ||
1332 (!ACCESS_ONCE(rnp->qsmask) &&
1333 !rcu_preempt_blocked_readers_cgp(rnp)),
1334 j);
1335 /* If grace period done, leave loop. */
1336 if (!ACCESS_ONCE(rnp->qsmask) &&
1337 !rcu_preempt_blocked_readers_cgp(rnp))
1338 break;
1339 /* If time for quiescent-state forcing, do it. */
1340 if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) {
1341 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1342 cond_resched();
1343 } else {
1344 /* Deal with stray signal. */
1345 cond_resched();
1346 flush_signals(current);
1347 }
1348 j = jiffies_till_next_fqs;
1349 if (j > HZ) {
1350 j = HZ;
1351 jiffies_till_next_fqs = HZ;
1352 } else if (j < 1) {
1353 j = 1;
1354 jiffies_till_next_fqs = 1;
1355 }
1356 }
1357
1358 /* Handle grace-period end. */
1359 rcu_gp_cleanup(rsp);
1360 }
1361}
1362
1363/*
1364 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1365 * in preparation for detecting the next grace period. The caller must hold
1366 * the root node's ->lock, which is released before return. Hard irqs must
1367 * be disabled.
1368 *
1369 * Note that it is legal for a dying CPU (which is marked as offline) to
1370 * invoke this function. This can happen when the dying CPU reports its
1371 * quiescent state.
1372 */
1373static void
1374rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1375 __releases(rcu_get_root(rsp)->lock)
1376{
1377 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1378 struct rcu_node *rnp = rcu_get_root(rsp);
1379
1380 if (!rsp->gp_kthread ||
1381 !cpu_needs_another_gp(rsp, rdp)) {
1382 /*
1383 * Either we have not yet spawned the grace-period
1384 * task, this CPU does not need another grace period,
1385 * or a grace period is already in progress.
1386 * Either way, don't start a new grace period.
1387 */
1388 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1389 return;
1390 }
1391
1392 /*
1393 * Because there is no grace period in progress right now,
1394 * any callbacks we have up to this point will be satisfied
1395 * by the next grace period. So promote all callbacks to be
1396 * handled after the end of the next grace period. If the
1397 * CPU is not yet aware of the end of the previous grace period,
1398 * we need to allow for the callback advancement that will
1399 * occur when it does become aware. Deadlock prevents us from
1400 * making it aware at this point: We cannot acquire a leaf
1401 * rcu_node ->lock while holding the root rcu_node ->lock.
1402 */
1403 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1404 if (rdp->completed == rsp->completed)
1405 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1406
1407 rsp->gp_flags = RCU_GP_FLAG_INIT;
1408 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1409
1410 /* Ensure that CPU is aware of completion of last grace period. */
1411 rcu_process_gp_end(rsp, rdp);
1412 local_irq_restore(flags);
1413
1414 /* Wake up rcu_gp_kthread() to start the grace period. */
1415 wake_up(&rsp->gp_wq);
1416} 912}
1417 913
1418/* 914/*
@@ -1425,9 +921,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1425static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 921static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1426 __releases(rcu_get_root(rsp)->lock) 922 __releases(rcu_get_root(rsp)->lock)
1427{ 923{
924 unsigned long gp_duration;
925
1428 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
1429 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 927
1430 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 928 /*
929 * Ensure that all grace-period and pre-grace-period activity
930 * is seen before the assignment to rsp->completed.
931 */
932 smp_mb(); /* See above block comment. */
933 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration;
936 rsp->completed = rsp->gpnum;
937 rsp->signaled = RCU_GP_IDLE;
938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
1431} 939}
1432 940
1433/* 941/*
@@ -1454,10 +962,6 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1454 return; 962 return;
1455 } 963 }
1456 rnp->qsmask &= ~mask; 964 rnp->qsmask &= ~mask;
1457 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
1458 mask, rnp->qsmask, rnp->level,
1459 rnp->grplo, rnp->grphi,
1460 !!rnp->gp_tasks);
1461 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
1462 966
1463 /* Other bits still set at this level, so done. */ 967 /* Other bits still set at this level, so done. */
@@ -1496,7 +1000,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1496 * based on quiescent states detected in an earlier grace period! 1000 * based on quiescent states detected in an earlier grace period!
1497 */ 1001 */
1498static void 1002static void
1499rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) 1003rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
1500{ 1004{
1501 unsigned long flags; 1005 unsigned long flags;
1502 unsigned long mask; 1006 unsigned long mask;
@@ -1504,16 +1008,17 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1504 1008
1505 rnp = rdp->mynode; 1009 rnp = rdp->mynode;
1506 raw_spin_lock_irqsave(&rnp->lock, flags); 1010 raw_spin_lock_irqsave(&rnp->lock, flags);
1507 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || 1011 if (lastcomp != rnp->completed) {
1508 rnp->completed == rnp->gpnum) {
1509 1012
1510 /* 1013 /*
1511 * The grace period in which this quiescent state was 1014 * Someone beat us to it for this grace period, so leave.
1512 * recorded has ended, so don't report it upwards. 1015 * The race with GP start is resolved by the fact that we
1513 * We will instead need a new quiescent state that lies 1016 * hold the leaf rcu_node lock, so that the per-CPU bits
1514 * within the current grace period. 1017 * cannot yet be initialized -- so we would simply find our
1018 * CPU's bit already cleared in rcu_report_qs_rnp() if this
1019 * race occurred.
1515 */ 1020 */
1516 rdp->passed_quiesce = 0; /* need qs for new gp. */ 1021 rdp->passed_quiesc = 0; /* try again later! */
1517 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1022 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1518 return; 1023 return;
1519 } 1024 }
@@ -1557,165 +1062,67 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1557 * Was there a quiescent state since the beginning of the grace 1062 * Was there a quiescent state since the beginning of the grace
1558 * period? If no, then exit and wait for the next call. 1063 * period? If no, then exit and wait for the next call.
1559 */ 1064 */
1560 if (!rdp->passed_quiesce) 1065 if (!rdp->passed_quiesc)
1561 return; 1066 return;
1562 1067
1563 /* 1068 /*
1564 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 1069 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1565 * judge of that). 1070 * judge of that).
1566 */ 1071 */
1567 rcu_report_qs_rdp(rdp->cpu, rsp, rdp); 1072 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
1568} 1073}
1569 1074
1570#ifdef CONFIG_HOTPLUG_CPU 1075#ifdef CONFIG_HOTPLUG_CPU
1571 1076
1572/* 1077/*
1573 * Send the specified CPU's RCU callbacks to the orphanage. The 1078 * Move a dying CPU's RCU callbacks to online CPU's callback list.
1574 * specified CPU must be offline, and the caller must hold the 1079 * Synchronization is not required because this function executes
1575 * ->orphan_lock. 1080 * in stop_machine() context.
1576 */ 1081 */
1577static void 1082static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1578rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1579 struct rcu_node *rnp, struct rcu_data *rdp)
1580{
1581 /* No-CBs CPUs do not have orphanable callbacks. */
1582 if (is_nocb_cpu(rdp->cpu))
1583 return;
1584
1585 /*
1586 * Orphan the callbacks. First adjust the counts. This is safe
1587 * because _rcu_barrier() excludes CPU-hotplug operations, so it
1588 * cannot be running now. Thus no memory barrier is required.
1589 */
1590 if (rdp->nxtlist != NULL) {
1591 rsp->qlen_lazy += rdp->qlen_lazy;
1592 rsp->qlen += rdp->qlen;
1593 rdp->n_cbs_orphaned += rdp->qlen;
1594 rdp->qlen_lazy = 0;
1595 ACCESS_ONCE(rdp->qlen) = 0;
1596 }
1597
1598 /*
1599 * Next, move those callbacks still needing a grace period to
1600 * the orphanage, where some other CPU will pick them up.
1601 * Some of the callbacks might have gone partway through a grace
1602 * period, but that is too bad. They get to start over because we
1603 * cannot assume that grace periods are synchronized across CPUs.
1604 * We don't bother updating the ->nxttail[] array yet, instead
1605 * we just reset the whole thing later on.
1606 */
1607 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
1608 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
1609 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
1610 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1611 }
1612
1613 /*
1614 * Then move the ready-to-invoke callbacks to the orphanage,
1615 * where some other CPU will pick them up. These will not be
1616 * required to pass though another grace period: They are done.
1617 */
1618 if (rdp->nxtlist != NULL) {
1619 *rsp->orphan_donetail = rdp->nxtlist;
1620 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
1621 }
1622
1623 /* Finally, initialize the rcu_data structure's list to empty. */
1624 init_callback_list(rdp);
1625}
1626
1627/*
1628 * Adopt the RCU callbacks from the specified rcu_state structure's
1629 * orphanage. The caller must hold the ->orphan_lock.
1630 */
1631static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1632{ 1083{
1633 int i; 1084 int i;
1634 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1085 /* current DYING CPU is cleared in the cpu_online_mask */
1635 1086 int receive_cpu = cpumask_any(cpu_online_mask);
1636 /* No-CBs CPUs are handled specially. */ 1087 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1637 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) 1088 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1638 return;
1639
1640 /* Do the accounting first. */
1641 rdp->qlen_lazy += rsp->qlen_lazy;
1642 rdp->qlen += rsp->qlen;
1643 rdp->n_cbs_adopted += rsp->qlen;
1644 if (rsp->qlen_lazy != rsp->qlen)
1645 rcu_idle_count_callbacks_posted();
1646 rsp->qlen_lazy = 0;
1647 rsp->qlen = 0;
1648
1649 /*
1650 * We do not need a memory barrier here because the only way we
1651 * can get here if there is an rcu_barrier() in flight is if
1652 * we are the task doing the rcu_barrier().
1653 */
1654
1655 /* First adopt the ready-to-invoke callbacks. */
1656 if (rsp->orphan_donelist != NULL) {
1657 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
1658 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
1659 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
1660 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1661 rdp->nxttail[i] = rsp->orphan_donetail;
1662 rsp->orphan_donelist = NULL;
1663 rsp->orphan_donetail = &rsp->orphan_donelist;
1664 }
1665 1089
1666 /* And then adopt the callbacks that still need a grace period. */ 1090 if (rdp->nxtlist == NULL)
1667 if (rsp->orphan_nxtlist != NULL) { 1091 return; /* irqs disabled, so comparison is stable. */
1668 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
1669 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
1670 rsp->orphan_nxtlist = NULL;
1671 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1672 }
1673}
1674 1092
1675/* 1093 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1676 * Trace the fact that this CPU is going offline. 1094 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1677 */ 1095 receive_rdp->qlen += rdp->qlen;
1678static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1096 receive_rdp->n_cbs_adopted += rdp->qlen;
1679{ 1097 rdp->n_cbs_orphaned += rdp->qlen;
1680 RCU_TRACE(unsigned long mask);
1681 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
1682 RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
1683 1098
1684 RCU_TRACE(mask = rdp->grpmask); 1099 rdp->nxtlist = NULL;
1685 trace_rcu_grace_period(rsp->name, 1100 for (i = 0; i < RCU_NEXT_SIZE; i++)
1686 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1101 rdp->nxttail[i] = &rdp->nxtlist;
1687 "cpuofl"); 1102 rdp->qlen = 0;
1688} 1103}
1689 1104
1690/* 1105/*
1691 * The CPU has been completely removed, and some other CPU is reporting 1106 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1692 * this fact from process context. Do the remainder of the cleanup, 1107 * and move all callbacks from the outgoing CPU to the current one.
1693 * including orphaning the outgoing CPU's RCU callbacks, and also 1108 * There can only be one CPU hotplug operation at a time, so no other
1694 * adopting them. There can only be one CPU hotplug operation at a time, 1109 * CPU can be attempting to update rcu_cpu_kthread_task.
1695 * so no other CPU can be attempting to update rcu_cpu_kthread_task.
1696 */ 1110 */
1697static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 1111static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1698{ 1112{
1699 unsigned long flags; 1113 unsigned long flags;
1700 unsigned long mask; 1114 unsigned long mask;
1701 int need_report = 0; 1115 int need_report = 0;
1702 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1116 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1703 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 1117 struct rcu_node *rnp;
1704
1705 /* Adjust any no-longer-needed kthreads. */
1706 rcu_boost_kthread_setaffinity(rnp, -1);
1707 1118
1708 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ 1119 rcu_stop_cpu_kthread(cpu);
1709 1120
1710 /* Exclude any attempts to start a new grace period. */ 1121 /* Exclude any attempts to start a new grace period. */
1711 mutex_lock(&rsp->onoff_mutex); 1122 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1712 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
1713
1714 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1715 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1716 rcu_adopt_orphan_cbs(rsp);
1717 1123
1718 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1124 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1125 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
1719 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1126 mask = rdp->grpmask; /* rnp->grplo is constant. */
1720 do { 1127 do {
1721 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1128 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
@@ -1736,33 +1143,40 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1736 /* 1143 /*
1737 * We still hold the leaf rcu_node structure lock here, and 1144 * We still hold the leaf rcu_node structure lock here, and
1738 * irqs are still disabled. The reason for this subterfuge is 1145 * irqs are still disabled. The reason for this subterfuge is
1739 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock 1146 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
1740 * held leads to deadlock. 1147 * held leads to deadlock.
1741 */ 1148 */
1742 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ 1149 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1743 rnp = rdp->mynode; 1150 rnp = rdp->mynode;
1744 if (need_report & RCU_OFL_TASKS_NORM_GP) 1151 if (need_report & RCU_OFL_TASKS_NORM_GP)
1745 rcu_report_unblock_qs_rnp(rnp, flags); 1152 rcu_report_unblock_qs_rnp(rnp, flags);
1746 else 1153 else
1747 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1154 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1748 if (need_report & RCU_OFL_TASKS_EXP_GP) 1155 if (need_report & RCU_OFL_TASKS_EXP_GP)
1749 rcu_report_exp_rnp(rsp, rnp, true); 1156 rcu_report_exp_rnp(rsp, rnp);
1750 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 1157 rcu_node_kthread_setaffinity(rnp, -1);
1751 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 1158}
1752 cpu, rdp->qlen, rdp->nxtlist); 1159
1753 init_callback_list(rdp); 1160/*
1754 /* Disallow further callbacks on this CPU. */ 1161 * Remove the specified CPU from the RCU hierarchy and move any pending
1755 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 1162 * callbacks that it might have to the current CPU. This code assumes
1756 mutex_unlock(&rsp->onoff_mutex); 1163 * that at least one CPU in the system will remain running at all times.
1164 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
1165 */
1166static void rcu_offline_cpu(int cpu)
1167{
1168 __rcu_offline_cpu(cpu, &rcu_sched_state);
1169 __rcu_offline_cpu(cpu, &rcu_bh_state);
1170 rcu_preempt_offline_cpu(cpu);
1757} 1171}
1758 1172
1759#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1173#else /* #ifdef CONFIG_HOTPLUG_CPU */
1760 1174
1761static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1175static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1762{ 1176{
1763} 1177}
1764 1178
1765static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 1179static void rcu_offline_cpu(int cpu)
1766{ 1180{
1767} 1181}
1768 1182
@@ -1776,70 +1190,52 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1776{ 1190{
1777 unsigned long flags; 1191 unsigned long flags;
1778 struct rcu_head *next, *list, **tail; 1192 struct rcu_head *next, *list, **tail;
1779 long bl, count, count_lazy; 1193 int count;
1780 int i;
1781 1194
1782 /* If no callbacks are ready, just return.*/ 1195 /* If no callbacks are ready, just return.*/
1783 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1196 if (!cpu_has_callbacks_ready_to_invoke(rdp))
1784 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1785 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1786 need_resched(), is_idle_task(current),
1787 rcu_is_callbacks_kthread());
1788 return; 1197 return;
1789 }
1790 1198
1791 /* 1199 /*
1792 * Extract the list of ready callbacks, disabling to prevent 1200 * Extract the list of ready callbacks, disabling to prevent
1793 * races with call_rcu() from interrupt handlers. 1201 * races with call_rcu() from interrupt handlers.
1794 */ 1202 */
1795 local_irq_save(flags); 1203 local_irq_save(flags);
1796 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1797 bl = rdp->blimit;
1798 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
1799 list = rdp->nxtlist; 1204 list = rdp->nxtlist;
1800 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1205 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1801 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1206 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1802 tail = rdp->nxttail[RCU_DONE_TAIL]; 1207 tail = rdp->nxttail[RCU_DONE_TAIL];
1803 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) 1208 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
1804 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 1209 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
1805 rdp->nxttail[i] = &rdp->nxtlist; 1210 rdp->nxttail[count] = &rdp->nxtlist;
1806 local_irq_restore(flags); 1211 local_irq_restore(flags);
1807 1212
1808 /* Invoke callbacks. */ 1213 /* Invoke callbacks. */
1809 count = count_lazy = 0; 1214 count = 0;
1810 while (list) { 1215 while (list) {
1811 next = list->next; 1216 next = list->next;
1812 prefetch(next); 1217 prefetch(next);
1813 debug_rcu_head_unqueue(list); 1218 debug_rcu_head_unqueue(list);
1814 if (__rcu_reclaim(rsp->name, list)) 1219 __rcu_reclaim(list);
1815 count_lazy++;
1816 list = next; 1220 list = next;
1817 /* Stop only if limit reached and CPU has something to do. */ 1221 if (++count >= rdp->blimit)
1818 if (++count >= bl &&
1819 (need_resched() ||
1820 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
1821 break; 1222 break;
1822 } 1223 }
1823 1224
1824 local_irq_save(flags); 1225 local_irq_save(flags);
1825 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
1826 is_idle_task(current),
1827 rcu_is_callbacks_kthread());
1828 1226
1829 /* Update count, and requeue any remaining callbacks. */ 1227 /* Update count, and requeue any remaining callbacks. */
1228 rdp->qlen -= count;
1229 rdp->n_cbs_invoked += count;
1830 if (list != NULL) { 1230 if (list != NULL) {
1831 *tail = rdp->nxtlist; 1231 *tail = rdp->nxtlist;
1832 rdp->nxtlist = list; 1232 rdp->nxtlist = list;
1833 for (i = 0; i < RCU_NEXT_SIZE; i++) 1233 for (count = 0; count < RCU_NEXT_SIZE; count++)
1834 if (&rdp->nxtlist == rdp->nxttail[i]) 1234 if (&rdp->nxtlist == rdp->nxttail[count])
1835 rdp->nxttail[i] = tail; 1235 rdp->nxttail[count] = tail;
1836 else 1236 else
1837 break; 1237 break;
1838 } 1238 }
1839 smp_mb(); /* List handling before counting for rcu_barrier(). */
1840 rdp->qlen_lazy -= count_lazy;
1841 ACCESS_ONCE(rdp->qlen) -= count;
1842 rdp->n_cbs_invoked += count;
1843 1239
1844 /* Reinstate batch limit if we have worked down the excess. */ 1240 /* Reinstate batch limit if we have worked down the excess. */
1845 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1241 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1851,11 +1247,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1851 rdp->n_force_qs_snap = rsp->n_force_qs; 1247 rdp->n_force_qs_snap = rsp->n_force_qs;
1852 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 1248 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
1853 rdp->qlen_last_fqs_check = rdp->qlen; 1249 rdp->qlen_last_fqs_check = rdp->qlen;
1854 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
1855 1250
1856 local_irq_restore(flags); 1251 local_irq_restore(flags);
1857 1252
1858 /* Re-invoke RCU core processing if there are callbacks remaining. */ 1253 /* Re-raise the RCU softirq if there are callbacks remaining. */
1859 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1254 if (cpu_has_callbacks_ready_to_invoke(rdp))
1860 invoke_rcu_core(); 1255 invoke_rcu_core();
1861} 1256}
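rcu_do_batch() above invokes at most ->blimit callbacks per pass, and elsewhere in this file the limit is opened up to LONG_MAX when the queue length passes qhimark and reinstated once the backlog drops to qlowmark. The sketch below compresses that policy into one place purely for illustration: the thresholds are assumed values, and in the real code the checks are split between rcu_do_batch() and the call_rcu() path.

#include <limits.h>
#include <stdio.h>

static long blimit = 10;                /* default batch limit */
static const long qhimark = 10000;      /* "queue is huge" threshold */
static const long qlowmark = 100;       /* "backlog worked down" threshold */

static long do_batch(long qlen)
{
        long invoked = qlen < blimit ? qlen : blimit;

        qlen -= invoked;
        printf("invoked %ld callbacks, %ld remain\n", invoked, qlen);
        if (blimit == LONG_MAX && qlen <= qlowmark)
                blimit = 10;            /* reinstate the normal limit */
        return qlen;
}

int main(void)
{
        long qlen = 20000;

        if (qlen > qhimark)
                blimit = LONG_MAX;      /* emergency: drain aggressively */
        qlen = do_batch(qlen);          /* drains everything this pass */
        do_batch(qlen);
        return 0;
}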
@@ -1863,17 +1258,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1863/* 1258/*
1864 * Check to see if this CPU is in a non-context-switch quiescent state 1259 * Check to see if this CPU is in a non-context-switch quiescent state
1865 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1260 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1866 * Also schedule RCU core processing. 1261 * Also schedule the RCU softirq handler.
1867 * 1262 *
1868 * This function must be called from hardirq context. It is normally 1263 * This function must be called with hardirqs disabled. It is normally
1869 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1264 * invoked from the scheduling-clock interrupt. If rcu_pending returns
1870 * false, there is no point in invoking rcu_check_callbacks(). 1265 * false, there is no point in invoking rcu_check_callbacks().
1871 */ 1266 */
1872void rcu_check_callbacks(int cpu, int user) 1267void rcu_check_callbacks(int cpu, int user)
1873{ 1268{
1874 trace_rcu_utilization("Start scheduler-tick"); 1269 if (user ||
1875 increment_cpu_stall_ticks(); 1270 (idle_cpu(cpu) && rcu_scheduler_active &&
1876 if (user || rcu_is_cpu_rrupt_from_idle()) { 1271 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
1877 1272
1878 /* 1273 /*
1879 * Get here if this CPU took its interrupt from user 1274 * Get here if this CPU took its interrupt from user
@@ -1904,9 +1299,10 @@ void rcu_check_callbacks(int cpu, int user)
1904 rcu_preempt_check_callbacks(cpu); 1299 rcu_preempt_check_callbacks(cpu);
1905 if (rcu_pending(cpu)) 1300 if (rcu_pending(cpu))
1906 invoke_rcu_core(); 1301 invoke_rcu_core();
1907 trace_rcu_utilization("End scheduler-tick");
1908} 1302}
1909 1303
1304#ifdef CONFIG_SMP
1305
1910/* 1306/*
1911 * Scan the leaf rcu_node structures, processing dyntick state for any that 1307 * Scan the leaf rcu_node structures, processing dyntick state for any that
1912 * have not yet encountered a quiescent state, using the function specified. 1308 * have not yet encountered a quiescent state, using the function specified.
@@ -1923,7 +1319,6 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1923 struct rcu_node *rnp; 1319 struct rcu_node *rnp;
1924 1320
1925 rcu_for_each_leaf_node(rsp, rnp) { 1321 rcu_for_each_leaf_node(rsp, rnp) {
1926 cond_resched();
1927 mask = 0; 1322 mask = 0;
1928 raw_spin_lock_irqsave(&rnp->lock, flags); 1323 raw_spin_lock_irqsave(&rnp->lock, flags);
1929 if (!rcu_gp_in_progress(rsp)) { 1324 if (!rcu_gp_in_progress(rsp)) {
@@ -1960,55 +1355,99 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1960 * Force quiescent states on reluctant CPUs, and also detect which 1355 * Force quiescent states on reluctant CPUs, and also detect which
1961 * CPUs are in dyntick-idle mode. 1356 * CPUs are in dyntick-idle mode.
1962 */ 1357 */
1963static void force_quiescent_state(struct rcu_state *rsp) 1358static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1964{ 1359{
1965 unsigned long flags; 1360 unsigned long flags;
1966 bool ret; 1361 struct rcu_node *rnp = rcu_get_root(rsp);
1967 struct rcu_node *rnp; 1362
1968 struct rcu_node *rnp_old = NULL; 1363 if (!rcu_gp_in_progress(rsp))
1969 1364 return; /* No grace period in progress, nothing to force. */
1970 /* Funnel through hierarchy to reduce memory contention. */ 1365 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1971 rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; 1366 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1972 for (; rnp != NULL; rnp = rnp->parent) { 1367 return; /* Someone else is already on the job. */
1973 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
1974 !raw_spin_trylock(&rnp->fqslock);
1975 if (rnp_old != NULL)
1976 raw_spin_unlock(&rnp_old->fqslock);
1977 if (ret) {
1978 rsp->n_force_qs_lh++;
1979 return;
1980 }
1981 rnp_old = rnp;
1982 } 1368 }
1983 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ 1369 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1984 1370 goto unlock_fqs_ret; /* no emergency and done recently. */
1985 /* Reached the root of the rcu_node tree, acquire lock. */ 1371 rsp->n_force_qs++;
1986 raw_spin_lock_irqsave(&rnp_old->lock, flags); 1372 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1987 raw_spin_unlock(&rnp_old->fqslock); 1373 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1988 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1374 if (!rcu_gp_in_progress(rsp)) {
1989 rsp->n_force_qs_lh++; 1375 rsp->n_force_qs_ngp++;
1990 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 1376 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1991 return; /* Someone beat us to it. */ 1377 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1992 } 1378 }
1993 rsp->gp_flags |= RCU_GP_FLAG_FQS; 1379 rsp->fqs_active = 1;
1994 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 1380 switch (rsp->signaled) {
1995 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 1381 case RCU_GP_IDLE:
1382 case RCU_GP_INIT:
1383
1384 break; /* grace period idle or initializing, ignore. */
1385
1386 case RCU_SAVE_DYNTICK:
1387 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1388 break; /* So gcc recognizes the dead code. */
1389
1390 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1391
1392 /* Record dyntick-idle state. */
1393 force_qs_rnp(rsp, dyntick_save_progress_counter);
1394 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1395 if (rcu_gp_in_progress(rsp))
1396 rsp->signaled = RCU_FORCE_QS;
1397 break;
1398
1399 case RCU_FORCE_QS:
1400
1401 /* Check dyntick-idle state, send IPI to laggarts. */
1402 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1403 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1404
1405 /* Leave state in case more forcing is required. */
1406
1407 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1408 break;
1409 }
1410 rsp->fqs_active = 0;
1411 if (rsp->fqs_need_gp) {
1412 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1413 rsp->fqs_need_gp = 0;
1414 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1415 return;
1416 }
1417 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1418unlock_fqs_ret:
1419 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1420}
1421
1422#else /* #ifdef CONFIG_SMP */
1423
1424static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1425{
1426 set_need_resched();
1996} 1427}
1997 1428
1429#endif /* #else #ifdef CONFIG_SMP */
1430
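The force_quiescent_state() on the old side of this hunk (the version being removed) uses funnel locking: each caller walks from its leaf rcu_node toward the root, trylocking each level's ->fqslock and dropping the one below it, so contending callers are shed early instead of all piling onto the root. A user-space sketch of that pattern, with pthread mutexes standing in for the kernel's raw spinlock trylocks and a two-level tree assumed:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
        pthread_mutex_t fqslock;
        struct node *parent;            /* NULL at the root */
};

/* Returns true if this caller wins the race to reach the root. */
static bool funnel_to_root(struct node *leaf)
{
        struct node *rnp, *rnp_old = NULL;

        for (rnp = leaf; rnp; rnp = rnp->parent) {
                bool failed = pthread_mutex_trylock(&rnp->fqslock) != 0;

                if (rnp_old)
                        pthread_mutex_unlock(&rnp_old->fqslock);
                if (failed)
                        return false;   /* someone else is already funnelling */
                rnp_old = rnp;
        }
        /* Only the root's fqslock is held here; do the work, then drop it. */
        pthread_mutex_unlock(&rnp_old->fqslock);
        return true;
}

int main(void)
{
        static struct node root = { PTHREAD_MUTEX_INITIALIZER, NULL };
        static struct node leaf = { PTHREAD_MUTEX_INITIALIZER, &root };

        printf("reached the root: %s\n",
               funnel_to_root(&leaf) ? "yes" : "no");
        return 0;
}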
1998/* 1431/*
1999 * This does the RCU core processing work for the specified rcu_state 1432 * This does the RCU processing work from softirq context for the
2000 * and rcu_data structures. This may be called only from the CPU to 1433 * specified rcu_state and rcu_data structures. This may be called
2001 * whom the rdp belongs. 1434 * only from the CPU to whom the rdp belongs.
2002 */ 1435 */
2003static void 1436static void
2004__rcu_process_callbacks(struct rcu_state *rsp) 1437__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
2005{ 1438{
2006 unsigned long flags; 1439 unsigned long flags;
2007 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
2008 1440
2009 WARN_ON_ONCE(rdp->beenonline == 0); 1441 WARN_ON_ONCE(rdp->beenonline == 0);
2010 1442
2011 /* 1443 /*
1444 * If an RCU GP has gone long enough, go check for dyntick
1445 * idle CPUs and, if needed, send resched IPIs.
1446 */
1447 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1448 force_quiescent_state(rsp, 1);
1449
1450 /*
2012 * Advance callbacks in response to end of earlier grace 1451 * Advance callbacks in response to end of earlier grace
2013 * period that some other CPU ended. 1452 * period that some other CPU ended.
2014 */ 1453 */
@@ -2029,26 +1468,24 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2029} 1468}
2030 1469
2031/* 1470/*
2032 * Do RCU core processing for the current CPU. 1471 * Do softirq processing for the current CPU.
2033 */ 1472 */
2034static void rcu_process_callbacks(struct softirq_action *unused) 1473static void rcu_process_callbacks(struct softirq_action *unused)
2035{ 1474{
2036 struct rcu_state *rsp; 1475 __rcu_process_callbacks(&rcu_sched_state,
1476 &__get_cpu_var(rcu_sched_data));
1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1478 rcu_preempt_process_callbacks();
2037 1479
2038 if (cpu_is_offline(smp_processor_id())) 1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
2039 return; 1481 rcu_needs_cpu_flush();
2040 trace_rcu_utilization("Start RCU core");
2041 for_each_rcu_flavor(rsp)
2042 __rcu_process_callbacks(rsp);
2043 trace_rcu_utilization("End RCU core");
2044} 1482}
2045 1483
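The two sides of this hunk differ in how rcu_process_callbacks() visits the RCU flavors: one walks a registered list with for_each_rcu_flavor(), the other calls the rcu_sched, rcu_bh and rcu_preempt handlers by name. A minimal sketch of the list-plus-iteration-macro idiom, with invented demo_* names:

#include <stdio.h>

/* Illustrative only.  One side of the diff walks every RCU flavor with
 * for_each_rcu_flavor(); this sketch shows the same "registered list +
 * iteration macro" idiom with hypothetical demo_* names. */
struct demo_flavor {
	const char *name;
	struct demo_flavor *next;
};

static struct demo_flavor *demo_flavors;   /* head of the flavor list */

#define for_each_demo_flavor(f) \
	for ((f) = demo_flavors; (f) != NULL; (f) = (f)->next)

static void demo_register(struct demo_flavor *f)
{
	f->next = demo_flavors;
	demo_flavors = f;
}

static void demo_process_callbacks(struct demo_flavor *f)
{
	printf("processing callbacks for %s\n", f->name);
}

int main(void)
{
	static struct demo_flavor sched   = { "rcu_sched" };
	static struct demo_flavor bh      = { "rcu_bh" };
	static struct demo_flavor preempt = { "rcu_preempt" };
	struct demo_flavor *f;

	demo_register(&sched);
	demo_register(&bh);
	demo_register(&preempt);

	for_each_demo_flavor(f)            /* analogous to for_each_rcu_flavor() */
		demo_process_callbacks(f);
	return 0;
}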
2046/* 1484/*
2047 * Schedule RCU callback invocation. If the specified type of RCU
2048 * does not support RCU priority boosting, just do a direct call,
2049 * otherwise wake up the per-CPU kernel kthread. Note that because we
2050 * are running on the current CPU with interrupts disabled, the
2051 * rcu_cpu_kthread_task cannot disappear out from under us.
1485 * Wake up the current CPU's kthread. This replaces raise_softirq()
1486 * in earlier versions of RCU. Note that because we are running on
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1488 * cannot disappear out from under us.
2052 */ 1489 */
2053static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
2054{ 1491{
@@ -2066,22 +1503,38 @@ static void invoke_rcu_core(void)
2066 raise_softirq(RCU_SOFTIRQ); 1503 raise_softirq(RCU_SOFTIRQ);
2067} 1504}
2068 1505
2069/* 1506static void
2070 * Handle any core-RCU processing required by a call_rcu() invocation. 1507__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2071 */ 1508 struct rcu_state *rsp)
2072static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2073 struct rcu_head *head, unsigned long flags)
2074{ 1509{
1510 unsigned long flags;
1511 struct rcu_data *rdp;
1512
1513 debug_rcu_head_queue(head);
1514 head->func = func;
1515 head->next = NULL;
1516
1517 smp_mb(); /* Ensure RCU update seen before callback registry. */
1518
2075 /* 1519 /*
2076 * If called from an extended quiescent state, invoke the RCU
2077 * core in order to force a re-evaluation of RCU's idleness.
1520 * Opportunistically note grace-period endings and beginnings.
1521 * Note that we might see a beginning right after we see an
1522 * end, but never vice versa, since this CPU has to pass through
1523 * a quiescent state betweentimes.
2078 */ 1524 */
2079 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) 1525 local_irq_save(flags);
2080 invoke_rcu_core(); 1526 rdp = this_cpu_ptr(rsp->rda);
1527
1528 /* Add the callback to our list. */
1529 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++;
2081 1532
2082 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 1533 /* If interrupts were disabled, don't dive into RCU core. */
2083 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) 1534 if (irqs_disabled_flags(flags)) {
1535 local_irq_restore(flags);
2084 return; 1536 return;
1537 }
2085 1538
2086 /* 1539 /*
2087 * Force the grace period if too many callbacks or too long waiting. 1540 * Force the grace period if too many callbacks or too long waiting.
@@ -2108,69 +1561,12 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2108 rdp->blimit = LONG_MAX; 1561 rdp->blimit = LONG_MAX;
2109 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1562 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
2110 *rdp->nxttail[RCU_DONE_TAIL] != head) 1563 *rdp->nxttail[RCU_DONE_TAIL] != head)
2111 force_quiescent_state(rsp); 1564 force_quiescent_state(rsp, 0);
2112 rdp->n_force_qs_snap = rsp->n_force_qs; 1565 rdp->n_force_qs_snap = rsp->n_force_qs;
2113 rdp->qlen_last_fqs_check = rdp->qlen; 1566 rdp->qlen_last_fqs_check = rdp->qlen;
2114 } 1567 }
2115 } 1568 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
2116} 1569 force_quiescent_state(rsp, 1);
2117
2118/*
2119 * Helper function for call_rcu() and friends. The cpu argument will
2120 * normally be -1, indicating "currently running CPU". It may specify
2121 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
2122 * is expected to specify a CPU.
2123 */
2124static void
2125__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2126 struct rcu_state *rsp, int cpu, bool lazy)
2127{
2128 unsigned long flags;
2129 struct rcu_data *rdp;
2130
2131 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
2132 debug_rcu_head_queue(head);
2133 head->func = func;
2134 head->next = NULL;
2135
2136 /*
2137 * Opportunistically note grace-period endings and beginnings.
2138 * Note that we might see a beginning right after we see an
2139 * end, but never vice versa, since this CPU has to pass through
2140 * a quiescent state betweentimes.
2141 */
2142 local_irq_save(flags);
2143 rdp = this_cpu_ptr(rsp->rda);
2144
2145 /* Add the callback to our list. */
2146 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
2147 int offline;
2148
2149 if (cpu != -1)
2150 rdp = per_cpu_ptr(rsp->rda, cpu);
2151 offline = !__call_rcu_nocb(rdp, head, lazy);
2152 WARN_ON_ONCE(offline);
2153 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2154 local_irq_restore(flags);
2155 return;
2156 }
2157 ACCESS_ONCE(rdp->qlen)++;
2158 if (lazy)
2159 rdp->qlen_lazy++;
2160 else
2161 rcu_idle_count_callbacks_posted();
2162 smp_mb(); /* Count before adding callback for rcu_barrier(). */
2163 *rdp->nxttail[RCU_NEXT_TAIL] = head;
2164 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
2165
2166 if (__is_kfree_rcu_offset((unsigned long)func))
2167 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
2168 rdp->qlen_lazy, rdp->qlen);
2169 else
2170 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
2171
2172 /* Go handle any RCU core processing required. */
2173 __call_rcu_core(rsp, rdp, head, flags);
2174 local_irq_restore(flags); 1570 local_irq_restore(flags);
2175} 1571}
2176 1572
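Both versions of __call_rcu() enqueue in the same basic way: the per-CPU callback list is a singly linked list of rcu_head structures, and nxttail[] records where each segment of that list ends as a pointer to a ->next field, so adding at RCU_NEXT_TAIL is two pointer stores with no list traversal. A self-contained sketch of that tail-pointer idiom (the demo_* names are hypothetical):

#include <stdio.h>
#include <stddef.h>

/* Illustrative only: enqueue through a tail pointer-to-pointer, as in
 * *rdp->nxttail[RCU_NEXT_TAIL] = head;
 *  rdp->nxttail[RCU_NEXT_TAIL] = &head->next;                        */
struct demo_head {
	struct demo_head *next;
	void (*func)(struct demo_head *h);
};

struct demo_cpu_data {
	struct demo_head *list;        /* like rdp->nxtlist */
	struct demo_head **tail;       /* like rdp->nxttail[RCU_NEXT_TAIL] */
	long qlen;
};

static void demo_init(struct demo_cpu_data *d)
{
	d->list = NULL;
	d->tail = &d->list;            /* empty list: tail points at the head pointer */
	d->qlen = 0;
}

static void demo_call(struct demo_cpu_data *d, struct demo_head *h,
		      void (*func)(struct demo_head *))
{
	h->func = func;
	h->next = NULL;
	*d->tail = h;                  /* link at the end, O(1) */
	d->tail = &h->next;            /* remember where the new end is */
	d->qlen++;
}

static void demo_invoke_all(struct demo_cpu_data *d)
{
	struct demo_head *h;

	while ((h = d->list) != NULL) {
		d->list = h->next;
		h->func(h);
		d->qlen--;
	}
	d->tail = &d->list;            /* list is empty again */
}

static void say_hi(struct demo_head *h) { printf("callback %p ran\n", (void *)h); }

int main(void)
{
	struct demo_cpu_data d;
	struct demo_head a, b;

	demo_init(&d);
	demo_call(&d, &a, say_hi);
	demo_call(&d, &b, say_hi);
	demo_invoke_all(&d);
	return 0;
}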
@@ -2179,39 +1575,19 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2179 */ 1575 */
2180void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1576void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2181{ 1577{
2182 __call_rcu(head, func, &rcu_sched_state, -1, 0); 1578 __call_rcu(head, func, &rcu_sched_state);
2183} 1579}
2184EXPORT_SYMBOL_GPL(call_rcu_sched); 1580EXPORT_SYMBOL_GPL(call_rcu_sched);
2185 1581
2186/* 1582/*
2187 * Queue an RCU callback for invocation after a quicker grace period.
1583 * Queue an RCU for invocation after a quicker grace period.
2188 */ 1584 */
2189void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1585void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2190{ 1586{
2191 __call_rcu(head, func, &rcu_bh_state, -1, 0); 1587 __call_rcu(head, func, &rcu_bh_state);
2192} 1588}
2193EXPORT_SYMBOL_GPL(call_rcu_bh); 1589EXPORT_SYMBOL_GPL(call_rcu_bh);
2194 1590
2195/*
2196 * Because a context switch is a grace period for RCU-sched and RCU-bh,
2197 * any blocking grace-period wait automatically implies a grace period
2198 * if there is only one CPU online at any point time during execution
2199 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
2200 * occasionally incorrectly indicate that there are multiple CPUs online
2201 * when there was in fact only one the whole time, as this just adds
2202 * some overhead: RCU still operates correctly.
2203 */
2204static inline int rcu_blocking_is_gp(void)
2205{
2206 int ret;
2207
2208 might_sleep(); /* Check for RCU read-side critical section. */
2209 preempt_disable();
2210 ret = num_online_cpus() <= 1;
2211 preempt_enable();
2212 return ret;
2213}
2214
2215/** 1591/**
2216 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 1592 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
2217 * 1593 *
@@ -2224,28 +1600,10 @@ static inline int rcu_blocking_is_gp(void)
2224 * rcu_read_lock_sched(). 1600 * rcu_read_lock_sched().
2225 * 1601 *
2226 * This means that all preempt_disable code sequences, including NMI and 1602 * This means that all preempt_disable code sequences, including NMI and
2227 * non-threaded hardware-interrupt handlers, in progress on entry will
2228 * have completed before this primitive returns. However, this does not
2229 * guarantee that softirq handlers will have completed, since in some
2230 * kernels, these handlers can run in process context, and can block.
1603 * hardware-interrupt handlers, in progress on entry will have completed
1604 * before this primitive returns. However, this does not guarantee that
1605 * softirq handlers will have completed, since in some kernels, these
1606 * handlers can run in process context, and can block.
2231 *
2232 * Note that this guarantee implies further memory-ordering guarantees.
2233 * On systems with more than one CPU, when synchronize_sched() returns,
2234 * each CPU is guaranteed to have executed a full memory barrier since the
2235 * end of its last RCU-sched read-side critical section whose beginning
2236 * preceded the call to synchronize_sched(). In addition, each CPU having
2237 * an RCU read-side critical section that extends beyond the return from
2238 * synchronize_sched() is guaranteed to have executed a full memory barrier
2239 * after the beginning of synchronize_sched() and before the beginning of
2240 * that RCU read-side critical section. Note that these guarantees include
2241 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2242 * that are executing in the kernel.
2243 *
2244 * Furthermore, if CPU A invoked synchronize_sched(), which returned
2245 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2246 * to have executed a full memory barrier during the execution of
2247 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
2248 * again only if the system has more than one CPU).
2249 * 1607 *
2250 * This primitive provides the guarantees made by the (now removed) 1608 * This primitive provides the guarantees made by the (now removed)
2251 * synchronize_kernel() API. In contrast, synchronize_rcu() only 1609 * synchronize_kernel() API. In contrast, synchronize_rcu() only
@@ -2255,16 +1613,18 @@ static inline int rcu_blocking_is_gp(void)
2255 */ 1613 */
2256void synchronize_sched(void) 1614void synchronize_sched(void)
2257{ 1615{
2258 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 1616 struct rcu_synchronize rcu;
2259 !lock_is_held(&rcu_lock_map) && 1617
2260 !lock_is_held(&rcu_sched_lock_map),
2261 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2262 if (rcu_blocking_is_gp()) 1618 if (rcu_blocking_is_gp())
2263 return; 1619 return;
2264 if (rcu_expedited) 1620
2265 synchronize_sched_expedited(); 1621 init_rcu_head_on_stack(&rcu.head);
2266 else 1622 init_completion(&rcu.completion);
2267 wait_rcu_gp(call_rcu_sched); 1623 /* Will wake me after RCU finished. */
1624 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1625 /* Wait for it. */
1626 wait_for_completion(&rcu.completion);
1627 destroy_rcu_head_on_stack(&rcu.head);
2268} 1628}
2269EXPORT_SYMBOL_GPL(synchronize_sched); 1629EXPORT_SYMBOL_GPL(synchronize_sched);
2270 1630
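The restored synchronize_sched() above waits for its own callback: it places a completion on the stack, queues wakeme_after_rcu() via call_rcu_sched(), and blocks in wait_for_completion(). A user-space sketch of the same pattern, assuming POSIX threads and semaphores; a helper thread stands in for the callback machinery and all demo_* names are invented.

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

/* Illustrative only: the "wait for my own callback" pattern used by the
 * older synchronize_sched()/synchronize_rcu_bh(): put a completion on
 * the stack, queue a callback that completes it, then block until it
 * has run.                                                             */
struct demo_synchronize {
	sem_t completion;                  /* like struct rcu_synchronize */
};

static void *demo_callback_thread(void *arg)
{
	struct demo_synchronize *s = arg;

	/* ...a real grace period would elapse here... */
	sem_post(&s->completion);          /* like wakeme_after_rcu() */
	return NULL;
}

static void demo_synchronize(void)
{
	struct demo_synchronize s;         /* lives on this stack frame */
	pthread_t t;

	sem_init(&s.completion, 0, 0);
	pthread_create(&t, NULL, demo_callback_thread, &s);  /* "call_rcu_sched()" */
	sem_wait(&s.completion);           /* wait_for_completion() */
	pthread_join(&t, NULL);
	sem_destroy(&s.completion);
}

int main(void)
{
	demo_synchronize();
	printf("grace period (simulated) complete\n");
	return 0;
}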
@@ -2276,181 +1636,23 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
2276 * read-side critical sections have completed. RCU read-side critical 1636 * read-side critical sections have completed. RCU read-side critical
2277 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 1637 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
2278 * and may be nested. 1638 * and may be nested.
2279 *
2280 * See the description of synchronize_sched() for more detailed information
2281 * on memory ordering guarantees.
2282 */ 1639 */
2283void synchronize_rcu_bh(void) 1640void synchronize_rcu_bh(void)
2284{ 1641{
2285 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 1642 struct rcu_synchronize rcu;
2286 !lock_is_held(&rcu_lock_map) && 1643
2287 !lock_is_held(&rcu_sched_lock_map),
2288 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2289 if (rcu_blocking_is_gp()) 1644 if (rcu_blocking_is_gp())
2290 return; 1645 return;
2291 if (rcu_expedited)
2292 synchronize_rcu_bh_expedited();
2293 else
2294 wait_rcu_gp(call_rcu_bh);
2295}
2296EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2297 1646
2298static int synchronize_sched_expedited_cpu_stop(void *data) 1647 init_rcu_head_on_stack(&rcu.head);
2299{ 1648 init_completion(&rcu.completion);
2300 /* 1649 /* Will wake me after RCU finished. */
2301 * There must be a full memory barrier on each affected CPU 1650 call_rcu_bh(&rcu.head, wakeme_after_rcu);
2302 * between the time that try_stop_cpus() is called and the 1651 /* Wait for it. */
2303 * time that it returns. 1652 wait_for_completion(&rcu.completion);
2304 * 1653 destroy_rcu_head_on_stack(&rcu.head);
2305 * In the current initial implementation of cpu_stop, the
2306 * above condition is already met when the control reaches
2307 * this point and the following smp_mb() is not strictly
2308 * necessary. Do smp_mb() anyway for documentation and
2309 * robustness against future implementation changes.
2310 */
2311 smp_mb(); /* See above comment block. */
2312 return 0;
2313} 1654}
2314 1655EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2315/**
2316 * synchronize_sched_expedited - Brute-force RCU-sched grace period
2317 *
2318 * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
2319 * approach to force the grace period to end quickly. This consumes
2320 * significant time on all CPUs and is unfriendly to real-time workloads,
2321 * so is thus not recommended for any sort of common-case code. In fact,
2322 * if you are using synchronize_sched_expedited() in a loop, please
2323 * restructure your code to batch your updates, and then use a single
2324 * synchronize_sched() instead.
2325 *
2326 * Note that it is illegal to call this function while holding any lock
2327 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
2328 * to call this function from a CPU-hotplug notifier. Failing to observe
2329 * these restriction will result in deadlock.
2330 *
2331 * This implementation can be thought of as an application of ticket
2332 * locking to RCU, with sync_sched_expedited_started and
2333 * sync_sched_expedited_done taking on the roles of the halves
2334 * of the ticket-lock word. Each task atomically increments
2335 * sync_sched_expedited_started upon entry, snapshotting the old value,
2336 * then attempts to stop all the CPUs. If this succeeds, then each
2337 * CPU will have executed a context switch, resulting in an RCU-sched
2338 * grace period. We are then done, so we use atomic_cmpxchg() to
2339 * update sync_sched_expedited_done to match our snapshot -- but
2340 * only if someone else has not already advanced past our snapshot.
2341 *
2342 * On the other hand, if try_stop_cpus() fails, we check the value
2343 * of sync_sched_expedited_done. If it has advanced past our
2344 * initial snapshot, then someone else must have forced a grace period
2345 * some time after we took our snapshot. In this case, our work is
2346 * done for us, and we can simply return. Otherwise, we try again,
2347 * but keep our initial snapshot for purposes of checking for someone
2348 * doing our work for us.
2349 *
2350 * If we fail too many times in a row, we fall back to synchronize_sched().
2351 */
2352void synchronize_sched_expedited(void)
2353{
2354 long firstsnap, s, snap;
2355 int trycount = 0;
2356 struct rcu_state *rsp = &rcu_sched_state;
2357
2358 /*
2359 * If we are in danger of counter wrap, just do synchronize_sched().
2360 * By allowing sync_sched_expedited_started to advance no more than
2361 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
2362 * that more than 3.5 billion CPUs would be required to force a
2363 * counter wrap on a 32-bit system. Quite a few more CPUs would of
2364 * course be required on a 64-bit system.
2365 */
2366 if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
2367 (ulong)atomic_long_read(&rsp->expedited_done) +
2368 ULONG_MAX / 8)) {
2369 synchronize_sched();
2370 atomic_long_inc(&rsp->expedited_wrap);
2371 return;
2372 }
2373
2374 /*
2375 * Take a ticket. Note that atomic_inc_return() implies a
2376 * full memory barrier.
2377 */
2378 snap = atomic_long_inc_return(&rsp->expedited_start);
2379 firstsnap = snap;
2380 get_online_cpus();
2381 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2382
2383 /*
2384 * Each pass through the following loop attempts to force a
2385 * context switch on each CPU.
2386 */
2387 while (try_stop_cpus(cpu_online_mask,
2388 synchronize_sched_expedited_cpu_stop,
2389 NULL) == -EAGAIN) {
2390 put_online_cpus();
2391 atomic_long_inc(&rsp->expedited_tryfail);
2392
2393 /* Check to see if someone else did our work for us. */
2394 s = atomic_long_read(&rsp->expedited_done);
2395 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2396 /* ensure test happens before caller kfree */
2397 smp_mb__before_atomic_inc(); /* ^^^ */
2398 atomic_long_inc(&rsp->expedited_workdone1);
2399 return;
2400 }
2401
2402 /* No joy, try again later. Or just synchronize_sched(). */
2403 if (trycount++ < 10) {
2404 udelay(trycount * num_online_cpus());
2405 } else {
2406 wait_rcu_gp(call_rcu_sched);
2407 atomic_long_inc(&rsp->expedited_normal);
2408 return;
2409 }
2410
2411 /* Recheck to see if someone else did our work for us. */
2412 s = atomic_long_read(&rsp->expedited_done);
2413 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2414 /* ensure test happens before caller kfree */
2415 smp_mb__before_atomic_inc(); /* ^^^ */
2416 atomic_long_inc(&rsp->expedited_workdone2);
2417 return;
2418 }
2419
2420 /*
2421 * Refetching sync_sched_expedited_started allows later
2422 * callers to piggyback on our grace period. We retry
2423 * after they started, so our grace period works for them,
2424 * and they started after our first try, so their grace
2425 * period works for us.
2426 */
2427 get_online_cpus();
2428 snap = atomic_long_read(&rsp->expedited_start);
2429 smp_mb(); /* ensure read is before try_stop_cpus(). */
2430 }
2431 atomic_long_inc(&rsp->expedited_stoppedcpus);
2432
2433 /*
2434 * Everyone up to our most recent fetch is covered by our grace
2435 * period. Update the counter, but only if our work is still
2436 * relevant -- which it won't be if someone who started later
2437 * than we did already did their update.
2438 */
2439 do {
2440 atomic_long_inc(&rsp->expedited_done_tries);
2441 s = atomic_long_read(&rsp->expedited_done);
2442 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2443 /* ensure test happens before caller kfree */
2444 smp_mb__before_atomic_inc(); /* ^^^ */
2445 atomic_long_inc(&rsp->expedited_done_lost);
2446 break;
2447 }
2448 } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
2449 atomic_long_inc(&rsp->expedited_done_exit);
2450
2451 put_online_cpus();
2452}
2453EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
2454 1656
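The deleted synchronize_sched_expedited() comment above describes a ticket-lock-like scheme built from a "started" and a "done" counter. The sketch below reproduces just that counter logic with C11 atomics; demo_try_stop_cpus() is a stand-in that fails twice and then succeeds, and all demo_* names are invented.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: the started/done ticket scheme from the comment
 * above.  Real code races against other callers; this single-threaded
 * sketch only shows the bookkeeping.                                   */
static atomic_long demo_started;
static atomic_long demo_done;

static bool demo_try_stop_cpus(void)
{
	static int attempts;
	return ++attempts >= 3;            /* pretend the third attempt succeeds */
}

static void demo_expedited(void)
{
	long firstsnap, snap, s;

	snap = atomic_fetch_add(&demo_started, 1) + 1;   /* take a ticket */
	firstsnap = snap;

	while (!demo_try_stop_cpus()) {
		/* Did a later caller's grace period already cover our ticket? */
		if (atomic_load(&demo_done) >= firstsnap) {
			printf("someone else did our work (done=%ld)\n",
			       atomic_load(&demo_done));
			return;
		}
		/* Refresh the ticket so callers that arrived during our retry
		 * can piggyback on the grace period we are about to force.    */
		snap = atomic_load(&demo_started);
	}

	/* We forced the "grace period": advance the done counter to our
	 * snapshot, unless someone already pushed it further along.        */
	s = atomic_load(&demo_done);
	while (s < snap &&
	       !atomic_compare_exchange_weak(&demo_done, &s, snap))
		;                          /* s is reloaded by a failed CAS */
	printf("expedited grace period done, counter now %ld\n",
	       atomic_load(&demo_done));
}

int main(void)
{
	demo_expedited();
	return 0;
}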
2455/* 1657/*
2456 * Check to see if there is any immediate RCU-related work to be done 1658 * Check to see if there is any immediate RCU-related work to be done
@@ -2469,10 +1671,19 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2469 check_cpu_stall(rsp, rdp); 1671 check_cpu_stall(rsp, rdp);
2470 1672
2471 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1673 /* Is the RCU core waiting for a quiescent state from this CPU? */
2472 if (rcu_scheduler_fully_active && 1674 if (rdp->qs_pending && !rdp->passed_quiesc) {
2473 rdp->qs_pending && !rdp->passed_quiesce) { 1675
1676 /*
1677 * If force_quiescent_state() coming soon and this CPU
1678 * needs a quiescent state, and this is either RCU-sched
1679 * or RCU-bh, force a local reschedule.
1680 */
2474 rdp->n_rp_qs_pending++; 1681 rdp->n_rp_qs_pending++;
2475 } else if (rdp->qs_pending && rdp->passed_quiesce) { 1682 if (!rdp->preemptible &&
1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1684 jiffies))
1685 set_need_resched();
1686 } else if (rdp->qs_pending && rdp->passed_quiesc) {
2476 rdp->n_rp_report_qs++; 1687 rdp->n_rp_report_qs++;
2477 return 1; 1688 return 1;
2478 } 1689 }
@@ -2501,6 +1712,13 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2501 return 1; 1712 return 1;
2502 } 1713 }
2503 1714
1715 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1716 if (rcu_gp_in_progress(rsp) &&
1717 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1718 rdp->n_rp_need_fqs++;
1719 return 1;
1720 }
1721
2504 /* nothing to do */ 1722 /* nothing to do */
2505 rdp->n_rp_need_nothing++; 1723 rdp->n_rp_need_nothing++;
2506 return 0; 1724 return 0;
@@ -2513,12 +1731,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2513 */ 1731 */
2514static int rcu_pending(int cpu) 1732static int rcu_pending(int cpu)
2515{ 1733{
2516 struct rcu_state *rsp; 1734 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
2517 1735 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
2518 for_each_rcu_flavor(rsp) 1736 rcu_preempt_pending(cpu);
2519 if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
2520 return 1;
2521 return 0;
2522} 1737}
2523 1738
2524/* 1739/*
@@ -2526,43 +1741,23 @@ static int rcu_pending(int cpu)
2526 * by the current CPU, even if none need be done immediately, returning 1741 * by the current CPU, even if none need be done immediately, returning
2527 * 1 if so. 1742 * 1 if so.
2528 */ 1743 */
2529static int rcu_cpu_has_callbacks(int cpu) 1744static int rcu_needs_cpu_quick_check(int cpu)
2530{ 1745{
2531 struct rcu_state *rsp;
2532
2533 /* RCU callbacks either ready or pending? */ 1746 /* RCU callbacks either ready or pending? */
2534 for_each_rcu_flavor(rsp) 1747 return per_cpu(rcu_sched_data, cpu).nxtlist ||
2535 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) 1748 per_cpu(rcu_bh_data, cpu).nxtlist ||
2536 return 1; 1749 rcu_preempt_needs_cpu(cpu);
2537 return 0;
2538} 1750}
2539 1751
2540/* 1752static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
2541 * Helper function for _rcu_barrier() tracing. If tracing is disabled, 1753static atomic_t rcu_barrier_cpu_count;
2542 * the compiler is expected to optimize this away. 1754static DEFINE_MUTEX(rcu_barrier_mutex);
2543 */ 1755static struct completion rcu_barrier_completion;
2544static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
2545 int cpu, unsigned long done)
2546{
2547 trace_rcu_barrier(rsp->name, s, cpu,
2548 atomic_read(&rsp->barrier_cpu_count), done);
2549}
2550 1756
2551/* 1757static void rcu_barrier_callback(struct rcu_head *notused)
2552 * RCU callback function for _rcu_barrier(). If we are last, wake
2553 * up the task executing _rcu_barrier().
2554 */
2555static void rcu_barrier_callback(struct rcu_head *rhp)
2556{ 1758{
2557 struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); 1759 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
2558 struct rcu_state *rsp = rdp->rsp; 1760 complete(&rcu_barrier_completion);
2559
2560 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
2561 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
2562 complete(&rsp->barrier_completion);
2563 } else {
2564 _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
2565 }
2566} 1761}
2567 1762
2568/* 1763/*
@@ -2570,116 +1765,45 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
2570 */ 1765 */
2571static void rcu_barrier_func(void *type) 1766static void rcu_barrier_func(void *type)
2572{ 1767{
2573 struct rcu_state *rsp = type; 1768 int cpu = smp_processor_id();
2574 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1769 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
1770 void (*call_rcu_func)(struct rcu_head *head,
1771 void (*func)(struct rcu_head *head));
2575 1772
2576 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); 1773 atomic_inc(&rcu_barrier_cpu_count);
2577 atomic_inc(&rsp->barrier_cpu_count); 1774 call_rcu_func = type;
2578 rsp->call(&rdp->barrier_head, rcu_barrier_callback); 1775 call_rcu_func(head, rcu_barrier_callback);
2579} 1776}
2580 1777
2581/* 1778/*
2582 * Orchestrate the specified type of RCU barrier, waiting for all 1779 * Orchestrate the specified type of RCU barrier, waiting for all
2583 * RCU callbacks of the specified type to complete. 1780 * RCU callbacks of the specified type to complete.
2584 */ 1781 */
2585static void _rcu_barrier(struct rcu_state *rsp) 1782static void _rcu_barrier(struct rcu_state *rsp,
1783 void (*call_rcu_func)(struct rcu_head *head,
1784 void (*func)(struct rcu_head *head)))
2586{ 1785{
2587 int cpu; 1786 BUG_ON(in_interrupt());
2588 struct rcu_data *rdp;
2589 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
2590 unsigned long snap_done;
2591
2592 _rcu_barrier_trace(rsp, "Begin", -1, snap);
2593
2594 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 1787 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2595 mutex_lock(&rsp->barrier_mutex); 1788 mutex_lock(&rcu_barrier_mutex);
2596 1789 init_completion(&rcu_barrier_completion);
2597 /* 1790 /*
2598 * Ensure that all prior references, including to ->n_barrier_done,
2599 * are ordered before the _rcu_barrier() machinery.
1791 * Initialize rcu_barrier_cpu_count to 1, then invoke
1792 * rcu_barrier_func() on each CPU, so that each CPU also has
1793 * incremented rcu_barrier_cpu_count. Only then is it safe to
1794 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1795 * might complete its grace period before all of the other CPUs
1796 * did their increment, causing this function to return too
1797 * early. Note that on_each_cpu() disables irqs, which prevents
1798 * any CPUs from coming online or going offline until each online
1799 * CPU has queued its RCU-barrier callback.
2600 */ 1800 */
2601 smp_mb(); /* See above block comment. */ 1801 atomic_set(&rcu_barrier_cpu_count, 1);
2602 1802 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
2603 /* 1803 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
2604 * Recheck ->n_barrier_done to see if others did our work for us. 1804 complete(&rcu_barrier_completion);
2605 * This means checking ->n_barrier_done for an even-to-odd-to-even 1805 wait_for_completion(&rcu_barrier_completion);
2606 * transition. The "if" expression below therefore rounds the old 1806 mutex_unlock(&rcu_barrier_mutex);
2607 * value up to the next even number and adds two before comparing.
2608 */
2609 snap_done = ACCESS_ONCE(rsp->n_barrier_done);
2610 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2611 if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
2612 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2613 smp_mb(); /* caller's subsequent code after above check. */
2614 mutex_unlock(&rsp->barrier_mutex);
2615 return;
2616 }
2617
2618 /*
2619 * Increment ->n_barrier_done to avoid duplicate work. Use
2620 * ACCESS_ONCE() to prevent the compiler from speculating
2621 * the increment to precede the early-exit check.
2622 */
2623 ACCESS_ONCE(rsp->n_barrier_done)++;
2624 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
2625 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
2626 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
2627
2628 /*
2629 * Initialize the count to one rather than to zero in order to
2630 * avoid a too-soon return to zero in case of a short grace period
2631 * (or preemption of this task). Exclude CPU-hotplug operations
2632 * to ensure that no offline CPU has callbacks queued.
2633 */
2634 init_completion(&rsp->barrier_completion);
2635 atomic_set(&rsp->barrier_cpu_count, 1);
2636 get_online_cpus();
2637
2638 /*
2639 * Force each CPU with callbacks to register a new callback.
2640 * When that callback is invoked, we will know that all of the
2641 * corresponding CPU's preceding callbacks have been invoked.
2642 */
2643 for_each_possible_cpu(cpu) {
2644 if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
2645 continue;
2646 rdp = per_cpu_ptr(rsp->rda, cpu);
2647 if (is_nocb_cpu(cpu)) {
2648 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2649 rsp->n_barrier_done);
2650 atomic_inc(&rsp->barrier_cpu_count);
2651 __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
2652 rsp, cpu, 0);
2653 } else if (ACCESS_ONCE(rdp->qlen)) {
2654 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2655 rsp->n_barrier_done);
2656 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
2657 } else {
2658 _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
2659 rsp->n_barrier_done);
2660 }
2661 }
2662 put_online_cpus();
2663
2664 /*
2665 * Now that we have an rcu_barrier_callback() callback on each
2666 * CPU, and thus each counted, remove the initial count.
2667 */
2668 if (atomic_dec_and_test(&rsp->barrier_cpu_count))
2669 complete(&rsp->barrier_completion);
2670
2671 /* Increment ->n_barrier_done to prevent duplicate work. */
2672 smp_mb(); /* Keep increment after above mechanism. */
2673 ACCESS_ONCE(rsp->n_barrier_done)++;
2674 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
2675 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
2676 smp_mb(); /* Keep increment before caller's subsequent code. */
2677
2678 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2679 wait_for_completion(&rsp->barrier_completion);
2680
2681 /* Other rcu_barrier() invocations can now safely proceed. */
2682 mutex_unlock(&rsp->barrier_mutex);
2683} 1807}
2684 1808
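The comment inside the restored _rcu_barrier() explains why rcu_barrier_cpu_count starts at 1: the extra reference keeps early callbacks from driving the count to zero before every CPU has registered one. A pthread-based sketch of that counting scheme, with a semaphore standing in for the completion and invented demo_* names:

#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdio.h>

/* Illustrative only: the _rcu_barrier() counting scheme.  The count
 * starts at 1 so it cannot hit zero before every CPU (thread, here)
 * has a registered barrier callback; the orchestrator drops that
 * initial reference last and then waits.                              */
#define DEMO_NCPU 4

static atomic_int demo_count;
static sem_t demo_completion;

static void demo_barrier_callback(void)        /* like rcu_barrier_callback() */
{
	if (atomic_fetch_sub(&demo_count, 1) == 1)
		sem_post(&demo_completion);        /* last one completes the barrier */
}

static void *demo_cpu(void *unused)
{
	(void)unused;
	/* ...this CPU's earlier callbacks would be invoked here... */
	demo_barrier_callback();                   /* then its barrier callback runs */
	return NULL;
}

static void demo_barrier(void)
{
	pthread_t t[DEMO_NCPU];
	int i;

	sem_init(&demo_completion, 0, 0);
	atomic_store(&demo_count, 1);              /* orchestrator's own reference */

	/* on_each_cpu() returns only after every CPU took its reference;
	 * model that by taking all references before the threads start.  */
	for (i = 0; i < DEMO_NCPU; i++)
		atomic_fetch_add(&demo_count, 1);
	for (i = 0; i < DEMO_NCPU; i++)
		pthread_create(&t[i], NULL, demo_cpu, NULL);

	demo_barrier_callback();                   /* drop the initial reference */
	sem_wait(&demo_completion);                /* every callback has run */
	for (i = 0; i < DEMO_NCPU; i++)
		pthread_join(&t[i], NULL);
	sem_destroy(&demo_completion);
}

int main(void)
{
	demo_barrier();
	printf("all previously queued callbacks have been invoked\n");
	return 0;
}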
2685/** 1809/**
@@ -2687,7 +1811,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
2687 */ 1811 */
2688void rcu_barrier_bh(void) 1812void rcu_barrier_bh(void)
2689{ 1813{
2690 _rcu_barrier(&rcu_bh_state); 1814 _rcu_barrier(&rcu_bh_state, call_rcu_bh);
2691} 1815}
2692EXPORT_SYMBOL_GPL(rcu_barrier_bh); 1816EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2693 1817
@@ -2696,7 +1820,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2696 */ 1820 */
2697void rcu_barrier_sched(void) 1821void rcu_barrier_sched(void)
2698{ 1822{
2699 _rcu_barrier(&rcu_sched_state); 1823 _rcu_barrier(&rcu_sched_state, call_rcu_sched);
2700} 1824}
2701EXPORT_SYMBOL_GPL(rcu_barrier_sched); 1825EXPORT_SYMBOL_GPL(rcu_barrier_sched);
2702 1826
@@ -2707,24 +1831,21 @@ static void __init
2707rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) 1831rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2708{ 1832{
2709 unsigned long flags; 1833 unsigned long flags;
1834 int i;
2710 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1835 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2711 struct rcu_node *rnp = rcu_get_root(rsp); 1836 struct rcu_node *rnp = rcu_get_root(rsp);
2712 1837
2713 /* Set up local state, ensuring consistent view of global state. */ 1838 /* Set up local state, ensuring consistent view of global state. */
2714 raw_spin_lock_irqsave(&rnp->lock, flags); 1839 raw_spin_lock_irqsave(&rnp->lock, flags);
2715 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1840 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
2716 init_callback_list(rdp); 1841 rdp->nxtlist = NULL;
2717 rdp->qlen_lazy = 0; 1842 for (i = 0; i < RCU_NEXT_SIZE; i++)
2718 ACCESS_ONCE(rdp->qlen) = 0; 1843 rdp->nxttail[i] = &rdp->nxtlist;
1844 rdp->qlen = 0;
1845#ifdef CONFIG_NO_HZ
2719 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1846 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2720 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 1847#endif /* #ifdef CONFIG_NO_HZ */
2721 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2722#ifdef CONFIG_RCU_USER_QS
2723 WARN_ON_ONCE(rdp->dynticks->in_user);
2724#endif
2725 rdp->cpu = cpu; 1848 rdp->cpu = cpu;
2726 rdp->rsp = rsp;
2727 rcu_boot_init_nocb_percpu_data(rdp);
2728 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1849 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2729} 1850}
2730 1851
@@ -2742,23 +1863,25 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2742 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1863 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2743 struct rcu_node *rnp = rcu_get_root(rsp); 1864 struct rcu_node *rnp = rcu_get_root(rsp);
2744 1865
2745 /* Exclude new grace periods. */
2746 mutex_lock(&rsp->onoff_mutex);
2747
2748 /* Set up local state, ensuring consistent view of global state. */ 1866 /* Set up local state, ensuring consistent view of global state. */
2749 raw_spin_lock_irqsave(&rnp->lock, flags); 1867 raw_spin_lock_irqsave(&rnp->lock, flags);
1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
2750 rdp->beenonline = 1; /* We have now been online. */ 1870 rdp->beenonline = 1; /* We have now been online. */
2751 rdp->preemptible = preemptible; 1871 rdp->preemptible = preemptible;
2752 rdp->qlen_last_fqs_check = 0; 1872 rdp->qlen_last_fqs_check = 0;
2753 rdp->n_force_qs_snap = rsp->n_force_qs; 1873 rdp->n_force_qs_snap = rsp->n_force_qs;
2754 rdp->blimit = blimit; 1874 rdp->blimit = blimit;
2755 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
2756 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2757 atomic_set(&rdp->dynticks->dynticks,
2758 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2759 rcu_prepare_for_idle_init(cpu);
2760 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1875 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2761 1876
1877 /*
1878 * A new grace period might start here. If so, we won't be part
1879 * of it, but that is OK, as we are currently in a quiescent state.
1880 */
1881
1882 /* Exclude any attempts to start a new GP on large systems. */
1883 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1884
2762 /* Add CPU to rcu_node bitmasks. */ 1885 /* Add CPU to rcu_node bitmasks. */
2763 rnp = rdp->mynode; 1886 rnp = rdp->mynode;
2764 mask = rdp->grpmask; 1887 mask = rdp->grpmask;
@@ -2768,32 +1891,22 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2768 rnp->qsmaskinit |= mask; 1891 rnp->qsmaskinit |= mask;
2769 mask = rnp->grpmask; 1892 mask = rnp->grpmask;
2770 if (rnp == rdp->mynode) { 1893 if (rnp == rdp->mynode) {
2771 /* 1894 rdp->gpnum = rnp->completed; /* if GP in progress... */
2772 * If there is a grace period in progress, we will
2773 * set up to wait for it next time we run the
2774 * RCU core code.
2775 */
2776 rdp->gpnum = rnp->completed;
2777 rdp->completed = rnp->completed; 1895 rdp->completed = rnp->completed;
2778 rdp->passed_quiesce = 0; 1896 rdp->passed_quiesc_completed = rnp->completed - 1;
2779 rdp->qs_pending = 0;
2780 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
2781 } 1897 }
2782 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 1898 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2783 rnp = rnp->parent; 1899 rnp = rnp->parent;
2784 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1900 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
2785 local_irq_restore(flags);
2786 1901
2787 mutex_unlock(&rsp->onoff_mutex); 1902 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2788} 1903}
2789 1904
2790static void __cpuinit rcu_prepare_cpu(int cpu) 1905static void __cpuinit rcu_prepare_cpu(int cpu)
2791{ 1906{
2792 struct rcu_state *rsp; 1907 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
2793 1908 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
2794 for_each_rcu_flavor(rsp) 1909 rcu_preempt_init_percpu_data(cpu);
2795 rcu_init_percpu_data(cpu, rsp,
2796 strcmp(rsp->name, "rcu_preempt") == 0);
2797} 1910}
2798 1911
2799/* 1912/*
@@ -2805,10 +1918,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2805 long cpu = (long)hcpu; 1918 long cpu = (long)hcpu;
2806 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2807 struct rcu_node *rnp = rdp->mynode; 1920 struct rcu_node *rnp = rdp->mynode;
2808 struct rcu_state *rsp;
2809 int ret = NOTIFY_OK;
2810 1921
2811 trace_rcu_utilization("Start CPU hotplug");
2812 switch (action) { 1922 switch (action) {
2813 case CPU_UP_PREPARE: 1923 case CPU_UP_PREPARE:
2814 case CPU_UP_PREPARE_FROZEN: 1924 case CPU_UP_PREPARE_FROZEN:
@@ -2817,13 +1927,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2817 break; 1927 break;
2818 case CPU_ONLINE: 1928 case CPU_ONLINE:
2819 case CPU_DOWN_FAILED: 1929 case CPU_DOWN_FAILED:
2820 rcu_boost_kthread_setaffinity(rnp, -1); 1930 rcu_node_kthread_setaffinity(rnp, -1);
1931 rcu_cpu_kthread_setrt(cpu, 1);
2821 break; 1932 break;
2822 case CPU_DOWN_PREPARE: 1933 case CPU_DOWN_PREPARE:
2823 if (nocb_cpu_expendable(cpu)) 1934 rcu_node_kthread_setaffinity(rnp, cpu);
2824 rcu_boost_kthread_setaffinity(rnp, cpu); 1935 rcu_cpu_kthread_setrt(cpu, 0);
2825 else
2826 ret = NOTIFY_BAD;
2827 break; 1936 break;
2828 case CPU_DYING: 1937 case CPU_DYING:
2829 case CPU_DYING_FROZEN: 1938 case CPU_DYING_FROZEN:
@@ -2832,46 +1941,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2832 * touch any data without introducing corruption. We send the 1941 * touch any data without introducing corruption. We send the
2833 * dying CPU's callbacks to an arbitrarily chosen online CPU. 1942 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2834 */ 1943 */
2835 for_each_rcu_flavor(rsp) 1944 rcu_send_cbs_to_online(&rcu_bh_state);
2836 rcu_cleanup_dying_cpu(rsp); 1945 rcu_send_cbs_to_online(&rcu_sched_state);
2837 rcu_cleanup_after_idle(cpu); 1946 rcu_preempt_send_cbs_to_online();
2838 break; 1947 break;
2839 case CPU_DEAD: 1948 case CPU_DEAD:
2840 case CPU_DEAD_FROZEN: 1949 case CPU_DEAD_FROZEN:
2841 case CPU_UP_CANCELED: 1950 case CPU_UP_CANCELED:
2842 case CPU_UP_CANCELED_FROZEN: 1951 case CPU_UP_CANCELED_FROZEN:
2843 for_each_rcu_flavor(rsp) 1952 rcu_offline_cpu(cpu);
2844 rcu_cleanup_dead_cpu(cpu, rsp);
2845 break; 1953 break;
2846 default: 1954 default:
2847 break; 1955 break;
2848 } 1956 }
2849 trace_rcu_utilization("End CPU hotplug"); 1957 return NOTIFY_OK;
2850 return ret;
2851}
2852
2853/*
2854 * Spawn the kthread that handles this RCU flavor's grace periods.
2855 */
2856static int __init rcu_spawn_gp_kthread(void)
2857{
2858 unsigned long flags;
2859 struct rcu_node *rnp;
2860 struct rcu_state *rsp;
2861 struct task_struct *t;
2862
2863 for_each_rcu_flavor(rsp) {
2864 t = kthread_run(rcu_gp_kthread, rsp, rsp->name);
2865 BUG_ON(IS_ERR(t));
2866 rnp = rcu_get_root(rsp);
2867 raw_spin_lock_irqsave(&rnp->lock, flags);
2868 rsp->gp_kthread = t;
2869 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2870 rcu_spawn_nocb_kthreads(rsp);
2871 }
2872 return 0;
2873} 1958}
2874early_initcall(rcu_spawn_gp_kthread);
2875 1959
2876/* 1960/*
2877 * This function is invoked towards the end of the scheduler's initialization 1961 * This function is invoked towards the end of the scheduler's initialization
@@ -2897,9 +1981,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2897{ 1981{
2898 int i; 1982 int i;
2899 1983
2900 for (i = rcu_num_lvls - 1; i > 0; i--) 1984 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
2901 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1985 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2902 rsp->levelspread[0] = rcu_fanout_leaf; 1986 rsp->levelspread[0] = RCU_FANOUT_LEAF;
2903} 1987}
2904#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1988#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2905static void __init rcu_init_levelspread(struct rcu_state *rsp) 1989static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2908,8 +1992,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2908 int cprv; 1992 int cprv;
2909 int i; 1993 int i;
2910 1994
2911 cprv = nr_cpu_ids; 1995 cprv = NR_CPUS;
2912 for (i = rcu_num_lvls - 1; i >= 0; i--) { 1996 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
2913 ccur = rsp->levelcnt[i]; 1997 ccur = rsp->levelcnt[i];
2914 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 1998 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
2915 cprv = ccur; 1999 cprv = ccur;
@@ -2923,14 +2007,10 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2923static void __init rcu_init_one(struct rcu_state *rsp, 2007static void __init rcu_init_one(struct rcu_state *rsp,
2924 struct rcu_data __percpu *rda) 2008 struct rcu_data __percpu *rda)
2925{ 2009{
2926 static char *buf[] = { "rcu_node_0", 2010 static char *buf[] = { "rcu_node_level_0",
2927 "rcu_node_1", 2011 "rcu_node_level_1",
2928 "rcu_node_2", 2012 "rcu_node_level_2",
2929 "rcu_node_3" }; /* Match MAX_RCU_LVLS */ 2013 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
2930 static char *fqs[] = { "rcu_node_fqs_0",
2931 "rcu_node_fqs_1",
2932 "rcu_node_fqs_2",
2933 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
2934 int cpustride = 1; 2014 int cpustride = 1;
2935 int i; 2015 int i;
2936 int j; 2016 int j;
@@ -2940,26 +2020,20 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2940 2020
2941 /* Initialize the level-tracking arrays. */ 2021 /* Initialize the level-tracking arrays. */
2942 2022
2943 for (i = 0; i < rcu_num_lvls; i++) 2023 for (i = 1; i < NUM_RCU_LVLS; i++)
2944 rsp->levelcnt[i] = num_rcu_lvl[i];
2945 for (i = 1; i < rcu_num_lvls; i++)
2946 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 2024 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
2947 rcu_init_levelspread(rsp); 2025 rcu_init_levelspread(rsp);
2948 2026
2949 /* Initialize the elements themselves, starting from the leaves. */ 2027 /* Initialize the elements themselves, starting from the leaves. */
2950 2028
2951 for (i = rcu_num_lvls - 1; i >= 0; i--) { 2029 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
2952 cpustride *= rsp->levelspread[i]; 2030 cpustride *= rsp->levelspread[i];
2953 rnp = rsp->level[i]; 2031 rnp = rsp->level[i];
2954 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 2032 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
2955 raw_spin_lock_init(&rnp->lock); 2033 raw_spin_lock_init(&rnp->lock);
2956 lockdep_set_class_and_name(&rnp->lock, 2034 lockdep_set_class_and_name(&rnp->lock,
2957 &rcu_node_class[i], buf[i]); 2035 &rcu_node_class[i], buf[i]);
2958 raw_spin_lock_init(&rnp->fqslock); 2036 rnp->gpnum = 0;
2959 lockdep_set_class_and_name(&rnp->fqslock,
2960 &rcu_fqs_class[i], fqs[i]);
2961 rnp->gpnum = rsp->gpnum;
2962 rnp->completed = rsp->completed;
2963 rnp->qsmask = 0; 2037 rnp->qsmask = 0;
2964 rnp->qsmaskinit = 0; 2038 rnp->qsmaskinit = 0;
2965 rnp->grplo = j * cpustride; 2039 rnp->grplo = j * cpustride;
@@ -2982,76 +2056,13 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2982 } 2056 }
2983 2057
2984 rsp->rda = rda; 2058 rsp->rda = rda;
2985 init_waitqueue_head(&rsp->gp_wq); 2059 rnp = rsp->level[NUM_RCU_LVLS - 1];
2986 rnp = rsp->level[rcu_num_lvls - 1];
2987 for_each_possible_cpu(i) { 2060 for_each_possible_cpu(i) {
2988 while (i > rnp->grphi) 2061 while (i > rnp->grphi)
2989 rnp++; 2062 rnp++;
2990 per_cpu_ptr(rsp->rda, i)->mynode = rnp; 2063 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
2991 rcu_boot_init_percpu_data(i, rsp); 2064 rcu_boot_init_percpu_data(i, rsp);
2992 } 2065 }
2993 list_add(&rsp->flavors, &rcu_struct_flavors);
2994}
2995
2996/*
2997 * Compute the rcu_node tree geometry from kernel parameters. This cannot
2998 * replace the definitions in rcutree.h because those are needed to size
2999 * the ->node array in the rcu_state structure.
3000 */
3001static void __init rcu_init_geometry(void)
3002{
3003 int i;
3004 int j;
3005 int n = nr_cpu_ids;
3006 int rcu_capacity[MAX_RCU_LVLS + 1];
3007
3008 /* If the compile-time values are accurate, just leave. */
3009 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
3010 nr_cpu_ids == NR_CPUS)
3011 return;
3012
3013 /*
3014 * Compute number of nodes that can be handled an rcu_node tree
3015 * with the given number of levels. Setting rcu_capacity[0] makes
3016 * some of the arithmetic easier.
3017 */
3018 rcu_capacity[0] = 1;
3019 rcu_capacity[1] = rcu_fanout_leaf;
3020 for (i = 2; i <= MAX_RCU_LVLS; i++)
3021 rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
3022
3023 /*
3024 * The boot-time rcu_fanout_leaf parameter is only permitted
3025 * to increase the leaf-level fanout, not decrease it. Of course,
3026 * the leaf-level fanout cannot exceed the number of bits in
3027 * the rcu_node masks. Finally, the tree must be able to accommodate
3028 * the configured number of CPUs. Complain and fall back to the
3029 * compile-time values if these limits are exceeded.
3030 */
3031 if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
3032 rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
3033 n > rcu_capacity[MAX_RCU_LVLS]) {
3034 WARN_ON(1);
3035 return;
3036 }
3037
3038 /* Calculate the number of rcu_nodes at each level of the tree. */
3039 for (i = 1; i <= MAX_RCU_LVLS; i++)
3040 if (n <= rcu_capacity[i]) {
3041 for (j = 0; j <= i; j++)
3042 num_rcu_lvl[j] =
3043 DIV_ROUND_UP(n, rcu_capacity[i - j]);
3044 rcu_num_lvls = i;
3045 for (j = i + 1; j <= MAX_RCU_LVLS; j++)
3046 num_rcu_lvl[j] = 0;
3047 break;
3048 }
3049
3050 /* Calculate the total number of rcu_node structures. */
3051 rcu_num_nodes = 0;
3052 for (i = 0; i <= MAX_RCU_LVLS; i++)
3053 rcu_num_nodes += num_rcu_lvl[i];
3054 rcu_num_nodes -= n;
3055} 2066}
3056 2067
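The removed rcu_init_geometry() sizes the rcu_node tree at boot from nr_cpu_ids, the leaf fanout, and CONFIG_RCU_FANOUT: it first computes how many CPUs a tree of each depth can cover, then takes ceilings level by level. A stand-alone sketch of the same arithmetic, with parameters hard-coded for illustration and demo_* names invented:

#include <stdio.h>

#define DEMO_MAX_LVLS 4

/* Illustrative only: size an rcu_node-style tree for n CPUs, given the
 * leaf fanout and the interior fanout, roughly as rcu_init_geometry()
 * does (the kernel's extra leaf-CPU pseudo-level is omitted here).     */
static void demo_geometry(int n, int fanout_leaf, int fanout)
{
	long capacity[DEMO_MAX_LVLS + 1];
	int levels = 0, nodes = 0;
	int i, j;

	/* capacity[i]: how many CPUs an i-level tree can cover */
	capacity[0] = 1;
	capacity[1] = fanout_leaf;
	for (i = 2; i <= DEMO_MAX_LVLS; i++)
		capacity[i] = capacity[i - 1] * fanout;

	for (i = 1; i <= DEMO_MAX_LVLS; i++) {
		if (n > capacity[i])
			continue;              /* need more levels */
		levels = i;
		for (j = 0; j < i; j++) {
			/* nodes at depth j = ceil(n / capacity[i - j]) */
			int lvl = (int)((n + capacity[i - j] - 1) / capacity[i - j]);
			printf("level %d: %d node(s)\n", j, lvl);
			nodes += lvl;
		}
		break;
	}
	printf("%d CPUs, fanout %d/%d -> %d level(s), %d rcu_node(s)\n",
	       n, fanout_leaf, fanout, levels, nodes);
}

int main(void)
{
	demo_geometry(64, 16, 16);     /* e.g. one root plus four leaves = 5 nodes */
	demo_geometry(4096, 16, 64);
	return 0;
}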
3057void __init rcu_init(void) 2068void __init rcu_init(void)
@@ -3059,11 +2070,9 @@ void __init rcu_init(void)
3059 int cpu; 2070 int cpu;
3060 2071
3061 rcu_bootup_announce(); 2072 rcu_bootup_announce();
3062 rcu_init_geometry();
3063 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2073 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3064 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2074 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3065 __rcu_init_preempt(); 2075 __rcu_init_preempt();
3066 rcu_init_nocb();
3067 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 2076 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3068 2077
3069 /* 2078 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4b69291b093..01b2ccda26f 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -29,41 +29,45 @@
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30 30
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * CONFIG_RCU_FANOUT_LEAF.
34 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
35 * In practice, this did work well going from three levels to four. 34 * In practice, this did work well going from three levels to four.
36 * Of course, your mileage may vary. 35 * Of course, your mileage may vary.
37 */ 36 */
38#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
39#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_LEAF 16
40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
40#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) 44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
41#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) 45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
43 47
44#if NR_CPUS <= RCU_FANOUT_1 48#if NR_CPUS <= RCU_FANOUT_1
45# define RCU_NUM_LVLS 1 49# define NUM_RCU_LVLS 1
46# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
47# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
48# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
49# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
50# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
51#elif NR_CPUS <= RCU_FANOUT_2 55#elif NR_CPUS <= RCU_FANOUT_2
52# define RCU_NUM_LVLS 2 56# define NUM_RCU_LVLS 2
53# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
54# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
55# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
56# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
57# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
58#elif NR_CPUS <= RCU_FANOUT_3 62#elif NR_CPUS <= RCU_FANOUT_3
59# define RCU_NUM_LVLS 3 63# define NUM_RCU_LVLS 3
60# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
61# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
62# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
63# define NUM_RCU_LVL_3 (NR_CPUS) 67# define NUM_RCU_LVL_3 (NR_CPUS)
64# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
65#elif NR_CPUS <= RCU_FANOUT_4 69#elif NR_CPUS <= RCU_FANOUT_4
66# define RCU_NUM_LVLS 4 70# define NUM_RCU_LVLS 4
67# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
68# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
69# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
@@ -76,36 +80,13 @@
76#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
77#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
78 82
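As a worked example of the compile-time geometry above, taking the variant that derives RCU_FANOUT_LEAF from CONFIG_RCU_FANOUT: with CONFIG_RCU_FANOUT=16, RCU_FANOUT_LEAF=16, so RCU_FANOUT_1=16 and RCU_FANOUT_2=256. For NR_CPUS=64 the second branch applies, giving NUM_RCU_LVLS=2, NUM_RCU_LVL_0=1, NUM_RCU_LVL_1=DIV_ROUND_UP(64,16)=4 and NUM_RCU_LVL_2=64; hence RCU_SUM=69 and NUM_RCU_NODES=69-64=5, that is, one root plus four leaf rcu_node structures.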
79extern int rcu_num_lvls;
80extern int rcu_num_nodes;
81
82/* 83/*
83 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
84 */ 85 */
85struct rcu_dynticks { 86struct rcu_dynticks {
86 long long dynticks_nesting; /* Track irq/process nesting level. */ 87 int dynticks_nesting; /* Track irq/process nesting level. */
87 /* Process level is worth LLONG_MAX/2. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
89 atomic_t dynticks; /* Even value for idle, else odd. */
90#ifdef CONFIG_RCU_FAST_NO_HZ
91 int dyntick_drain; /* Prepare-for-idle state variable. */
92 unsigned long dyntick_holdoff;
93 /* No retries for the jiffy of failure. */
94 struct timer_list idle_gp_timer;
95 /* Wake up CPU sleeping with callbacks. */
96 unsigned long idle_gp_timer_expires;
97 /* When to wake up CPU (for repost). */
98 bool idle_first_pass; /* First pass of attempt to go idle? */
99 unsigned long nonlazy_posted;
100 /* # times non-lazy CBs posted to CPU. */
101 unsigned long nonlazy_posted_snap;
102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105#ifdef CONFIG_RCU_USER_QS
106 bool ignore_user_qs; /* Treat userspace as extended QS or not */
107 bool in_user; /* Is the CPU in userland from RCU POV? */
108#endif
109}; 90};
110 91
111/* RCU's kthread states for tracing. */ 92/* RCU's kthread states for tracing. */
@@ -200,7 +181,12 @@ struct rcu_node {
200 /* Refused to boost: not sure why, though. */ 181 /* Refused to boost: not sure why, though. */
201 /* This can happen due to race conditions. */ 182 /* This can happen due to race conditions. */
202#endif /* #ifdef CONFIG_RCU_BOOST */ 183#endif /* #ifdef CONFIG_RCU_BOOST */
203 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; 184 struct task_struct *node_kthread_task;
185 /* kthread that takes care of this rcu_node */
186 /* structure, for example, awakening the */
187 /* per-CPU kthreads as needed. */
188 unsigned int node_kthread_status;
189 /* State of node_kthread_task for tracing. */
204} ____cacheline_internodealigned_in_smp; 190} ____cacheline_internodealigned_in_smp;
205 191
206/* 192/*
@@ -209,7 +195,7 @@ struct rcu_node {
209 */ 195 */
210#define rcu_for_each_node_breadth_first(rsp, rnp) \ 196#define rcu_for_each_node_breadth_first(rsp, rnp) \
211 for ((rnp) = &(rsp)->node[0]; \ 197 for ((rnp) = &(rsp)->node[0]; \
212 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) 198 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
213 199
214/* 200/*
215 * Do a breadth-first scan of the non-leaf rcu_node structures for the 201 * Do a breadth-first scan of the non-leaf rcu_node structures for the
@@ -218,7 +204,7 @@ struct rcu_node {
218 */ 204 */
219#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ 205#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
220 for ((rnp) = &(rsp)->node[0]; \ 206 for ((rnp) = &(rsp)->node[0]; \
221 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) 207 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
222 208
223/* 209/*
224 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state 210 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -227,8 +213,8 @@ struct rcu_node {
227 * It is still a leaf node, even if it is also the root node. 213 * It is still a leaf node, even if it is also the root node.
228 */ 214 */
229#define rcu_for_each_leaf_node(rsp, rnp) \ 215#define rcu_for_each_leaf_node(rsp, rnp) \
230 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ 216 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
231 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) 217 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
232 218
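The iteration macros above work because the whole hierarchy lives in a single node[] array laid out level by level, with level[i] pointing at the first node of depth i, so the leaves are simply the tail of the array. A small sketch with a two-level tree and invented demo_* names:

#include <stdio.h>

/* Illustrative only: one flat array holds the whole tree, breadth
 * first, and level[] indexes the start of each depth.  Two levels:
 * one root plus four leaves, hypothetical demo_* names.              */
struct demo_node { int grplo, grphi; };

#define DEMO_LVLS  2
#define DEMO_NODES 5

static struct demo_node node[DEMO_NODES] = {
	{  0, 63 },                                        /* root covers all CPUs */
	{  0, 15 }, { 16, 31 }, { 32, 47 }, { 48, 63 },    /* leaves */
};
static struct demo_node *level[DEMO_LVLS] = { &node[0], &node[1] };

#define demo_for_each_node(rnp) \
	for ((rnp) = &node[0]; (rnp) < &node[DEMO_NODES]; (rnp)++)

#define demo_for_each_leaf_node(rnp) \
	for ((rnp) = level[DEMO_LVLS - 1]; (rnp) < &node[DEMO_NODES]; (rnp)++)

int main(void)
{
	struct demo_node *rnp;

	demo_for_each_node(rnp)            /* breadth-first: root first */
		printf("node covers CPUs %d-%d\n", rnp->grplo, rnp->grphi);
	demo_for_each_leaf_node(rnp)       /* leaves only */
		printf("leaf covers CPUs %d-%d\n", rnp->grplo, rnp->grphi);
	return 0;
}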
233/* Index values for nxttail array in struct rcu_data. */ 219/* Index values for nxttail array in struct rcu_data. */
234#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 220#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
@@ -244,18 +230,14 @@ struct rcu_data {
244 /* in order to detect GP end. */ 230 /* in order to detect GP end. */
245 unsigned long gpnum; /* Highest gp number that this CPU */ 231 unsigned long gpnum; /* Highest gp number that this CPU */
246 /* is aware of having started. */ 232 /* is aware of having started. */
247 bool passed_quiesce; /* User-mode/idle loop etc. */ 233 unsigned long passed_quiesc_completed;
234 /* Value of completed at time of qs. */
235 bool passed_quiesc; /* User-mode/idle loop etc. */
248 bool qs_pending; /* Core waits for quiesc state. */ 236 bool qs_pending; /* Core waits for quiesc state. */
249 bool beenonline; /* CPU online at least once. */ 237 bool beenonline; /* CPU online at least once. */
250 bool preemptible; /* Preemptible RCU? */ 238 bool preemptible; /* Preemptible RCU? */
251 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 239 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
252 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 240 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
253#ifdef CONFIG_RCU_CPU_STALL_INFO
254 unsigned long ticks_this_gp; /* The number of scheduling-clock */
255 /* ticks this CPU has handled */
256 /* during and after the last grace */
257 /* period it is aware of. */
258#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
259 241
260 /* 2) batch handling */ 242 /* 2) batch handling */
261 /* 243 /*
@@ -282,25 +264,28 @@ struct rcu_data {
282 */ 264 */
283 struct rcu_head *nxtlist; 265 struct rcu_head *nxtlist;
284 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 266 struct rcu_head **nxttail[RCU_NEXT_SIZE];
285 long qlen_lazy; /* # of lazy queued callbacks */ 267 long qlen; /* # of queued callbacks */
286 long qlen; /* # of queued callbacks, incl lazy */
287 long qlen_last_fqs_check; 268 long qlen_last_fqs_check;
288 /* qlen at last check for QS forcing */ 269 /* qlen at last check for QS forcing */
289 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 270 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
290 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
291 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ 271 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
292 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ 272 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
293 unsigned long n_force_qs_snap; 273 unsigned long n_force_qs_snap;
294 /* did other CPU force QS recently? */ 274 /* did other CPU force QS recently? */
295 long blimit; /* Upper limit on a processed batch */ 275 long blimit; /* Upper limit on a processed batch */
296 276
277#ifdef CONFIG_NO_HZ
297 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
298 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
299 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
281#endif /* #ifdef CONFIG_NO_HZ */
300 282
301 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
284#ifdef CONFIG_NO_HZ
302 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 285 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
286#endif /* #ifdef CONFIG_NO_HZ */
303 unsigned long offline_fqs; /* Kicked due to being offline. */ 287 unsigned long offline_fqs; /* Kicked due to being offline. */
288 unsigned long resched_ipi; /* Sent a resched IPI. */
304 289
305 /* 5) __rcu_pending() statistics. */ 290 /* 5) __rcu_pending() statistics. */
306 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 291 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -310,36 +295,22 @@ struct rcu_data {
310 unsigned long n_rp_cpu_needs_gp; 295 unsigned long n_rp_cpu_needs_gp;
311 unsigned long n_rp_gp_completed; 296 unsigned long n_rp_gp_completed;
312 unsigned long n_rp_gp_started; 297 unsigned long n_rp_gp_started;
298 unsigned long n_rp_need_fqs;
313 unsigned long n_rp_need_nothing; 299 unsigned long n_rp_need_nothing;
314 300
315 /* 6) _rcu_barrier() and OOM callbacks. */
316 struct rcu_head barrier_head;
317#ifdef CONFIG_RCU_FAST_NO_HZ
318 struct rcu_head oom_head;
319#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
320
321 /* 7) Callback offloading. */
322#ifdef CONFIG_RCU_NOCB_CPU
323 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
324 struct rcu_head **nocb_tail;
325 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
326 atomic_long_t nocb_q_count_lazy; /* (approximate). */
327 int nocb_p_count; /* # CBs being invoked by kthread */
328 int nocb_p_count_lazy; /* (approximate). */
329 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
330 struct task_struct *nocb_kthread;
331#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
332
333 int cpu; 301 int cpu;
334 struct rcu_state *rsp;
335}; 302};
336 303
337/* Values for fqs_state field in struct rcu_state. */ 304/* Values for signaled field in struct rcu_state. */
338#define RCU_GP_IDLE 0 /* No grace period in progress. */ 305#define RCU_GP_IDLE 0 /* No grace period in progress. */
339#define RCU_GP_INIT 1 /* Grace period being initialized. */ 306#define RCU_GP_INIT 1 /* Grace period being initialized. */
340#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 307#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
341#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 308#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
309#ifdef CONFIG_NO_HZ
342#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 310#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
311#else /* #ifdef CONFIG_NO_HZ */
312#define RCU_SIGNAL_INIT RCU_FORCE_QS
313#endif /* #else #ifdef CONFIG_NO_HZ */
343 314
344#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 315#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
345 316
@@ -348,6 +319,12 @@ struct rcu_data {
348#else 319#else
349#define RCU_STALL_DELAY_DELTA 0 320#define RCU_STALL_DELAY_DELTA 0
350#endif 321#endif
322
323#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
324 RCU_STALL_DELAY_DELTA)
325 /* for rsp->jiffies_stall */
326#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
327 /* for rsp->jiffies_stall */
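For a rough sense of scale (purely illustrative values: CONFIG_RCU_CPU_STALL_TIMEOUT=60, HZ=1000, RCU_STALL_DELAY_DELTA=0), RCU_SECONDS_TILL_STALL_CHECK evaluates to 60 * 1000 + 0 = 60000 jiffies, i.e. about one minute before the first stall warning, and RCU_SECONDS_TILL_STALL_RECHECK to 3 * 60000 + 30 = 180030 jiffies, roughly three minutes between repeated warnings for the same stall.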
351#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 328#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
352 /* to take at least one */ 329 /* to take at least one */
353 /* scheduling clock irq */ 330 /* scheduling clock irq */
@@ -376,65 +353,32 @@ do { \
376 */ 353 */
377struct rcu_state { 354struct rcu_state {
378 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ 355 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
379 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ 356 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
380 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 357 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
381 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 358 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
382 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ 359 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
383 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
384 void (*func)(struct rcu_head *head));
385#ifdef CONFIG_RCU_NOCB_CPU
386 void (*call_remote)(struct rcu_head *head,
387 void (*func)(struct rcu_head *head));
388 /* call_rcu() flavor, but for */
389 /* placing on remote CPU. */
390#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
391 360
392 /* The following fields are guarded by the root rcu_node's lock. */ 361 /* The following fields are guarded by the root rcu_node's lock. */
393 362
394 u8 fqs_state ____cacheline_internodealigned_in_smp; 363 u8 signaled ____cacheline_internodealigned_in_smp;
395 /* Force QS state. */ 364 /* Force QS state. */
365 u8 fqs_active; /* force_quiescent_state() */
366 /* is running. */
367 u8 fqs_need_gp; /* A CPU was prevented from */
368 /* starting a new grace */
369 /* period because */
370 /* force_quiescent_state() */
371 /* was running. */
396 u8 boost; /* Subject to priority boost. */ 372 u8 boost; /* Subject to priority boost. */
397 unsigned long gpnum; /* Current gp number. */ 373 unsigned long gpnum; /* Current gp number. */
398 unsigned long completed; /* # of last completed gp. */ 374 unsigned long completed; /* # of last completed gp. */
399 struct task_struct *gp_kthread; /* Task for grace periods. */
400 wait_queue_head_t gp_wq; /* Where GP task waits. */
401 int gp_flags; /* Commands for GP task. */
402 375
403 /* End of fields guarded by root rcu_node's lock. */ 376 /* End of fields guarded by root rcu_node's lock. */
404 377
405 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; 378 raw_spinlock_t onofflock; /* exclude on/offline and */
406 /* Protect following fields. */ 379 /* starting new GP. */
407 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 380 raw_spinlock_t fqslock; /* Only one task forcing */
408 /* need a grace period. */ 381 /* quiescent states. */
409 struct rcu_head **orphan_nxttail; /* Tail of above. */
410 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
411 /* are ready to invoke. */
412 struct rcu_head **orphan_donetail; /* Tail of above. */
413 long qlen_lazy; /* Number of lazy callbacks. */
414 long qlen; /* Total number of callbacks. */
415 /* End of fields guarded by orphan_lock. */
416
417 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
418
419 struct mutex barrier_mutex; /* Guards barrier fields. */
420 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
421 struct completion barrier_completion; /* Wake at barrier end. */
422 unsigned long n_barrier_done; /* ++ at start and end of */
423 /* _rcu_barrier(). */
424 /* End of fields guarded by barrier_mutex. */
425
426 atomic_long_t expedited_start; /* Starting ticket. */
427 atomic_long_t expedited_done; /* Done ticket. */
428 atomic_long_t expedited_wrap; /* # near-wrap incidents. */
429 atomic_long_t expedited_tryfail; /* # acquisition failures. */
430 atomic_long_t expedited_workdone1; /* # done by others #1. */
431 atomic_long_t expedited_workdone2; /* # done by others #2. */
432 atomic_long_t expedited_normal; /* # fallbacks to normal. */
433 atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
434 atomic_long_t expedited_done_tries; /* # tries to update _done. */
435 atomic_long_t expedited_done_lost; /* # times beaten to _done. */
436 atomic_long_t expedited_done_exit; /* # times exited _done loop. */
437
438 unsigned long jiffies_force_qs; /* Time at which to invoke */ 382 unsigned long jiffies_force_qs; /* Time at which to invoke */
439 /* force_quiescent_state(). */ 383 /* force_quiescent_state(). */
440 unsigned long n_force_qs; /* Number of calls to */ 384 unsigned long n_force_qs; /* Number of calls to */
@@ -450,19 +394,8 @@ struct rcu_state {
450 unsigned long gp_max; /* Maximum GP duration in */ 394 unsigned long gp_max; /* Maximum GP duration in */
451 /* jiffies. */ 395 /* jiffies. */
452 char *name; /* Name of structure. */ 396 char *name; /* Name of structure. */
453 struct list_head flavors; /* List of RCU flavors. */
454}; 397};
455 398
456/* Values for rcu_state structure's gp_flags field. */
457#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
458#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
459
460extern struct list_head rcu_struct_flavors;
461
462/* Sequence through rcu_state structures for each RCU flavor. */
463#define for_each_rcu_flavor(rsp) \
464 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
465
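As a usage note, the for_each_rcu_flavor() helper above is nothing more than a list_for_each_entry() walk over rcu_struct_flavors, so per-flavor code reads as a plain loop (a minimal sketch; do_something_with() is a hypothetical per-flavor operation):

	struct rcu_state *rsp;

	for_each_rcu_flavor(rsp)
		do_something_with(rsp);	/* e.g. dump statistics for this flavor */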
466/* Return values for rcu_preempt_offline_tasks(). */ 399/* Return values for rcu_preempt_offline_tasks(). */
467 400
468#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ 401#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
@@ -484,13 +417,6 @@ extern struct rcu_state rcu_preempt_state;
484DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 417DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
485#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 418#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
486 419
487#ifdef CONFIG_RCU_BOOST
488DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
489DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
490DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
491DECLARE_PER_CPU(char, rcu_cpu_has_work);
492#endif /* #ifdef CONFIG_RCU_BOOST */
493
494#ifndef RCU_TREE_NONCORE 420#ifndef RCU_TREE_NONCORE
495 421
496/* Forward declarations for rcutree_plugin.h */ 422/* Forward declarations for rcutree_plugin.h */
@@ -501,67 +427,44 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
501#ifdef CONFIG_HOTPLUG_CPU 427#ifdef CONFIG_HOTPLUG_CPU
502static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 428static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
503 unsigned long flags); 429 unsigned long flags);
430static void rcu_stop_cpu_kthread(int cpu);
504#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 431#endif /* #ifdef CONFIG_HOTPLUG_CPU */
505static void rcu_print_detail_task_stall(struct rcu_state *rsp); 432static void rcu_print_detail_task_stall(struct rcu_state *rsp);
506static int rcu_print_task_stall(struct rcu_node *rnp); 433static void rcu_print_task_stall(struct rcu_node *rnp);
434static void rcu_preempt_stall_reset(void);
507static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
508#ifdef CONFIG_HOTPLUG_CPU 436#ifdef CONFIG_HOTPLUG_CPU
509static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 437static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
510 struct rcu_node *rnp, 438 struct rcu_node *rnp,
511 struct rcu_data *rdp); 439 struct rcu_data *rdp);
440static void rcu_preempt_offline_cpu(int cpu);
512#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 441#endif /* #ifdef CONFIG_HOTPLUG_CPU */
513static void rcu_preempt_check_callbacks(int cpu); 442static void rcu_preempt_check_callbacks(int cpu);
443static void rcu_preempt_process_callbacks(void);
514void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 444void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
515#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 445#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
516static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 446static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
517 bool wake);
518#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 447#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
448static int rcu_preempt_pending(int cpu);
449static int rcu_preempt_needs_cpu(int cpu);
450static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
451static void rcu_preempt_send_cbs_to_online(void);
519static void __init __rcu_init_preempt(void); 452static void __init __rcu_init_preempt(void);
453static void rcu_needs_cpu_flush(void);
520static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
521static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
522static void invoke_rcu_callbacks_kthread(void); 456static void invoke_rcu_callbacks_kthread(void);
523static bool rcu_is_callbacks_kthread(void);
524#ifdef CONFIG_RCU_BOOST 457#ifdef CONFIG_RCU_BOOST
525static void rcu_preempt_do_callbacks(void); 458static void rcu_preempt_do_callbacks(void);
459static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
460 cpumask_var_t cm);
526static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 461static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
527 struct rcu_node *rnp); 462 struct rcu_node *rnp,
463 int rnp_index);
464static void invoke_rcu_node_kthread(struct rcu_node *rnp);
465static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
528#endif /* #ifdef CONFIG_RCU_BOOST */ 466#endif /* #ifdef CONFIG_RCU_BOOST */
467static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
529static void __cpuinit rcu_prepare_kthreads(int cpu); 468static void __cpuinit rcu_prepare_kthreads(int cpu);
530static void rcu_prepare_for_idle_init(int cpu);
531static void rcu_cleanup_after_idle(int cpu);
532static void rcu_prepare_for_idle(int cpu);
533static void rcu_idle_count_callbacks_posted(void);
534static void print_cpu_stall_info_begin(void);
535static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
536static void print_cpu_stall_info_end(void);
537static void zero_cpu_stall_ticks(struct rcu_data *rdp);
538static void increment_cpu_stall_ticks(void);
539static bool is_nocb_cpu(int cpu);
540static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
541 bool lazy);
542static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
543 struct rcu_data *rdp);
544static bool nocb_cpu_expendable(int cpu);
545static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
546static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
547static void init_nocb_callback_list(struct rcu_data *rdp);
548static void __init rcu_init_nocb(void);
549 469
550#endif /* #ifndef RCU_TREE_NONCORE */ 470#endif /* #ifndef RCU_TREE_NONCORE */
551
552#ifdef CONFIG_RCU_TRACE
553#ifdef CONFIG_RCU_NOCB_CPU
554/* Sum up queue lengths for tracing. */
555static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
556{
557 *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
558 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
559}
560#else /* #ifdef CONFIG_RCU_NOCB_CPU */
561static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
562{
563 *ql = 0;
564 *qll = 0;
565}
566#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
567#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index f6e5ec2932b..8aafbb80b8b 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,25 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/gfp.h> 28#include <linux/stop_machine.h>
29#include <linux/oom.h>
30#include <linux/smpboot.h>
31
32#define RCU_KTHREAD_PRIO 1
33
34#ifdef CONFIG_RCU_BOOST
35#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
36#else
37#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
38#endif
39
40#ifdef CONFIG_RCU_NOCB_CPU
41static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
42static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
43static bool rcu_nocb_poll; /* Offload kthread are to poll. */
44module_param(rcu_nocb_poll, bool, 0444);
45static char __initdata nocb_buf[NR_CPUS * 5];
46#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
47 29
48/* 30/*
49 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -73,39 +55,20 @@ static void __init rcu_bootup_announce_oddness(void)
73 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
74#endif 56#endif
75#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
76 printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); 58 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
77#endif
78#if defined(CONFIG_RCU_CPU_STALL_INFO)
79 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
80#endif 59#endif
81#if NUM_RCU_LVL_4 != 0 60#if NUM_RCU_LVL_4 != 0
82 printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); 61 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
83#endif 62#endif
84 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
85 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
86 if (nr_cpu_ids != NR_CPUS)
87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU
89 if (have_rcu_nocb_mask) {
90 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
91 cpumask_clear_cpu(0, rcu_nocb_mask);
92 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
93 }
94 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
95 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
96 if (rcu_nocb_poll)
97 pr_info("\tExperimental polled no-CBs CPUs.\n");
98 }
99#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
100} 63}
101 64
102#ifdef CONFIG_TREE_PREEMPT_RCU 65#ifdef CONFIG_TREE_PREEMPT_RCU
103 66
104struct rcu_state rcu_preempt_state = 67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
105 RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
106DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
107static struct rcu_state *rcu_state = &rcu_preempt_state; 69static struct rcu_state *rcu_state = &rcu_preempt_state;
108 70
71static void rcu_read_unlock_special(struct task_struct *t);
109static int rcu_preempted_readers_exp(struct rcu_node *rnp); 72static int rcu_preempted_readers_exp(struct rcu_node *rnp);
110 73
111/* 74/*
@@ -141,7 +104,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
141 */ 104 */
142void rcu_force_quiescent_state(void) 105void rcu_force_quiescent_state(void)
143{ 106{
144 force_quiescent_state(&rcu_preempt_state); 107 force_quiescent_state(&rcu_preempt_state, 0);
145} 108}
146EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 109EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
147 110
@@ -159,9 +122,9 @@ static void rcu_preempt_qs(int cpu)
159{ 122{
160 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 123 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
161 124
162 if (rdp->passed_quiesce == 0) 125 rdp->passed_quiesc_completed = rdp->gpnum - 1;
163 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); 126 barrier();
164 rdp->passed_quiesce = 1; 127 rdp->passed_quiesc = 1;
165 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 128 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
166} 129}
167 130
@@ -227,11 +190,6 @@ static void rcu_preempt_note_context_switch(int cpu)
227 if (rnp->qsmask & rdp->grpmask) 190 if (rnp->qsmask & rdp->grpmask)
228 rnp->gp_tasks = &t->rcu_node_entry; 191 rnp->gp_tasks = &t->rcu_node_entry;
229 } 192 }
230 trace_rcu_preempt_task(rdp->rsp->name,
231 t->pid,
232 (rnp->qsmask & rdp->grpmask)
233 ? rnp->gpnum
234 : rnp->gpnum + 1);
235 raw_spin_unlock_irqrestore(&rnp->lock, flags); 193 raw_spin_unlock_irqrestore(&rnp->lock, flags);
236 } else if (t->rcu_read_lock_nesting < 0 && 194 } else if (t->rcu_read_lock_nesting < 0 &&
237 t->rcu_read_unlock_special) { 195 t->rcu_read_unlock_special) {
@@ -258,6 +216,18 @@ static void rcu_preempt_note_context_switch(int cpu)
258} 216}
259 217
260/* 218/*
219 * Tree-preemptible RCU implementation for rcu_read_lock().
220 * Just increment ->rcu_read_lock_nesting, shared state will be updated
221 * if we block.
222 */
223void __rcu_read_lock(void)
224{
225 current->rcu_read_lock_nesting++;
226 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
227}
228EXPORT_SYMBOL_GPL(__rcu_read_lock);
229
230/*
261 * Check for preempted RCU readers blocking the current grace period 231 * Check for preempted RCU readers blocking the current grace period
262 * for the specified rcu_node structure. If the caller needs a reliable 232 * for the specified rcu_node structure. If the caller needs a reliable
263 * answer, it must hold the rcu_node's ->lock. 233 * answer, it must hold the rcu_node's ->lock.
@@ -323,16 +293,12 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
323 * notify RCU core processing or task having blocked during the RCU 293 * notify RCU core processing or task having blocked during the RCU
324 * read-side critical section. 294 * read-side critical section.
325 */ 295 */
326void rcu_read_unlock_special(struct task_struct *t) 296static noinline void rcu_read_unlock_special(struct task_struct *t)
327{ 297{
328 int empty; 298 int empty;
329 int empty_exp; 299 int empty_exp;
330 int empty_exp_now;
331 unsigned long flags; 300 unsigned long flags;
332 struct list_head *np; 301 struct list_head *np;
333#ifdef CONFIG_RCU_BOOST
334 struct rt_mutex *rbmp = NULL;
335#endif /* #ifdef CONFIG_RCU_BOOST */
336 struct rcu_node *rnp; 302 struct rcu_node *rnp;
337 int special; 303 int special;
338 304
@@ -378,9 +344,6 @@ void rcu_read_unlock_special(struct task_struct *t)
378 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 344 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
379 np = rcu_next_node_entry(t, rnp); 345 np = rcu_next_node_entry(t, rnp);
380 list_del_init(&t->rcu_node_entry); 346 list_del_init(&t->rcu_node_entry);
381 t->rcu_blocked_node = NULL;
382 trace_rcu_unlock_preempted_task("rcu_preempt",
383 rnp->gpnum, t->pid);
384 if (&t->rcu_node_entry == rnp->gp_tasks) 347 if (&t->rcu_node_entry == rnp->gp_tasks)
385 rnp->gp_tasks = np; 348 rnp->gp_tasks = np;
386 if (&t->rcu_node_entry == rnp->exp_tasks) 349 if (&t->rcu_node_entry == rnp->exp_tasks)
@@ -388,50 +351,75 @@ void rcu_read_unlock_special(struct task_struct *t)
388#ifdef CONFIG_RCU_BOOST 351#ifdef CONFIG_RCU_BOOST
389 if (&t->rcu_node_entry == rnp->boost_tasks) 352 if (&t->rcu_node_entry == rnp->boost_tasks)
390 rnp->boost_tasks = np; 353 rnp->boost_tasks = np;
391 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ 354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
392 if (t->rcu_boost_mutex) { 355 if (t->rcu_boosted) {
393 rbmp = t->rcu_boost_mutex; 356 special |= RCU_READ_UNLOCK_BOOSTED;
394 t->rcu_boost_mutex = NULL; 357 t->rcu_boosted = 0;
395 } 358 }
396#endif /* #ifdef CONFIG_RCU_BOOST */ 359#endif /* #ifdef CONFIG_RCU_BOOST */
360 t->rcu_blocked_node = NULL;
397 361
398 /* 362 /*
399 * If this was the last task on the current list, and if 363 * If this was the last task on the current list, and if
400 * we aren't waiting on any CPUs, report the quiescent state. 364 * we aren't waiting on any CPUs, report the quiescent state.
401 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 365 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
402 * so we must take a snapshot of the expedited state.
403 */ 366 */
404 empty_exp_now = !rcu_preempted_readers_exp(rnp); 367 if (empty)
405 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
406 trace_rcu_quiescent_state_report("preempt_rcu",
407 rnp->gpnum,
408 0, rnp->qsmask,
409 rnp->level,
410 rnp->grplo,
411 rnp->grphi,
412 !!rnp->gp_tasks);
413 rcu_report_unblock_qs_rnp(rnp, flags);
414 } else {
415 raw_spin_unlock_irqrestore(&rnp->lock, flags); 368 raw_spin_unlock_irqrestore(&rnp->lock, flags);
416 } 369 else
370 rcu_report_unblock_qs_rnp(rnp, flags);
417 371
418#ifdef CONFIG_RCU_BOOST 372#ifdef CONFIG_RCU_BOOST
419 /* Unboost if we were boosted. */ 373 /* Unboost if we were boosted. */
420 if (rbmp) 374 if (special & RCU_READ_UNLOCK_BOOSTED) {
421 rt_mutex_unlock(rbmp); 375 rt_mutex_unlock(t->rcu_boost_mutex);
376 t->rcu_boost_mutex = NULL;
377 }
422#endif /* #ifdef CONFIG_RCU_BOOST */ 378#endif /* #ifdef CONFIG_RCU_BOOST */
423 379
424 /* 380 /*
425 * If this was the last task on the expedited lists, 381 * If this was the last task on the expedited lists,
426 * then we need to report up the rcu_node hierarchy. 382 * then we need to report up the rcu_node hierarchy.
427 */ 383 */
428 if (!empty_exp && empty_exp_now) 384 if (!empty_exp && !rcu_preempted_readers_exp(rnp))
429 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); 385 rcu_report_exp_rnp(&rcu_preempt_state, rnp);
430 } else { 386 } else {
431 local_irq_restore(flags); 387 local_irq_restore(flags);
432 } 388 }
433} 389}
434 390
391/*
392 * Tree-preemptible RCU implementation for rcu_read_unlock().
393 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
394 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
395 * invoke rcu_read_unlock_special() to clean up after a context switch
396 * in an RCU read-side critical section and other special cases.
397 */
398void __rcu_read_unlock(void)
399{
400 struct task_struct *t = current;
401
402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
403 if (t->rcu_read_lock_nesting != 1)
404 --t->rcu_read_lock_nesting;
405 else {
406 t->rcu_read_lock_nesting = INT_MIN;
407 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
409 rcu_read_unlock_special(t);
410 barrier(); /* ->rcu_read_unlock_special load before assign */
411 t->rcu_read_lock_nesting = 0;
412 }
413#ifdef CONFIG_PROVE_LOCKING
414 {
415 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
416
417 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
418 }
419#endif /* #ifdef CONFIG_PROVE_LOCKING */
420}
421EXPORT_SYMBOL_GPL(__rcu_read_unlock);
422
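For reference, these two functions are what rcu_read_lock() and rcu_read_unlock() map to under CONFIG_TREE_PREEMPT_RCU, so a typical reader looks roughly like this (a minimal sketch; struct foo, foo_head, and foo_lookup() are made-up names):

#include <linux/rcupdate.h>

struct foo {
	int key;
	int data;
	struct foo __rcu *next;
};

static struct foo __rcu *foo_head;

static int foo_lookup(int key)
{
	struct foo *p;
	int ret = -1;

	rcu_read_lock();		/* bumps ->rcu_read_lock_nesting */
	for (p = rcu_dereference(foo_head); p != NULL;
	     p = rcu_dereference(p->next))
		if (p->key == key) {
			ret = p->data;
			break;
		}
	rcu_read_unlock();		/* outermost unlock may take the special path above */
	return ret;
}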
435#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 423#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
436 424
437/* 425/*
@@ -443,11 +431,9 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
443 unsigned long flags; 431 unsigned long flags;
444 struct task_struct *t; 432 struct task_struct *t;
445 433
446 raw_spin_lock_irqsave(&rnp->lock, flags); 434 if (!rcu_preempt_blocked_readers_cgp(rnp))
447 if (!rcu_preempt_blocked_readers_cgp(rnp)) {
448 raw_spin_unlock_irqrestore(&rnp->lock, flags);
449 return; 435 return;
450 } 436 raw_spin_lock_irqsave(&rnp->lock, flags);
451 t = list_entry(rnp->gp_tasks, 437 t = list_entry(rnp->gp_tasks,
452 struct task_struct, rcu_node_entry); 438 struct task_struct, rcu_node_entry);
453 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 439 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
@@ -476,51 +462,30 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
476 462
477#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ 463#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
478 464
479#ifdef CONFIG_RCU_CPU_STALL_INFO
480
481static void rcu_print_task_stall_begin(struct rcu_node *rnp)
482{
483 printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
484 rnp->level, rnp->grplo, rnp->grphi);
485}
486
487static void rcu_print_task_stall_end(void)
488{
489 printk(KERN_CONT "\n");
490}
491
492#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
493
494static void rcu_print_task_stall_begin(struct rcu_node *rnp)
495{
496}
497
498static void rcu_print_task_stall_end(void)
499{
500}
501
502#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
503
504/* 465/*
505 * Scan the current list of tasks blocked within RCU read-side critical 466 * Scan the current list of tasks blocked within RCU read-side critical
506 * sections, printing out the tid of each. 467 * sections, printing out the tid of each.
507 */ 468 */
508static int rcu_print_task_stall(struct rcu_node *rnp) 469static void rcu_print_task_stall(struct rcu_node *rnp)
509{ 470{
510 struct task_struct *t; 471 struct task_struct *t;
511 int ndetected = 0;
512 472
513 if (!rcu_preempt_blocked_readers_cgp(rnp)) 473 if (!rcu_preempt_blocked_readers_cgp(rnp))
514 return 0; 474 return;
515 rcu_print_task_stall_begin(rnp);
516 t = list_entry(rnp->gp_tasks, 475 t = list_entry(rnp->gp_tasks,
517 struct task_struct, rcu_node_entry); 476 struct task_struct, rcu_node_entry);
518 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 477 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
519 printk(KERN_CONT " P%d", t->pid); 478 printk(" P%d", t->pid);
520 ndetected++; 479}
521 } 480
522 rcu_print_task_stall_end(); 481/*
523 return ndetected; 482 * Suppress preemptible RCU's CPU stall warnings by pushing the
483 * time of the next stall-warning message comfortably far into the
484 * future.
485 */
486static void rcu_preempt_stall_reset(void)
487{
488 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
524} 489}
525 490
526/* 491/*
@@ -584,7 +549,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
584 * absolutely necessary, but this is a good performance/complexity 549 * absolutely necessary, but this is a good performance/complexity
585 * tradeoff. 550 * tradeoff.
586 */ 551 */
587 if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) 552 if (rcu_preempt_blocked_readers_cgp(rnp))
588 retval |= RCU_OFL_TASKS_NORM_GP; 553 retval |= RCU_OFL_TASKS_NORM_GP;
589 if (rcu_preempted_readers_exp(rnp)) 554 if (rcu_preempted_readers_exp(rnp))
590 retval |= RCU_OFL_TASKS_EXP_GP; 555 retval |= RCU_OFL_TASKS_EXP_GP;
@@ -607,26 +572,28 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
607 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 572 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
608 } 573 }
609 574
610 rnp->gp_tasks = NULL;
611 rnp->exp_tasks = NULL;
612#ifdef CONFIG_RCU_BOOST 575#ifdef CONFIG_RCU_BOOST
613 rnp->boost_tasks = NULL; 576 /* In case root is being boosted and leaf is not. */
614 /*
615 * In case root is being boosted and leaf was not. Make sure
616 * that we boost the tasks blocking the current grace period
617 * in this case.
618 */
619 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 577 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
620 if (rnp_root->boost_tasks != NULL && 578 if (rnp_root->boost_tasks != NULL &&
621 rnp_root->boost_tasks != rnp_root->gp_tasks && 579 rnp_root->boost_tasks != rnp_root->gp_tasks)
622 rnp_root->boost_tasks != rnp_root->exp_tasks)
623 rnp_root->boost_tasks = rnp_root->gp_tasks; 580 rnp_root->boost_tasks = rnp_root->gp_tasks;
624 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 581 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
625#endif /* #ifdef CONFIG_RCU_BOOST */ 582#endif /* #ifdef CONFIG_RCU_BOOST */
626 583
584 rnp->gp_tasks = NULL;
585 rnp->exp_tasks = NULL;
627 return retval; 586 return retval;
628} 587}
629 588
589/*
590 * Do CPU-offline processing for preemptible RCU.
591 */
592static void rcu_preempt_offline_cpu(int cpu)
593{
594 __rcu_offline_cpu(cpu, &rcu_preempt_state);
595}
596
630#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 597#endif /* #ifdef CONFIG_HOTPLUG_CPU */
631 598
632/* 599/*
@@ -649,6 +616,15 @@ static void rcu_preempt_check_callbacks(int cpu)
649 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 616 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
650} 617}
651 618
619/*
620 * Process callbacks for preemptible RCU.
621 */
622static void rcu_preempt_process_callbacks(void)
623{
624 __rcu_process_callbacks(&rcu_preempt_state,
625 &__get_cpu_var(rcu_preempt_data));
626}
627
652#ifdef CONFIG_RCU_BOOST 628#ifdef CONFIG_RCU_BOOST
653 629
654static void rcu_preempt_do_callbacks(void) 630static void rcu_preempt_do_callbacks(void)
@@ -663,24 +639,10 @@ static void rcu_preempt_do_callbacks(void)
663 */ 639 */
664void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 640void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
665{ 641{
666 __call_rcu(head, func, &rcu_preempt_state, -1, 0); 642 __call_rcu(head, func, &rcu_preempt_state);
667} 643}
668EXPORT_SYMBOL_GPL(call_rcu); 644EXPORT_SYMBOL_GPL(call_rcu);
669 645
670/*
671 * Queue an RCU callback for lazy invocation after a grace period.
672 * This will likely be later named something like "call_rcu_lazy()",
673 * but this change will require some way of tagging the lazy RCU
674 * callbacks in the list of pending callbacks. Until then, this
675 * function may only be called from __kfree_rcu().
676 */
677void kfree_call_rcu(struct rcu_head *head,
678 void (*func)(struct rcu_head *rcu))
679{
680 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
681}
682EXPORT_SYMBOL_GPL(kfree_call_rcu);
683
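kfree_call_rcu() above is reached through the kfree_rcu() convenience macro (via __kfree_rcu()), and a typical caller pairs it with rcu_assign_pointer() like this (a minimal sketch; struct foo, foo_ptr, foo_lock, and foo_replace() are hypothetical):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	int data;
	struct rcu_head rcu;		/* required by kfree_rcu() */
};

static struct foo __rcu *foo_ptr;
static DEFINE_SPINLOCK(foo_lock);

static void foo_replace(struct foo *newp)
{
	struct foo *oldp;

	spin_lock(&foo_lock);
	oldp = rcu_dereference_protected(foo_ptr, lockdep_is_held(&foo_lock));
	rcu_assign_pointer(foo_ptr, newp);
	spin_unlock(&foo_lock);
	if (oldp)
		kfree_rcu(oldp, rcu);	/* lazily freed after a grace period */
}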
684/** 646/**
685 * synchronize_rcu - wait until a grace period has elapsed. 647 * synchronize_rcu - wait until a grace period has elapsed.
686 * 648 *
@@ -691,27 +653,26 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu);
691 * concurrently with new RCU read-side critical sections that began while 653 * concurrently with new RCU read-side critical sections that began while
692 * synchronize_rcu() was waiting. RCU read-side critical sections are 654 * synchronize_rcu() was waiting. RCU read-side critical sections are
693 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 655 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
694 *
695 * See the description of synchronize_sched() for more detailed information
696 * on memory ordering guarantees.
697 */ 656 */
698void synchronize_rcu(void) 657void synchronize_rcu(void)
699{ 658{
700 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 659 struct rcu_synchronize rcu;
701 !lock_is_held(&rcu_lock_map) && 660
702 !lock_is_held(&rcu_sched_lock_map),
703 "Illegal synchronize_rcu() in RCU read-side critical section");
704 if (!rcu_scheduler_active) 661 if (!rcu_scheduler_active)
705 return; 662 return;
706 if (rcu_expedited) 663
707 synchronize_rcu_expedited(); 664 init_rcu_head_on_stack(&rcu.head);
708 else 665 init_completion(&rcu.completion);
709 wait_rcu_gp(call_rcu); 666 /* Will wake me after RCU finished. */
667 call_rcu(&rcu.head, wakeme_after_rcu);
668 /* Wait for it. */
669 wait_for_completion(&rcu.completion);
670 destroy_rcu_head_on_stack(&rcu.head);
710} 671}
711EXPORT_SYMBOL_GPL(synchronize_rcu); 672EXPORT_SYMBOL_GPL(synchronize_rcu);
712 673
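The classic updater-side use of synchronize_rcu() is unlink, wait, then free (a minimal sketch; the item list, its lock, and item_del() are hypothetical):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
	int key;
	struct list_head list;
};

static LIST_HEAD(item_list);
static DEFINE_SPINLOCK(item_lock);

static void item_del(struct item *p)
{
	spin_lock(&item_lock);
	list_del_rcu(&p->list);		/* unlink; concurrent readers may still see p */
	spin_unlock(&item_lock);
	synchronize_rcu();		/* wait for all pre-existing readers to finish */
	kfree(p);			/* no reader can now hold a reference to p */
}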
713static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); 674static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
714static unsigned long sync_rcu_preempt_exp_count; 675static long sync_rcu_preempt_exp_count;
715static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); 676static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
716 677
717/* 678/*
@@ -748,13 +709,9 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
748 * recursively up the tree. (Calm down, calm down, we do the recursion 709 * recursively up the tree. (Calm down, calm down, we do the recursion
749 * iteratively!) 710 * iteratively!)
750 * 711 *
751 * Most callers will set the "wake" flag, but the task initiating the
752 * expedited grace period need not wake itself.
753 *
754 * Caller must hold sync_rcu_preempt_exp_mutex. 712 * Caller must hold sync_rcu_preempt_exp_mutex.
755 */ 713 */
756static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 714static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
757 bool wake)
758{ 715{
759 unsigned long flags; 716 unsigned long flags;
760 unsigned long mask; 717 unsigned long mask;
@@ -767,8 +724,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
767 } 724 }
768 if (rnp->parent == NULL) { 725 if (rnp->parent == NULL) {
769 raw_spin_unlock_irqrestore(&rnp->lock, flags); 726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
770 if (wake) 727 wake_up(&sync_rcu_preempt_exp_wq);
771 wake_up(&sync_rcu_preempt_exp_wq);
772 break; 728 break;
773 } 729 }
774 mask = rnp->grpmask; 730 mask = rnp->grpmask;
@@ -784,8 +740,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
784 * grace period for the specified rcu_node structure. If there are no such 740 * grace period for the specified rcu_node structure. If there are no such
785 * tasks, report it up the rcu_node hierarchy. 741 * tasks, report it up the rcu_node hierarchy.
786 * 742 *
787 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude 743 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
788 * CPU hotplug operations.
789 */ 744 */
790static void 745static void
791sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 746sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -794,40 +749,28 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
794 int must_wait = 0; 749 int must_wait = 0;
795 750
796 raw_spin_lock_irqsave(&rnp->lock, flags); 751 raw_spin_lock_irqsave(&rnp->lock, flags);
797 if (list_empty(&rnp->blkd_tasks)) { 752 if (list_empty(&rnp->blkd_tasks))
798 raw_spin_unlock_irqrestore(&rnp->lock, flags); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
799 } else { 754 else {
800 rnp->exp_tasks = rnp->blkd_tasks.next; 755 rnp->exp_tasks = rnp->blkd_tasks.next;
801 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 756 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
802 must_wait = 1; 757 must_wait = 1;
803 } 758 }
804 if (!must_wait) 759 if (!must_wait)
805 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ 760 rcu_report_exp_rnp(rsp, rnp);
806} 761}
807 762
808/** 763/*
809 * synchronize_rcu_expedited - Brute-force RCU grace period 764 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
810 * 765 * is to invoke synchronize_sched_expedited() to push all the tasks to
811 * Wait for an RCU-preempt grace period, but expedite it. The basic 766 * the ->blkd_tasks lists and wait for this list to drain.
812 * idea is to invoke synchronize_sched_expedited() to push all the tasks to
813 * the ->blkd_tasks lists and wait for this list to drain. This consumes
814 * significant time on all CPUs and is unfriendly to real-time workloads,
815 * so is thus not recommended for any sort of common-case code.
816 * In fact, if you are using synchronize_rcu_expedited() in a loop,
817 * please restructure your code to batch your updates, and then Use a
818 * single synchronize_rcu() instead.
819 *
820 * Note that it is illegal to call this function while holding any lock
821 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
822 * to call this function from a CPU-hotplug notifier. Failing to observe
823 * these restrictions will result in deadlock.
824 */ 767 */
825void synchronize_rcu_expedited(void) 768void synchronize_rcu_expedited(void)
826{ 769{
827 unsigned long flags; 770 unsigned long flags;
828 struct rcu_node *rnp; 771 struct rcu_node *rnp;
829 struct rcu_state *rsp = &rcu_preempt_state; 772 struct rcu_state *rsp = &rcu_preempt_state;
830 unsigned long snap; 773 long snap;
831 int trycount = 0; 774 int trycount = 0;
832 775
833 smp_mb(); /* Caller's modifications seen first by other CPUs. */ 776 smp_mb(); /* Caller's modifications seen first by other CPUs. */
@@ -835,47 +778,33 @@ void synchronize_rcu_expedited(void)
835 smp_mb(); /* Above access cannot bleed into critical section. */ 778 smp_mb(); /* Above access cannot bleed into critical section. */
836 779
837 /* 780 /*
838 * Block CPU-hotplug operations. This means that any CPU-hotplug
839 * operation that finds an rcu_node structure with tasks in the
840 * process of being boosted will know that all tasks blocking
841 * this expedited grace period will already be in the process of
842 * being boosted. This simplifies the process of moving tasks
843 * from leaf to root rcu_node structures.
844 */
845 get_online_cpus();
846
847 /*
848 * Acquire lock, falling back to synchronize_rcu() if too many 781 * Acquire lock, falling back to synchronize_rcu() if too many
849 * lock-acquisition failures. Of course, if someone does the 782 * lock-acquisition failures. Of course, if someone does the
850 * expedited grace period for us, just leave. 783 * expedited grace period for us, just leave.
851 */ 784 */
852 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { 785 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
853 if (ULONG_CMP_LT(snap, 786 if (trycount++ < 10)
854 ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
855 put_online_cpus();
856 goto mb_ret; /* Others did our work for us. */
857 }
858 if (trycount++ < 10) {
859 udelay(trycount * num_online_cpus()); 787 udelay(trycount * num_online_cpus());
860 } else { 788 else {
861 put_online_cpus(); 789 synchronize_rcu();
862 wait_rcu_gp(call_rcu);
863 return; 790 return;
864 } 791 }
792 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
793 goto mb_ret; /* Others did our work for us. */
865 } 794 }
866 if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { 795 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
867 put_online_cpus();
868 goto unlock_mb_ret; /* Others did our work for us. */ 796 goto unlock_mb_ret; /* Others did our work for us. */
869 }
870 797
871 /* force all RCU readers onto ->blkd_tasks lists. */ 798 /* force all RCU readers onto ->blkd_tasks lists. */
872 synchronize_sched_expedited(); 799 synchronize_sched_expedited();
873 800
801 raw_spin_lock_irqsave(&rsp->onofflock, flags);
802
874 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 803 /* Initialize ->expmask for all non-leaf rcu_node structures. */
875 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 804 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
876 raw_spin_lock_irqsave(&rnp->lock, flags); 805 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
877 rnp->expmask = rnp->qsmaskinit; 806 rnp->expmask = rnp->qsmaskinit;
878 raw_spin_unlock_irqrestore(&rnp->lock, flags); 807 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
879 } 808 }
880 809
881 /* Snapshot current state of ->blkd_tasks lists. */ 810 /* Snapshot current state of ->blkd_tasks lists. */
@@ -884,7 +813,7 @@ void synchronize_rcu_expedited(void)
884 if (NUM_RCU_NODES > 1) 813 if (NUM_RCU_NODES > 1)
885 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 814 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
886 815
887 put_online_cpus(); 816 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
888 817
889 /* Wait for snapshotted ->blkd_tasks lists to drain. */ 818 /* Wait for snapshotted ->blkd_tasks lists to drain. */
890 rnp = rcu_get_root(rsp); 819 rnp = rcu_get_root(rsp);
@@ -901,21 +830,50 @@ mb_ret:
901} 830}
902EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 831EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
903 832
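The mutex_trylock() loop above, with a counter snapshot to detect that somebody else already did the work, is a reusable pattern; stripped of the RCU details it looks roughly like this (a minimal sketch with made-up names; slow_path() and do_work() are hypothetical callbacks):

#include <linux/compiler.h>
#include <linux/delay.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>	/* for ULONG_CMP_LT() */

static DEFINE_MUTEX(expedite_mutex);
static unsigned long expedite_count;	/* incremented after each completed pass */

static void expedited_work(void (*slow_path)(void), void (*do_work)(void))
{
	/* Only a pass that starts after this snapshot is guaranteed to cover us. */
	unsigned long snap = ACCESS_ONCE(expedite_count) + 1;
	int trycount = 0;

	while (!mutex_trylock(&expedite_mutex)) {
		if (ULONG_CMP_LT(snap, ACCESS_ONCE(expedite_count)))
			return;			/* someone else did our work */
		if (trycount++ < 10)
			udelay(trycount);	/* brief back-off, then retry */
		else {
			slow_path();		/* give up on expediting */
			return;
		}
	}
	if (ULONG_CMP_LT(snap, ACCESS_ONCE(expedite_count))) {
		mutex_unlock(&expedite_mutex);
		return;				/* beaten to it while acquiring */
	}
	do_work();
	expedite_count++;
	mutex_unlock(&expedite_mutex);
}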
833/*
834 * Check to see if there is any immediate preemptible-RCU-related work
835 * to be done.
836 */
837static int rcu_preempt_pending(int cpu)
838{
839 return __rcu_pending(&rcu_preempt_state,
840 &per_cpu(rcu_preempt_data, cpu));
841}
842
843/*
844 * Does preemptible RCU need the CPU to stay out of dynticks mode?
845 */
846static int rcu_preempt_needs_cpu(int cpu)
847{
848 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
849}
850
904/** 851/**
905 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 852 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
906 *
907 * Note that this primitive does not necessarily wait for an RCU grace period
908 * to complete. For example, if there are no RCU callbacks queued anywhere
909 * in the system, then rcu_barrier() is within its rights to return
910 * immediately, without waiting for anything, much less an RCU grace period.
911 */ 853 */
912void rcu_barrier(void) 854void rcu_barrier(void)
913{ 855{
914 _rcu_barrier(&rcu_preempt_state); 856 _rcu_barrier(&rcu_preempt_state, call_rcu);
915} 857}
916EXPORT_SYMBOL_GPL(rcu_barrier); 858EXPORT_SYMBOL_GPL(rcu_barrier);
917 859
918/* 860/*
861 * Initialize preemptible RCU's per-CPU data.
862 */
863static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
864{
865 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
866}
867
868/*
869 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
870 */
871static void rcu_preempt_send_cbs_to_online(void)
872{
873 rcu_send_cbs_to_online(&rcu_preempt_state);
874}
875
876/*
919 * Initialize preemptible RCU's state structures. 877 * Initialize preemptible RCU's state structures.
920 */ 878 */
921static void __init __rcu_init_preempt(void) 879static void __init __rcu_init_preempt(void)
@@ -923,6 +881,22 @@ static void __init __rcu_init_preempt(void)
923 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 881 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
924} 882}
925 883
884/*
885 * Check for a task exiting while in a preemptible-RCU read-side
886 * critical section, clean up if so. No need to issue warnings,
887 * as debug_check_no_locks_held() already does this if lockdep
888 * is enabled.
889 */
890void exit_rcu(void)
891{
892 struct task_struct *t = current;
893
894 if (t->rcu_read_lock_nesting == 0)
895 return;
896 t->rcu_read_lock_nesting = 1;
897 __rcu_read_unlock();
898}
899
926#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 900#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
927 901
928static struct rcu_state *rcu_state = &rcu_sched_state; 902static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -994,9 +968,16 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
994 * Because preemptible RCU does not exist, we never have to check for 968 * Because preemptible RCU does not exist, we never have to check for
995 * tasks blocked within RCU read-side critical sections. 969 * tasks blocked within RCU read-side critical sections.
996 */ 970 */
997static int rcu_print_task_stall(struct rcu_node *rnp) 971static void rcu_print_task_stall(struct rcu_node *rnp)
972{
973}
974
975/*
976 * Because preemptible RCU does not exist, there is no need to suppress
977 * its CPU stall warnings.
978 */
979static void rcu_preempt_stall_reset(void)
998{ 980{
999 return 0;
1000} 981}
1001 982
1002/* 983/*
@@ -1024,6 +1005,14 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1024 return 0; 1005 return 0;
1025} 1006}
1026 1007
1008/*
1009 * Because preemptible RCU does not exist, it never needs CPU-offline
1010 * processing.
1011 */
1012static void rcu_preempt_offline_cpu(int cpu)
1013{
1014}
1015
1027#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1016#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1028 1017
1029/* 1018/*
@@ -1035,20 +1024,12 @@ static void rcu_preempt_check_callbacks(int cpu)
1035} 1024}
1036 1025
1037/* 1026/*
1038 * Queue an RCU callback for lazy invocation after a grace period. 1027 * Because preemptible RCU does not exist, it never has any callbacks
1039 * This will likely be later named something like "call_rcu_lazy()", 1028 * to process.
1040 * but this change will require some way of tagging the lazy RCU
1041 * callbacks in the list of pending callbacks. Until then, this
1042 * function may only be called from __kfree_rcu().
1043 *
1044 * Because there is no preemptible RCU, we use RCU-sched instead.
1045 */ 1029 */
1046void kfree_call_rcu(struct rcu_head *head, 1030static void rcu_preempt_process_callbacks(void)
1047 void (*func)(struct rcu_head *rcu))
1048{ 1031{
1049 __call_rcu(head, func, &rcu_sched_state, -1, 1);
1050} 1032}
1051EXPORT_SYMBOL_GPL(kfree_call_rcu);
1052 1033
1053/* 1034/*
1054 * Wait for an rcu-preempt grace period, but make it happen quickly. 1035 * Wait for an rcu-preempt grace period, but make it happen quickly.
@@ -1067,14 +1048,30 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1067 * report on tasks preempted in RCU read-side critical sections during 1048 * report on tasks preempted in RCU read-side critical sections during
1068 * expedited RCU grace periods. 1049 * expedited RCU grace periods.
1069 */ 1050 */
1070static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 1051static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
1071 bool wake)
1072{ 1052{
1053 return;
1073} 1054}
1074 1055
1075#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1056#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1076 1057
1077/* 1058/*
1059 * Because preemptible RCU does not exist, it never has any work to do.
1060 */
1061static int rcu_preempt_pending(int cpu)
1062{
1063 return 0;
1064}
1065
1066/*
1067 * Because preemptible RCU does not exist, it never needs any CPU.
1068 */
1069static int rcu_preempt_needs_cpu(int cpu)
1070{
1071 return 0;
1072}
1073
1074/*
1078 * Because preemptible RCU does not exist, rcu_barrier() is just 1075 * Because preemptible RCU does not exist, rcu_barrier() is just
1079 * another name for rcu_barrier_sched(). 1076 * another name for rcu_barrier_sched().
1080 */ 1077 */
@@ -1085,6 +1082,21 @@ void rcu_barrier(void)
1085EXPORT_SYMBOL_GPL(rcu_barrier); 1082EXPORT_SYMBOL_GPL(rcu_barrier);
1086 1083
1087/* 1084/*
1085 * Because preemptible RCU does not exist, there is no per-CPU
1086 * data to initialize.
1087 */
1088static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1089{
1090}
1091
1092/*
1093 * Because there is no preemptible RCU, there are no callbacks to move.
1094 */
1095static void rcu_preempt_send_cbs_to_online(void)
1096{
1097}
1098
1099/*
1088 * Because preemptible RCU does not exist, it need not be initialized. 1100 * Because preemptible RCU does not exist, it need not be initialized.
1089 */ 1101 */
1090static void __init __rcu_init_preempt(void) 1102static void __init __rcu_init_preempt(void)
@@ -1124,16 +1136,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1124 1136
1125#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1137#endif /* #else #ifdef CONFIG_RCU_TRACE */
1126 1138
1127static void rcu_wake_cond(struct task_struct *t, int status)
1128{
1129 /*
1130 * If the thread is yielding, only wake it when this
1131 * is invoked from idle
1132 */
1133 if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
1134 wake_up_process(t);
1135}
1136
1137/* 1139/*
1138 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1140 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1139 * or ->boost_tasks, advancing the pointer to the next task in the 1141 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1197,12 +1199,23 @@ static int rcu_boost(struct rcu_node *rnp)
1197 t = container_of(tb, struct task_struct, rcu_node_entry); 1199 t = container_of(tb, struct task_struct, rcu_node_entry);
1198 rt_mutex_init_proxy_locked(&mtx, t); 1200 rt_mutex_init_proxy_locked(&mtx, t);
1199 t->rcu_boost_mutex = &mtx; 1201 t->rcu_boost_mutex = &mtx;
1202 t->rcu_boosted = 1;
1200 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1203 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1201 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1202 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1203 1206
1204 return ACCESS_ONCE(rnp->exp_tasks) != NULL || 1207 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1205 ACCESS_ONCE(rnp->boost_tasks) != NULL; 1208}
1209
1210/*
1211 * Timer handler to initiate waking up of boost kthreads that
1212 * have yielded the CPU due to excessive numbers of tasks to
1213 * boost. We wake up the per-rcu_node kthread, which in turn
1214 * will wake up the booster kthread.
1215 */
1216static void rcu_boost_kthread_timer(unsigned long arg)
1217{
1218 invoke_rcu_node_kthread((struct rcu_node *)arg);
1206} 1219}
1207 1220
1208/* 1221/*
@@ -1215,12 +1228,9 @@ static int rcu_boost_kthread(void *arg)
1215 int spincnt = 0; 1228 int spincnt = 0;
1216 int more2boost; 1229 int more2boost;
1217 1230
1218 trace_rcu_utilization("Start boost kthread@init");
1219 for (;;) { 1231 for (;;) {
1220 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1232 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1221 trace_rcu_utilization("End boost kthread@rcu_wait");
1222 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1233 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1223 trace_rcu_utilization("Start boost kthread@rcu_wait");
1224 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1234 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1225 more2boost = rcu_boost(rnp); 1235 more2boost = rcu_boost(rnp);
1226 if (more2boost) 1236 if (more2boost)
@@ -1228,15 +1238,11 @@ static int rcu_boost_kthread(void *arg)
1228 else 1238 else
1229 spincnt = 0; 1239 spincnt = 0;
1230 if (spincnt > 10) { 1240 if (spincnt > 10) {
1231 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; 1241 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1232 trace_rcu_utilization("End boost kthread@rcu_yield");
1233 schedule_timeout_interruptible(2);
1234 trace_rcu_utilization("Start boost kthread@rcu_yield");
1235 spincnt = 0; 1242 spincnt = 0;
1236 } 1243 }
1237 } 1244 }
1238 /* NOTREACHED */ 1245 /* NOTREACHED */
1239 trace_rcu_utilization("End boost kthread@notreached");
1240 return 0; 1246 return 0;
1241} 1247}
1242 1248
@@ -1246,9 +1252,9 @@ static int rcu_boost_kthread(void *arg)
1246 * kthread to start boosting them. If there is an expedited grace 1252 * kthread to start boosting them. If there is an expedited grace
1247 * period in progress, it is always time to boost. 1253 * period in progress, it is always time to boost.
1248 * 1254 *
1249 * The caller must hold rnp->lock, which this function releases. 1255 * The caller must hold rnp->lock, which this function releases,
1250 * The ->boost_kthread_task is immortal, so we don't need to worry 1256 * but irqs remain disabled. The ->boost_kthread_task is immortal,
1251 * about it going away. 1257 * so we don't need to worry about it going away.
1252 */ 1258 */
1253static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1259static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1254{ 1260{
@@ -1268,8 +1274,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1268 rnp->boost_tasks = rnp->gp_tasks; 1274 rnp->boost_tasks = rnp->gp_tasks;
1269 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1275 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1270 t = rnp->boost_kthread_task; 1276 t = rnp->boost_kthread_task;
1271 if (t) 1277 if (t != NULL)
1272 rcu_wake_cond(t, rnp->boost_kthread_status); 1278 wake_up_process(t);
1273 } else { 1279 } else {
1274 rcu_initiate_boost_trace(rnp); 1280 rcu_initiate_boost_trace(rnp);
1275 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1281 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1285,21 +1291,27 @@ static void invoke_rcu_callbacks_kthread(void)
1285 1291
1286 local_irq_save(flags); 1292 local_irq_save(flags);
1287 __this_cpu_write(rcu_cpu_has_work, 1); 1293 __this_cpu_write(rcu_cpu_has_work, 1);
1288 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && 1294 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1289 current != __this_cpu_read(rcu_cpu_kthread_task)) { 1295 local_irq_restore(flags);
1290 rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), 1296 return;
1291 __this_cpu_read(rcu_cpu_kthread_status));
1292 } 1297 }
1298 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1293 local_irq_restore(flags); 1299 local_irq_restore(flags);
1294} 1300}
1295 1301
1296/* 1302/*
1297 * Is the current CPU running the RCU-callbacks kthread? 1303 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1298 * Caller must have preemption disabled. 1304 * held, so no one should be messing with the existence of the boost
1305 * kthread.
1299 */ 1306 */
1300static bool rcu_is_callbacks_kthread(void) 1307static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1308 cpumask_var_t cm)
1301{ 1309{
1302 return __get_cpu_var(rcu_cpu_kthread_task) == current; 1310 struct task_struct *t;
1311
1312 t = rnp->boost_kthread_task;
1313 if (t != NULL)
1314 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1303} 1315}
1304 1316
1305#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1317#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1318,35 +1330,50 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1318 * Returns zero if all is well, a negated errno otherwise. 1330 * Returns zero if all is well, a negated errno otherwise.
1319 */ 1331 */
1320static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1332static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1321 struct rcu_node *rnp) 1333 struct rcu_node *rnp,
1334 int rnp_index)
1322{ 1335{
1323 int rnp_index = rnp - &rsp->node[0];
1324 unsigned long flags; 1336 unsigned long flags;
1325 struct sched_param sp; 1337 struct sched_param sp;
1326 struct task_struct *t; 1338 struct task_struct *t;
1327 1339
1328 if (&rcu_preempt_state != rsp) 1340 if (&rcu_preempt_state != rsp)
1329 return 0; 1341 return 0;
1330
1331 if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
1332 return 0;
1333
1334 rsp->boost = 1; 1342 rsp->boost = 1;
1335 if (rnp->boost_kthread_task != NULL) 1343 if (rnp->boost_kthread_task != NULL)
1336 return 0; 1344 return 0;
1337 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1345 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1338 "rcub/%d", rnp_index); 1346 "rcub%d", rnp_index);
1339 if (IS_ERR(t)) 1347 if (IS_ERR(t))
1340 return PTR_ERR(t); 1348 return PTR_ERR(t);
1341 raw_spin_lock_irqsave(&rnp->lock, flags); 1349 raw_spin_lock_irqsave(&rnp->lock, flags);
1342 rnp->boost_kthread_task = t; 1350 rnp->boost_kthread_task = t;
1343 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1351 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1344 sp.sched_priority = RCU_BOOST_PRIO; 1352 sp.sched_priority = RCU_KTHREAD_PRIO;
1345 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1353 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1346 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1354 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1347 return 0; 1355 return 0;
1348} 1356}
1349 1357
1358#ifdef CONFIG_HOTPLUG_CPU
1359
1360/*
1361 * Stop the RCU's per-CPU kthread when its CPU goes offline.
1362 */
1363static void rcu_stop_cpu_kthread(int cpu)
1364{
1365 struct task_struct *t;
1366
1367 /* Stop the CPU's kthread. */
1368 t = per_cpu(rcu_cpu_kthread_task, cpu);
1369 if (t != NULL) {
1370 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1371 kthread_stop(t);
1372 }
1373}
1374
1375#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1376
1350static void rcu_kthread_do_work(void) 1377static void rcu_kthread_do_work(void)
1351{ 1378{
1352 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1379 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1354,58 +1381,239 @@ static void rcu_kthread_do_work(void)
1354 rcu_preempt_do_callbacks(); 1381 rcu_preempt_do_callbacks();
1355} 1382}
1356 1383
1357static void rcu_cpu_kthread_setup(unsigned int cpu) 1384/*
1385 * Wake up the specified per-rcu_node-structure kthread.
1386 * Because the per-rcu_node kthreads are immortal, we don't need
1387 * to do anything to keep them alive.
1388 */
1389static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1390{
1391 struct task_struct *t;
1392
1393 t = rnp->node_kthread_task;
1394 if (t != NULL)
1395 wake_up_process(t);
1396}
1397
1398/*
1399 * Set the specified CPU's kthread to run RT or not, as specified by
1400 * the to_rt argument. The CPU-hotplug locks are held, so the task
1401 * is not going away.
1402 */
1403static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1358{ 1404{
1405 int policy;
1359 struct sched_param sp; 1406 struct sched_param sp;
1407 struct task_struct *t;
1360 1408
1361 sp.sched_priority = RCU_KTHREAD_PRIO; 1409 t = per_cpu(rcu_cpu_kthread_task, cpu);
1362 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1410 if (t == NULL)
1411 return;
1412 if (to_rt) {
1413 policy = SCHED_FIFO;
1414 sp.sched_priority = RCU_KTHREAD_PRIO;
1415 } else {
1416 policy = SCHED_NORMAL;
1417 sp.sched_priority = 0;
1418 }
1419 sched_setscheduler_nocheck(t, policy, &sp);
1363} 1420}
1364 1421
1365static void rcu_cpu_kthread_park(unsigned int cpu) 1422/*
1423 * Timer handler to initiate the waking up of per-CPU kthreads that
1424 * have yielded the CPU due to excess numbers of RCU callbacks.
1425 * We wake up the per-rcu_node kthread, which in turn will wake up
1426 * the booster kthread.
1427 */
1428static void rcu_cpu_kthread_timer(unsigned long arg)
1366{ 1429{
1367 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; 1430 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1431 struct rcu_node *rnp = rdp->mynode;
1432
1433 atomic_or(rdp->grpmask, &rnp->wakemask);
1434 invoke_rcu_node_kthread(rnp);
1368} 1435}
1369 1436
1370static int rcu_cpu_kthread_should_run(unsigned int cpu) 1437/*
1438 * Drop to non-real-time priority and yield, but only after posting a
1439 * timer that will cause us to regain our real-time priority if we
1440 * remain preempted. Either way, we restore our real-time priority
1441 * before returning.
1442 */
1443static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1371{ 1444{
1372 return __get_cpu_var(rcu_cpu_has_work); 1445 struct sched_param sp;
1446 struct timer_list yield_timer;
1447
1448 setup_timer_on_stack(&yield_timer, f, arg);
1449 mod_timer(&yield_timer, jiffies + 2);
1450 sp.sched_priority = 0;
1451 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452 set_user_nice(current, 19);
1453 schedule();
1454 sp.sched_priority = RCU_KTHREAD_PRIO;
1455 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456 del_timer(&yield_timer);
1457}
1458
1459/*
1460 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1461 * This can happen while the corresponding CPU is either coming online
1462 * or going offline. We cannot wait until the CPU is fully online
1463 * before starting the kthread, because the various notifier functions
1464 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1465 * the corresponding CPU is online.
1466 *
1467 * Return 1 if the kthread needs to stop, 0 otherwise.
1468 *
1469 * Caller must disable bh. This function can momentarily enable it.
1470 */
1471static int rcu_cpu_kthread_should_stop(int cpu)
1472{
1473 while (cpu_is_offline(cpu) ||
1474 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1475 smp_processor_id() != cpu) {
1476 if (kthread_should_stop())
1477 return 1;
1478 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1479 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1480 local_bh_enable();
1481 schedule_timeout_uninterruptible(1);
1482 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1483 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1484 local_bh_disable();
1485 }
1486 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1487 return 0;
1373} 1488}
1374 1489
1375/* 1490/*
1376 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the 1491 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1377 * RCU softirq used in flavors and configurations of RCU that do not 1492 * earlier RCU softirq.
1378 * support RCU priority boosting.
1379 */ 1493 */
1380static void rcu_cpu_kthread(unsigned int cpu) 1494static int rcu_cpu_kthread(void *arg)
1381{ 1495{
1382 unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); 1496 int cpu = (int)(long)arg;
1383 char work, *workp = &__get_cpu_var(rcu_cpu_has_work); 1497 unsigned long flags;
1384 int spincnt; 1498 int spincnt = 0;
1499 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1500 char work;
1501 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1385 1502
1386 for (spincnt = 0; spincnt < 10; spincnt++) { 1503 for (;;) {
1387 trace_rcu_utilization("Start CPU kthread@rcu_wait"); 1504 *statusp = RCU_KTHREAD_WAITING;
1505 rcu_wait(*workp != 0 || kthread_should_stop());
1388 local_bh_disable(); 1506 local_bh_disable();
1507 if (rcu_cpu_kthread_should_stop(cpu)) {
1508 local_bh_enable();
1509 break;
1510 }
1389 *statusp = RCU_KTHREAD_RUNNING; 1511 *statusp = RCU_KTHREAD_RUNNING;
1390 this_cpu_inc(rcu_cpu_kthread_loops); 1512 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1391 local_irq_disable(); 1513 local_irq_save(flags);
1392 work = *workp; 1514 work = *workp;
1393 *workp = 0; 1515 *workp = 0;
1394 local_irq_enable(); 1516 local_irq_restore(flags);
1395 if (work) 1517 if (work)
1396 rcu_kthread_do_work(); 1518 rcu_kthread_do_work();
1397 local_bh_enable(); 1519 local_bh_enable();
1398 if (*workp == 0) { 1520 if (*workp != 0)
1399 trace_rcu_utilization("End CPU kthread@rcu_wait"); 1521 spincnt++;
1400 *statusp = RCU_KTHREAD_WAITING; 1522 else
1401 return; 1523 spincnt = 0;
1524 if (spincnt > 10) {
1525 *statusp = RCU_KTHREAD_YIELDING;
1526 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1527 spincnt = 0;
1402 } 1528 }
1403 } 1529 }
1404 *statusp = RCU_KTHREAD_YIELDING; 1530 *statusp = RCU_KTHREAD_STOPPED;
1405 trace_rcu_utilization("Start CPU kthread@rcu_yield"); 1531 return 0;
1406 schedule_timeout_interruptible(2); 1532}
1407 trace_rcu_utilization("End CPU kthread@rcu_yield"); 1533
1408 *statusp = RCU_KTHREAD_WAITING; 1534/*
1535 * Spawn a per-CPU kthread, setting up affinity and priority.
1536 * Because the CPU hotplug lock is held, no other CPU will be attempting
1537 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1538 * attempting to access it during boot, but the locking in kthread_bind()
1539 * will enforce sufficient ordering.
1540 *
1541 * Please note that we cannot simply refuse to wake up the per-CPU
1542 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1543 * which can result in softlockup complaints if the task ends up being
1544 * idle for more than a couple of minutes.
1545 *
1546 * However, please note also that we cannot bind the per-CPU kthread to its
1547 * CPU until that CPU is fully online. We also cannot wait until the
1548 * CPU is fully online before we create its per-CPU kthread, as this would
1549 * deadlock the system when CPU notifiers tried waiting for grace
1550 * periods. So we bind the per-CPU kthread to its CPU only if the CPU
1551 * is online. If its CPU is not yet fully online, then the code in
1552 * rcu_cpu_kthread() will wait until it is fully online, and then do
1553 * the binding.
1554 */
1555static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1556{
1557 struct sched_param sp;
1558 struct task_struct *t;
1559
1560 if (!rcu_scheduler_fully_active ||
1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562 return 0;
1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1564 if (IS_ERR(t))
1565 return PTR_ERR(t);
1566 if (cpu_online(cpu))
1567 kthread_bind(t, cpu);
1568 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1569 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1570 sp.sched_priority = RCU_KTHREAD_PRIO;
1571 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1572 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1573 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1574 return 0;
1575}
1576
1577/*
1578 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1579 * kthreads when needed. We ignore requests to wake up kthreads
1580 * for offline CPUs, which is OK because force_quiescent_state()
1581 * takes care of this case.
1582 */
1583static int rcu_node_kthread(void *arg)
1584{
1585 int cpu;
1586 unsigned long flags;
1587 unsigned long mask;
1588 struct rcu_node *rnp = (struct rcu_node *)arg;
1589 struct sched_param sp;
1590 struct task_struct *t;
1591
1592 for (;;) {
1593 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1594 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1595 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1596 raw_spin_lock_irqsave(&rnp->lock, flags);
1597 mask = atomic_xchg(&rnp->wakemask, 0);
1598 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1599 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1600 if ((mask & 0x1) == 0)
1601 continue;
1602 preempt_disable();
1603 t = per_cpu(rcu_cpu_kthread_task, cpu);
1604 if (!cpu_online(cpu) || t == NULL) {
1605 preempt_enable();
1606 continue;
1607 }
1608 per_cpu(rcu_cpu_has_work, cpu) = 1;
1609 sp.sched_priority = RCU_KTHREAD_PRIO;
1610 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1611 preempt_enable();
1612 }
1613 }
1614 /* NOTREACHED */
1615 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1616 return 0;
1409} 1617}
1410 1618
1411/* 1619/*
@@ -1417,17 +1625,17 @@ static void rcu_cpu_kthread(unsigned int cpu)
1417 * no outgoing CPU. If there are no CPUs left in the affinity set, 1625 * no outgoing CPU. If there are no CPUs left in the affinity set,
1418 * this function allows the kthread to execute on any CPU. 1626 * this function allows the kthread to execute on any CPU.
1419 */ 1627 */
1420static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1628static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1421{ 1629{
1422 struct task_struct *t = rnp->boost_kthread_task;
1423 unsigned long mask = rnp->qsmaskinit;
1424 cpumask_var_t cm; 1630 cpumask_var_t cm;
1425 int cpu; 1631 int cpu;
1632 unsigned long mask = rnp->qsmaskinit;
1426 1633
1427 if (!t) 1634 if (rnp->node_kthread_task == NULL)
1428 return; 1635 return;
1429 if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) 1636 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1430 return; 1637 return;
1638 cpumask_clear(cm);
1431 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1639 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1432 if ((mask & 0x1) && cpu != outgoingcpu) 1640 if ((mask & 0x1) && cpu != outgoingcpu)
1433 cpumask_set_cpu(cpu, cm); 1641 cpumask_set_cpu(cpu, cm);
@@ -1437,36 +1645,62 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1437 cpumask_clear_cpu(cpu, cm); 1645 cpumask_clear_cpu(cpu, cm);
1438 WARN_ON_ONCE(cpumask_weight(cm) == 0); 1646 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1439 } 1647 }
1440 set_cpus_allowed_ptr(t, cm); 1648 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1649 rcu_boost_kthread_setaffinity(rnp, cm);
1441 free_cpumask_var(cm); 1650 free_cpumask_var(cm);
1442} 1651}
1443 1652
1444static struct smp_hotplug_thread rcu_cpu_thread_spec = { 1653/*
1445 .store = &rcu_cpu_kthread_task, 1654 * Spawn a per-rcu_node kthread, setting priority and affinity.
1446 .thread_should_run = rcu_cpu_kthread_should_run, 1655 * Called during boot before online/offline can happen, or, if
1447 .thread_fn = rcu_cpu_kthread, 1656 * during runtime, with the main CPU-hotplug locks held. So only
1448 .thread_comm = "rcuc/%u", 1657 * one of these can be executing at a time.
1449 .setup = rcu_cpu_kthread_setup, 1658 */
1450 .park = rcu_cpu_kthread_park, 1659static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1451}; 1660 struct rcu_node *rnp)
1661{
1662 unsigned long flags;
1663 int rnp_index = rnp - &rsp->node[0];
1664 struct sched_param sp;
1665 struct task_struct *t;
1666
1667 if (!rcu_scheduler_fully_active ||
1668 rnp->qsmaskinit == 0)
1669 return 0;
1670 if (rnp->node_kthread_task == NULL) {
1671 t = kthread_create(rcu_node_kthread, (void *)rnp,
1672 "rcun%d", rnp_index);
1673 if (IS_ERR(t))
1674 return PTR_ERR(t);
1675 raw_spin_lock_irqsave(&rnp->lock, flags);
1676 rnp->node_kthread_task = t;
1677 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1678 sp.sched_priority = 99;
1679 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1680 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1681 }
1682 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1683}
1452 1684
1453/* 1685/*
1454 * Spawn all kthreads -- called as soon as the scheduler is running. 1686 * Spawn all kthreads -- called as soon as the scheduler is running.
1455 */ 1687 */
1456static int __init rcu_spawn_kthreads(void) 1688static int __init rcu_spawn_kthreads(void)
1457{ 1689{
1458 struct rcu_node *rnp;
1459 int cpu; 1690 int cpu;
1691 struct rcu_node *rnp;
1460 1692
1461 rcu_scheduler_fully_active = 1; 1693 rcu_scheduler_fully_active = 1;
1462 for_each_possible_cpu(cpu) 1694 for_each_possible_cpu(cpu) {
1463 per_cpu(rcu_cpu_has_work, cpu) = 0; 1695 per_cpu(rcu_cpu_has_work, cpu) = 0;
1464 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1696 if (cpu_online(cpu))
1697 (void)rcu_spawn_one_cpu_kthread(cpu);
1698 }
1465 rnp = rcu_get_root(rcu_state); 1699 rnp = rcu_get_root(rcu_state);
1466 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1700 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1467 if (NUM_RCU_NODES > 1) { 1701 if (NUM_RCU_NODES > 1) {
1468 rcu_for_each_leaf_node(rcu_state, rnp) 1702 rcu_for_each_leaf_node(rcu_state, rnp)
1469 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1703 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1470 } 1704 }
1471 return 0; 1705 return 0;
1472} 1706}
@@ -1478,8 +1712,11 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1478 struct rcu_node *rnp = rdp->mynode; 1712 struct rcu_node *rnp = rdp->mynode;
1479 1713
1480 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1714 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1481 if (rcu_scheduler_fully_active) 1715 if (rcu_scheduler_fully_active) {
1482 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1716 (void)rcu_spawn_one_cpu_kthread(cpu);
1717 if (rnp->node_kthread_task == NULL)
1718 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1719 }
1483} 1720}
1484 1721
1485#else /* #ifdef CONFIG_RCU_BOOST */ 1722#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1494,16 +1731,23 @@ static void invoke_rcu_callbacks_kthread(void)
1494 WARN_ON_ONCE(1); 1731 WARN_ON_ONCE(1);
1495} 1732}
1496 1733
1497static bool rcu_is_callbacks_kthread(void) 1734static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1498{ 1735{
1499 return false;
1500} 1736}
1501 1737
1502static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1738#ifdef CONFIG_HOTPLUG_CPU
1739
1740static void rcu_stop_cpu_kthread(int cpu)
1741{
1742}
1743
1744#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1745
1746static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1503{ 1747{
1504} 1748}
1505 1749
1506static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1750static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1507{ 1751{
1508} 1752}
1509 1753
@@ -1520,978 +1764,247 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1520 1764
1521#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1765#endif /* #else #ifdef CONFIG_RCU_BOOST */
1522 1766
1523#if !defined(CONFIG_RCU_FAST_NO_HZ) 1767#ifndef CONFIG_SMP
1524
1525/*
1526 * Check to see if any future RCU-related work will need to be done
1527 * by the current CPU, even if none need be done immediately, returning
1528 * 1 if so. This function is part of the RCU implementation; it is -not-
1529 * an exported member of the RCU API.
1530 *
1531 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1532 * any flavor of RCU.
1533 */
1534int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1535{
1536 *delta_jiffies = ULONG_MAX;
1537 return rcu_cpu_has_callbacks(cpu);
1538}
1539 1768
1540/* 1769void synchronize_sched_expedited(void)
1541 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1542 */
1543static void rcu_prepare_for_idle_init(int cpu)
1544{ 1770{
1771 cond_resched();
1545} 1772}
1773EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1546 1774
1547/* 1775#else /* #ifndef CONFIG_SMP */
1548 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1549 * after it.
1550 */
1551static void rcu_cleanup_after_idle(int cpu)
1552{
1553}
1554 1776
1555/* 1777static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1556 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, 1778static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1557 * is nothing.
1558 */
1559static void rcu_prepare_for_idle(int cpu)
1560{
1561}
1562 1779
1563/* 1780static int synchronize_sched_expedited_cpu_stop(void *data)
1564 * Don't bother keeping a running count of the number of RCU callbacks
1565 * posted because CONFIG_RCU_FAST_NO_HZ=n.
1566 */
1567static void rcu_idle_count_callbacks_posted(void)
1568{ 1781{
1782 /*
1783 * There must be a full memory barrier on each affected CPU
1784 * between the time that try_stop_cpus() is called and the
1785 * time that it returns.
1786 *
1787 * In the current initial implementation of cpu_stop, the
1788 * above condition is already met when the control reaches
1789 * this point and the following smp_mb() is not strictly
1790 * necessary. Do smp_mb() anyway for documentation and
1791 * robustness against future implementation changes.
1792 */
1793 smp_mb(); /* See above comment block. */
1794 return 0;
1569} 1795}
1570 1796
1571#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1572
1573/* 1797/*
1574 * This code is invoked when a CPU goes idle, at which point we want 1798 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1575 * to have the CPU do everything required for RCU so that it can enter 1799 * approach to force grace period to end quickly. This consumes
1576 * the energy-efficient dyntick-idle mode. This is handled by a 1800 * significant time on all CPUs, and is thus not recommended for
1577 * state machine implemented by rcu_prepare_for_idle() below. 1801 * any sort of common-case code.
1578 * 1802 *
1579 * The following four preprocessor symbols control this state machine: 1803 * Note that it is illegal to call this function while holding any
1804 * lock that is acquired by a CPU-hotplug notifier. Failing to
1805 * observe this restriction will result in deadlock.
1580 * 1806 *
1581 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt 1807 * This implementation can be thought of as an application of ticket
1582 * to satisfy RCU. Beyond this point, it is better to incur a periodic 1808 * locking to RCU, with sync_sched_expedited_started and
1583 * scheduling-clock interrupt than to loop through the state machine 1809 * sync_sched_expedited_done taking on the roles of the halves
1584 * at full power. 1810 * of the ticket-lock word. Each task atomically increments
1585 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are 1811 * sync_sched_expedited_started upon entry, snapshotting the old value,
1586 * optional if RCU does not need anything immediately from this 1812 * then attempts to stop all the CPUs. If this succeeds, then each
1587 * CPU, even if this CPU still has RCU callbacks queued. The first 1813 * CPU will have executed a context switch, resulting in an RCU-sched
1588 * times through the state machine are mandatory: we need to give 1814 * grace period. We are then done, so we use atomic_cmpxchg() to
1589 * the state machine a chance to communicate a quiescent state 1815 * update sync_sched_expedited_done to match our snapshot -- but
1590 * to the RCU core. 1816 * only if someone else has not already advanced past our snapshot.
1591 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1592 * to sleep in dyntick-idle mode with RCU callbacks pending. This
1593 * is sized to be roughly one RCU grace period. Those energy-efficiency
1594 * benchmarkers who might otherwise be tempted to set this to a large
1595 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
1596 * system. And if you are -that- concerned about energy efficiency,
1597 * just power the system down and be done with it!
1598 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
1599 * permitted to sleep in dyntick-idle mode with only lazy RCU
1600 * callbacks pending. Setting this too high can OOM your system.
1601 * 1817 *
1602 * The values below work well in practice. If future workloads require 1818 * On the other hand, if try_stop_cpus() fails, we check the value
1603 * adjustment, they can be converted into kernel config parameters, though 1819 * of sync_sched_expedited_done. If it has advanced past our
1604 * making the state machine smarter might be a better option. 1820 * initial snapshot, then someone else must have forced a grace period
1605 */ 1821 * some time after we took our snapshot. In this case, our work is
1606#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ 1822 * done for us, and we can simply return. Otherwise, we try again,
1607#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ 1823 * but keep our initial snapshot for purposes of checking for someone
1608#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ 1824 * doing our work for us.
1609#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1825 *
1610 1826 * If we fail too many times in a row, we fall back to synchronize_sched().
1611extern int tick_nohz_enabled;
1612
1613/*
1614 * Does the specified flavor of RCU have non-lazy callbacks pending on
1615 * the specified CPU? Both RCU flavor and CPU are specified by the
1616 * rcu_data structure.
1617 */ 1827 */
1618static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) 1828void synchronize_sched_expedited(void)
1619{ 1829{
1620 return rdp->qlen != rdp->qlen_lazy; 1830 int firstsnap, s, snap, trycount = 0;
1621}
1622 1831
1623#ifdef CONFIG_TREE_PREEMPT_RCU 1832 /* Note that atomic_inc_return() implies full memory barrier. */
1833 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1834 get_online_cpus();
1624 1835
1625/* 1836 /*
1626 * Are there non-lazy RCU-preempt callbacks? (There cannot be if there 1837 * Each pass through the following loop attempts to force a
1627 * is no RCU-preempt in the kernel.) 1838 * context switch on each CPU.
1628 */ 1839 */
1629static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1840 while (try_stop_cpus(cpu_online_mask,
1630{ 1841 synchronize_sched_expedited_cpu_stop,
1631 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 1842 NULL) == -EAGAIN) {
1843 put_online_cpus();
1632 1844
1633 return __rcu_cpu_has_nonlazy_callbacks(rdp); 1845 /* No joy, try again later. Or just synchronize_sched(). */
1634} 1846 if (trycount++ < 10)
1847 udelay(trycount * num_online_cpus());
1848 else {
1849 synchronize_sched();
1850 return;
1851 }
1635 1852
1636#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1853 /* Check to see if someone else did our work for us. */
1854 s = atomic_read(&sync_sched_expedited_done);
1855 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1856 smp_mb(); /* ensure test happens before caller kfree */
1857 return;
1858 }
1637 1859
1638static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1860 /*
1639{ 1861 * Refetching sync_sched_expedited_started allows later
1640 return 0; 1862 * callers to piggyback on our grace period. We subtract
1641} 1863 * 1 to get the same token that the last incrementer got.
1864 * We retry after they started, so our grace period works
1865 * for them, and they started after our first try, so their
1866 * grace period works for us.
1867 */
1868 get_online_cpus();
1869 snap = atomic_read(&sync_sched_expedited_started) - 1;
1870 smp_mb(); /* ensure read is before try_stop_cpus(). */
1871 }
1642 1872
1643#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1873 /*
1874 * Everyone up to our most recent fetch is covered by our grace
1875 * period. Update the counter, but only if our work is still
1876 * relevant -- which it won't be if someone who started later
1877 * than we did beat us to the punch.
1878 */
1879 do {
1880 s = atomic_read(&sync_sched_expedited_done);
1881 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1882 smp_mb(); /* ensure test happens before caller kfree */
1883 break;
1884 }
1885 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1644 1886
1645/* 1887 put_online_cpus();
1646 * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
1647 */
1648static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
1649{
1650 return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
1651 __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
1652 rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
1653} 1888}
1889EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1654 1890
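The ticket-snapshot scheme described in the block comment above can be illustrated with a standalone sketch. This is only a sketch under stated assumptions, not the kernel code: demo_stop_all_cpus() stands in for try_stop_cpus() over all online CPUs, the demo_ names are invented for illustration, and memory barriers and counter wrap are ignored for clarity.

    #include <linux/atomic.h>

    /*
     * Sketch only: "started" hands out tickets, "done" records the newest
     * ticket whose grace period is known to have completed.
     */
    static atomic_t demo_started = ATOMIC_INIT(0);
    static atomic_t demo_done = ATOMIC_INIT(0);

    static int demo_stop_all_cpus(void);	/* assumed: forces a context switch on every CPU */

    static void demo_expedited(void)
    {
    	int s, snap;

    	snap = atomic_inc_return(&demo_started);	/* take a ticket */
    	while (!demo_stop_all_cpus()) {
    		s = atomic_read(&demo_done);
    		if (s >= snap)
    			return;		/* a later grace period already covered us */
    		snap = atomic_read(&demo_started) - 1;	/* piggyback on later starters */
    	}
    	/* Our forced context switches ended a grace period: publish it. */
    	do {
    		s = atomic_read(&demo_done);
    		if (s >= snap)
    			break;		/* someone advanced past our snapshot */
    	} while (atomic_cmpxchg(&demo_done, s, snap) != s);
    }

The sketch mirrors the structure above: a failed stop attempt first checks whether someone else's grace period already covers this caller before retrying, and the final cmpxchg advances the "done" ticket only if no later caller has already moved it further.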
1655/* 1891#endif /* #else #ifndef CONFIG_SMP */
1656 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1657 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1658 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1659 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
1660 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1661 * it is better to incur scheduling-clock interrupts than to spin
1662 * continuously for the same time duration!
1663 *
1664 * The delta_jiffies argument is used to store the time when RCU is
1665 * going to need the CPU again if it still has callbacks. The reason
1666 * for this is that rcu_prepare_for_idle() might need to post a timer,
1667 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
1668 * the wakeup time for this CPU. This means that RCU's timer can be
1669 * delayed until the wakeup time, which defeats the purpose of posting
1670 * a timer.
1671 */
1672int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1673{
1674 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1675
1676 /* Flag a new idle sojourn to the idle-entry state machine. */
1677 rdtp->idle_first_pass = 1;
1678 /* If no callbacks, RCU doesn't need the CPU. */
1679 if (!rcu_cpu_has_callbacks(cpu)) {
1680 *delta_jiffies = ULONG_MAX;
1681 return 0;
1682 }
1683 if (rdtp->dyntick_holdoff == jiffies) {
1684 /* RCU recently tried and failed, so don't try again. */
1685 *delta_jiffies = 1;
1686 return 1;
1687 }
1688 /* Set up for the possibility that RCU will post a timer. */
1689 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1690 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
1691 RCU_IDLE_GP_DELAY) - jiffies;
1692 } else {
1693 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
1694 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1695 }
1696 return 0;
1697}
1698 1892
1699/* 1893#if !defined(CONFIG_RCU_FAST_NO_HZ)
1700 * Handler for smp_call_function_single(). The only point of this
1701 * handler is to wake the CPU up, so the handler does only tracing.
1702 */
1703void rcu_idle_demigrate(void *unused)
1704{
1705 trace_rcu_prep_idle("Demigrate");
1706}
1707 1894
1708/* 1895/*
1709 * Timer handler used to force CPU to start pushing its remaining RCU 1896 * Check to see if any future RCU-related work will need to be done
1710 * callbacks in the case where it entered dyntick-idle mode with callbacks 1897 * by the current CPU, even if none need be done immediately, returning
1711 * pending. The handler doesn't really need to do anything because the 1898 * 1 if so. This function is part of the RCU implementation; it is -not-
1712 * real work is done upon re-entry to idle, or by the next scheduling-clock 1899 * an exported member of the RCU API.
1713 * interrupt should idle not be re-entered.
1714 * 1900 *
1715 * One special case: the timer gets migrated without awakening the CPU 1901 * Because we have preemptible RCU, just check whether this CPU needs
1716 * on which the timer was scheduled. In this case, we must wake up 1902 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
1717 * that CPU. We do so with smp_call_function_single(). 1903 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1718 */ 1904 */
1719static void rcu_idle_gp_timer_func(unsigned long cpu_in) 1905int rcu_needs_cpu(int cpu)
1720{ 1906{
1721 int cpu = (int)cpu_in; 1907 return rcu_needs_cpu_quick_check(cpu);
1722
1723 trace_rcu_prep_idle("Timer");
1724 if (cpu != smp_processor_id())
1725 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
1726 else
1727 WARN_ON_ONCE(1); /* Getting here can hang the system... */
1728} 1908}
1729 1909
1730/* 1910/*
1731 * Initialize the timer used to pull CPUs out of dyntick-idle mode. 1911 * Check to see if we need to continue a callback-flush operation to
1912 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
1913 * entry is not configured, so we never do need to.
1732 */ 1914 */
1733static void rcu_prepare_for_idle_init(int cpu) 1915static void rcu_needs_cpu_flush(void)
1734{ 1916{
1735 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1736
1737 rdtp->dyntick_holdoff = jiffies - 1;
1738 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
1739 rdtp->idle_gp_timer_expires = jiffies - 1;
1740 rdtp->idle_first_pass = 1;
1741} 1917}
1742 1918
1743/* 1919#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1744 * Clean up for exit from idle. Because we are exiting from idle, there
1745 * is no longer any point to ->idle_gp_timer, so cancel it. This will
1746 * do nothing if this timer is not active, so just cancel it unconditionally.
1747 */
1748static void rcu_cleanup_after_idle(int cpu)
1749{
1750 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1751 1920
1752 del_timer(&rdtp->idle_gp_timer); 1921#define RCU_NEEDS_CPU_FLUSHES 5
1753 trace_rcu_prep_idle("Cleanup after idle"); 1922static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1754 rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); 1923static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1755}
1756 1924
1757/* 1925/*
1758 * Check to see if any RCU-related work can be done by the current CPU, 1926 * Check to see if any future RCU-related work will need to be done
1759 * and if so, schedule a softirq to get it done. This function is part 1927 * by the current CPU, even if none need be done immediately, returning
1760 * of the RCU implementation; it is -not- an exported member of the RCU API. 1928 * 1 if so. This function is part of the RCU implementation; it is -not-
1929 * an exported member of the RCU API.
1761 * 1930 *
1762 * The idea is for the current CPU to clear out all work required by the 1931 * Because we are not supporting preemptible RCU, attempt to accelerate
1763 * RCU core for the current grace period, so that this CPU can be permitted 1932 * any current grace periods so that RCU no longer needs this CPU, but
1764 * to enter dyntick-idle mode. In some cases, it will need to be awakened 1933 * only if all other CPUs are already in dynticks-idle mode. This will
1765 * at the end of the grace period by whatever CPU ends the grace period. 1934 * allow the CPU cores to be powered down immediately, as opposed to after
1766 * This allows CPUs to go dyntick-idle more quickly, and to reduce the 1935 * waiting many milliseconds for grace periods to elapse.
1767 * number of wakeups by a modest integer factor.
1768 * 1936 *
1769 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1937 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1770 * disabled, we do one pass of force_quiescent_state(), then do an 1938 * disabled, we do one pass of force_quiescent_state(), then do an
1771 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 1939 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1772 * later. The ->dyntick_drain field controls the sequencing. 1940 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1773 *
1774 * The caller must have disabled interrupts.
1775 */ 1941 */
1776static void rcu_prepare_for_idle(int cpu) 1942int rcu_needs_cpu(int cpu)
1777{ 1943{
1778 struct timer_list *tp; 1944 int c = 0;
1779 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1945 int snap;
1780 int tne; 1946 int thatcpu;
1781 1947
1782 /* Handle nohz enablement switches conservatively. */ 1948 /* Check for being in the holdoff period. */
1783 tne = ACCESS_ONCE(tick_nohz_enabled); 1949 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1784 if (tne != rdtp->tick_nohz_enabled_snap) { 1950 return rcu_needs_cpu_quick_check(cpu);
1785 if (rcu_cpu_has_callbacks(cpu))
1786 invoke_rcu_core(); /* force nohz to see update. */
1787 rdtp->tick_nohz_enabled_snap = tne;
1788 return;
1789 }
1790 if (!tne)
1791 return;
1792 1951
1793 /* Adaptive-tick mode, where usermode execution is idle to RCU. */ 1952 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1794 if (!is_idle_task(current)) { 1953 for_each_online_cpu(thatcpu) {
1795 rdtp->dyntick_holdoff = jiffies - 1; 1954 if (thatcpu == cpu)
1796 if (rcu_cpu_has_nonlazy_callbacks(cpu)) { 1955 continue;
1797 trace_rcu_prep_idle("User dyntick with callbacks"); 1956 snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1798 rdtp->idle_gp_timer_expires = 1957 thatcpu).dynticks);
1799 round_up(jiffies + RCU_IDLE_GP_DELAY, 1958 smp_mb(); /* Order sampling of snap with end of grace period. */
1800 RCU_IDLE_GP_DELAY); 1959 if ((snap & 0x1) != 0) {
1801 } else if (rcu_cpu_has_callbacks(cpu)) { 1960 per_cpu(rcu_dyntick_drain, cpu) = 0;
1802 rdtp->idle_gp_timer_expires = 1961 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1803 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); 1962 return rcu_needs_cpu_quick_check(cpu);
1804 trace_rcu_prep_idle("User dyntick with lazy callbacks");
1805 } else {
1806 return;
1807 }
1808 tp = &rdtp->idle_gp_timer;
1809 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1810 return;
1811 }
1812
1813 /*
1814 * If this is an idle re-entry, for example, due to use of
1815 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
1816 * loop, then don't take any state-machine actions, unless the
1817 * momentary exit from idle queued additional non-lazy callbacks.
1818 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
1819 * pending.
1820 */
1821 if (!rdtp->idle_first_pass &&
1822 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
1823 if (rcu_cpu_has_callbacks(cpu)) {
1824 tp = &rdtp->idle_gp_timer;
1825 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1826 } 1963 }
1827 return;
1828 } 1964 }
1829 rdtp->idle_first_pass = 0;
1830 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
1831 1965
1832 /* 1966 /* Check and update the rcu_dyntick_drain sequencing. */
1833 * If there are no callbacks on this CPU, enter dyntick-idle mode. 1967 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1834 * Also reset state to avoid prejudicing later attempts.
1835 */
1836 if (!rcu_cpu_has_callbacks(cpu)) {
1837 rdtp->dyntick_holdoff = jiffies - 1;
1838 rdtp->dyntick_drain = 0;
1839 trace_rcu_prep_idle("No callbacks");
1840 return;
1841 }
1842
1843 /*
1844 * If in holdoff mode, just return. We will presumably have
1845 * refrained from disabling the scheduling-clock tick.
1846 */
1847 if (rdtp->dyntick_holdoff == jiffies) {
1848 trace_rcu_prep_idle("In holdoff");
1849 return;
1850 }
1851
1852 /* Check and update the ->dyntick_drain sequencing. */
1853 if (rdtp->dyntick_drain <= 0) {
1854 /* First time through, initialize the counter. */ 1968 /* First time through, initialize the counter. */
1855 rdtp->dyntick_drain = RCU_IDLE_FLUSHES; 1969 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1856 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && 1970 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1857 !rcu_pending(cpu) &&
1858 !local_softirq_pending()) {
1859 /* Can we go dyntick-idle despite still having callbacks? */
1860 rdtp->dyntick_drain = 0;
1861 rdtp->dyntick_holdoff = jiffies;
1862 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1863 trace_rcu_prep_idle("Dyntick with callbacks");
1864 rdtp->idle_gp_timer_expires =
1865 round_up(jiffies + RCU_IDLE_GP_DELAY,
1866 RCU_IDLE_GP_DELAY);
1867 } else {
1868 rdtp->idle_gp_timer_expires =
1869 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1870 trace_rcu_prep_idle("Dyntick with lazy callbacks");
1871 }
1872 tp = &rdtp->idle_gp_timer;
1873 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1874 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1875 return; /* Nothing more to do immediately. */
1876 } else if (--(rdtp->dyntick_drain) <= 0) {
1877 /* We have hit the limit, so time to give up. */ 1971 /* We have hit the limit, so time to give up. */
1878 rdtp->dyntick_holdoff = jiffies; 1972 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1879 trace_rcu_prep_idle("Begin holdoff"); 1973 return rcu_needs_cpu_quick_check(cpu);
1880 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
1881 return;
1882 } 1974 }
1883 1975
1884 /* 1976 /* Do one step pushing remaining RCU callbacks through. */
1885 * Do one step of pushing the remaining RCU callbacks through
1886 * the RCU core state machine.
1887 */
1888#ifdef CONFIG_TREE_PREEMPT_RCU
1889 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
1890 rcu_preempt_qs(cpu);
1891 force_quiescent_state(&rcu_preempt_state);
1892 }
1893#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1894 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 1977 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1895 rcu_sched_qs(cpu); 1978 rcu_sched_qs(cpu);
1896 force_quiescent_state(&rcu_sched_state); 1979 force_quiescent_state(&rcu_sched_state, 0);
1980 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1897 } 1981 }
1898 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 1982 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1899 rcu_bh_qs(cpu); 1983 rcu_bh_qs(cpu);
1900 force_quiescent_state(&rcu_bh_state); 1984 force_quiescent_state(&rcu_bh_state, 0);
1985 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1901 } 1986 }
1902 1987
1903 /* 1988 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1904 * If RCU callbacks are still pending, RCU still needs this CPU. 1989 if (c)
1905 * So try forcing the callbacks through the grace period.
1906 */
1907 if (rcu_cpu_has_callbacks(cpu)) {
1908 trace_rcu_prep_idle("More callbacks");
1909 invoke_rcu_core(); 1990 invoke_rcu_core();
1910 } else { 1991 return c;
1911 trace_rcu_prep_idle("Callbacks drained");
1912 }
1913}
1914
1915/*
1916 * Keep a running count of the number of non-lazy callbacks posted
1917 * on this CPU. This running counter (which is never decremented) allows
1918 * rcu_prepare_for_idle() to detect when something out of the idle loop
1919 * posts a callback, even if an equal number of callbacks are invoked.
1920 * Of course, callbacks should only be posted from within a trace event
1921 * designed to be called from idle or from within RCU_NONIDLE().
1922 */
1923static void rcu_idle_count_callbacks_posted(void)
1924{
1925 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
1926}
1927
1928/*
1929 * Data for flushing lazy RCU callbacks at OOM time.
1930 */
1931static atomic_t oom_callback_count;
1932static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
1933
1934/*
1935 * RCU OOM callback -- decrement the outstanding count and deliver the
1936 * wake-up if we are the last one.
1937 */
1938static void rcu_oom_callback(struct rcu_head *rhp)
1939{
1940 if (atomic_dec_and_test(&oom_callback_count))
1941 wake_up(&oom_callback_wq);
1942}
1943
1944/*
1945 * Post an rcu_oom_notify callback on the current CPU if it has at
1946 * least one lazy callback. This will unnecessarily post callbacks
1947 * to CPUs that already have a non-lazy callback at the end of their
1948 * callback list, but this is an infrequent operation, so accept some
1949 * extra overhead to keep things simple.
1950 */
1951static void rcu_oom_notify_cpu(void *unused)
1952{
1953 struct rcu_state *rsp;
1954 struct rcu_data *rdp;
1955
1956 for_each_rcu_flavor(rsp) {
1957 rdp = __this_cpu_ptr(rsp->rda);
1958 if (rdp->qlen_lazy != 0) {
1959 atomic_inc(&oom_callback_count);
1960 rsp->call(&rdp->oom_head, rcu_oom_callback);
1961 }
1962 }
1963}
1964
1965/*
1966 * If low on memory, ensure that each CPU has a non-lazy callback.
1967 * This will wake up CPUs that have only lazy callbacks, in turn
1968 * ensuring that they free up the corresponding memory in a timely manner.
1969 * Because an uncertain amount of memory will be freed in some uncertain
1970 * timeframe, we do not claim to have freed anything.
1971 */
1972static int rcu_oom_notify(struct notifier_block *self,
1973 unsigned long notused, void *nfreed)
1974{
1975 int cpu;
1976
1977 /* Wait for callbacks from earlier instance to complete. */
1978 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1979
1980 /*
1981 * Prevent premature wakeup: ensure that all increments happen
1982 * before there is a chance of the counter reaching zero.
1983 */
1984 atomic_set(&oom_callback_count, 1);
1985
1986 get_online_cpus();
1987 for_each_online_cpu(cpu) {
1988 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1989 cond_resched();
1990 }
1991 put_online_cpus();
1992
1993 /* Unconditionally decrement: no need to wake ourselves up. */
1994 atomic_dec(&oom_callback_count);
1995
1996 return NOTIFY_OK;
1997}
1998
1999static struct notifier_block rcu_oom_nb = {
2000 .notifier_call = rcu_oom_notify
2001};
2002
2003static int __init rcu_register_oom_notifier(void)
2004{
2005 register_oom_notifier(&rcu_oom_nb);
2006 return 0;
2007}
2008early_initcall(rcu_register_oom_notifier);
2009
2010#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2011
2012#ifdef CONFIG_RCU_CPU_STALL_INFO
2013
2014#ifdef CONFIG_RCU_FAST_NO_HZ
2015
2016static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2017{
2018 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2019 struct timer_list *tltp = &rdtp->idle_gp_timer;
2020 char c;
2021
2022 c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
2023 if (timer_pending(tltp))
2024 sprintf(cp, "drain=%d %c timer=%lu",
2025 rdtp->dyntick_drain, c, tltp->expires - jiffies);
2026 else
2027 sprintf(cp, "drain=%d %c timer not pending",
2028 rdtp->dyntick_drain, c);
2029}
2030
2031#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
2032
2033static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2034{
2035 *cp = '\0';
2036}
2037
2038#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
2039
2040/* Initiate the stall-info list. */
2041static void print_cpu_stall_info_begin(void)
2042{
2043 printk(KERN_CONT "\n");
2044}
2045
2046/*
2047 * Print out diagnostic information for the specified stalled CPU.
2048 *
2049 * If the specified CPU is aware of the current RCU grace period
2050 * (flavor specified by rsp), then print the number of scheduling
2051 * clock interrupts the CPU has taken during the time that it has
2052 * been aware. Otherwise, print the number of RCU grace periods
2053 * that this CPU is ignorant of, for example, "1" if the CPU was
2054 * aware of the previous grace period.
2055 *
2056 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
2057 */
2058static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2059{
2060 char fast_no_hz[72];
2061 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2062 struct rcu_dynticks *rdtp = rdp->dynticks;
2063 char *ticks_title;
2064 unsigned long ticks_value;
2065
2066 if (rsp->gpnum == rdp->gpnum) {
2067 ticks_title = "ticks this GP";
2068 ticks_value = rdp->ticks_this_gp;
2069 } else {
2070 ticks_title = "GPs behind";
2071 ticks_value = rsp->gpnum - rdp->gpnum;
2072 }
2073 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2074 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
2075 cpu, ticks_value, ticks_title,
2076 atomic_read(&rdtp->dynticks) & 0xfff,
2077 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
2078 fast_no_hz);
2079}
2080
2081/* Terminate the stall-info list. */
2082static void print_cpu_stall_info_end(void)
2083{
2084 printk(KERN_ERR "\t");
2085}
2086
2087/* Zero ->ticks_this_gp for all flavors of RCU. */
2088static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2089{
2090 rdp->ticks_this_gp = 0;
2091}
2092
2093/* Increment ->ticks_this_gp for all flavors of RCU. */
2094static void increment_cpu_stall_ticks(void)
2095{
2096 struct rcu_state *rsp;
2097
2098 for_each_rcu_flavor(rsp)
2099 __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
2100}
2101
2102#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
2103
2104static void print_cpu_stall_info_begin(void)
2105{
2106 printk(KERN_CONT " {");
2107}
2108
2109static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2110{
2111 printk(KERN_CONT " %d", cpu);
2112}
2113
2114static void print_cpu_stall_info_end(void)
2115{
2116 printk(KERN_CONT "} ");
2117}
2118
2119static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2120{
2121}
2122
2123static void increment_cpu_stall_ticks(void)
2124{
2125}
2126
2127#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
2128
2129#ifdef CONFIG_RCU_NOCB_CPU
2130
2131/*
2132 * Offload callback processing from the boot-time-specified set of CPUs
2133 * specified by rcu_nocb_mask. For each CPU in the set, there is a
2134 * kthread created that pulls the callbacks from the corresponding CPU,
2135 * waits for a grace period to elapse, and invokes the callbacks.
2136 * The no-CBs CPUs do a wake_up() on their kthread when they insert
2137 * a callback into any empty list, unless the rcu_nocb_poll boot parameter
2138 * has been specified, in which case each kthread actively polls its
2139 * CPU. (Which isn't so great for energy efficiency, but which does
2140 * reduce RCU's overhead on that CPU.)
2141 *
2142 * This is intended to be used in conjunction with Frederic Weisbecker's
2143 * adaptive-idle work, which would seriously reduce OS jitter on CPUs
2144 * running CPU-bound user-mode computations.
2145 *
2146 * Offloading of callback processing could also in theory be used as
2147 * an energy-efficiency measure because CPUs with no RCU callbacks
2148 * queued are more aggressive about entering dyntick-idle mode.
2149 */
2150
2151
2152/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
2153static int __init rcu_nocb_setup(char *str)
2154{
2155 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
2156 have_rcu_nocb_mask = true;
2157 cpulist_parse(str, rcu_nocb_mask);
2158 return 1;
2159}
2160__setup("rcu_nocbs=", rcu_nocb_setup);
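As a usage illustration (the CPU list is assumed, not taken from this patch): booting with rcu_nocbs=1-3 on the kernel command line makes rcu_nocb_setup() above parse that list into rcu_nocb_mask, so CPUs 1-3 become no-CBs CPUs; their callbacks are then invoked by the per-CPU "rcuo%d" kthreads created in rcu_spawn_nocb_kthreads() below instead of being processed locally on those CPUs.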
2161
2162/* Is the specified CPU a no-CBs CPU? */
2163static bool is_nocb_cpu(int cpu)
2164{
2165 if (have_rcu_nocb_mask)
2166 return cpumask_test_cpu(cpu, rcu_nocb_mask);
2167 return false;
2168}
2169
2170/*
2171 * Enqueue the specified string of rcu_head structures onto the specified
2172 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2173 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
2174 * counts are supplied by rhcount and rhcount_lazy.
2175 *
2176 * If warranted, also wake up the kthread servicing this CPU's queues.
2177 */
2178static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2179 struct rcu_head *rhp,
2180 struct rcu_head **rhtp,
2181 int rhcount, int rhcount_lazy)
2182{
2183 int len;
2184 struct rcu_head **old_rhpp;
2185 struct task_struct *t;
2186
2187 /* Enqueue the callback on the nocb list and update counts. */
2188 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2189 ACCESS_ONCE(*old_rhpp) = rhp;
2190 atomic_long_add(rhcount, &rdp->nocb_q_count);
2191 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2192
2193 /* If we are not being polled and there is a kthread, awaken it ... */
2194 t = ACCESS_ONCE(rdp->nocb_kthread);
2195 if (rcu_nocb_poll || !t)
2196 return;
2197 len = atomic_long_read(&rdp->nocb_q_count);
2198 if (old_rhpp == &rdp->nocb_head) {
2199 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2200 rdp->qlen_last_fqs_check = 0;
2201 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2202 wake_up_process(t); /* ... or if many callbacks queued. */
2203 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2204 }
2205 return;
2206} 1992}
2207 1993
2208/* 1994/*
2209 * This is a helper for __call_rcu(), which invokes this when the normal 1995 * Check to see if we need to continue a callback-flush operation to
2210 * callback queue is inoperable. If this is not a no-CBs CPU, this 1996 * allow the last CPU to enter dyntick-idle mode.
2211 * function returns failure back to __call_rcu(), which can complain
2212 * appropriately.
2213 *
2214 * Otherwise, this function queues the callback where the corresponding
2215 * "rcuo" kthread can find it.
2216 */ 1997 */
2217static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 1998static void rcu_needs_cpu_flush(void)
2218 bool lazy)
2219{ 1999{
2000 int cpu = smp_processor_id();
2001 unsigned long flags;
2220 2002
2221 if (!is_nocb_cpu(rdp->cpu)) 2003 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
2222 return 0;
2223 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2224 return 1;
2225}
2226
2227/*
2228 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
2229 * not a no-CBs CPU.
2230 */
2231static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2232 struct rcu_data *rdp)
2233{
2234 long ql = rsp->qlen;
2235 long qll = rsp->qlen_lazy;
2236
2237 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2238 if (!is_nocb_cpu(smp_processor_id()))
2239 return 0;
2240 rsp->qlen = 0;
2241 rsp->qlen_lazy = 0;
2242
2243 /* First, enqueue the donelist, if any. This preserves CB ordering. */
2244 if (rsp->orphan_donelist != NULL) {
2245 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2246 rsp->orphan_donetail, ql, qll);
2247 ql = qll = 0;
2248 rsp->orphan_donelist = NULL;
2249 rsp->orphan_donetail = &rsp->orphan_donelist;
2250 }
2251 if (rsp->orphan_nxtlist != NULL) {
2252 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2253 rsp->orphan_nxttail, ql, qll);
2254 ql = qll = 0;
2255 rsp->orphan_nxtlist = NULL;
2256 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2257 }
2258 return 1;
2259}
2260
2261/*
2262 * There must be at least one non-no-CBs CPU in operation at any given
2263 * time, because no-CBs CPUs are not capable of initiating grace periods
2264 * independently. This function therefore complains if the specified
2265 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2266 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2267 * but you have to have a base case!)
2268 */
2269static bool nocb_cpu_expendable(int cpu)
2270{
2271 cpumask_var_t non_nocb_cpus;
2272 int ret;
2273
2274 /*
2275 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
2276 * then offlining this CPU is harmless. Let it happen.
2277 */
2278 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
2279 return 1;
2280
2281 /* If no memory, play it safe and keep the CPU around. */
2282 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
2283 return 0;
2284 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
2285 cpumask_clear_cpu(cpu, non_nocb_cpus);
2286 ret = !cpumask_empty(non_nocb_cpus);
2287 free_cpumask_var(non_nocb_cpus);
2288 return ret;
2289}
2290
2291/*
2292 * Helper structure for remote registry of RCU callbacks.
2293 * This is needed for when a no-CBs CPU needs to start a grace period.
2294 * If it just invokes call_rcu(), the resulting callback will be queued,
2295 * which can result in deadlock.
2296 */
2297struct rcu_head_remote {
2298 struct rcu_head *rhp;
2299 call_rcu_func_t *crf;
2300 void (*func)(struct rcu_head *rhp);
2301};
2302
2303/*
2304 * Register a callback as specified by the rcu_head_remote struct.
2305 * This function is intended to be invoked via smp_call_function_single().
2306 */
2307static void call_rcu_local(void *arg)
2308{
2309 struct rcu_head_remote *rhrp =
2310 container_of(arg, struct rcu_head_remote, rhp);
2311
2312 rhrp->crf(rhrp->rhp, rhrp->func);
2313}
2314
2315/*
2316 * Set up an rcu_head_remote structure and then invoke call_rcu_local()
2317 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2318 * smp_call_function_single().
2319 */
2320static void invoke_crf_remote(struct rcu_head *rhp,
2321 void (*func)(struct rcu_head *rhp),
2322 call_rcu_func_t crf)
2323{
2324 struct rcu_head_remote rhr;
2325
2326 rhr.rhp = rhp;
2327 rhr.crf = crf;
2328 rhr.func = func;
2329 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2330}
2331
2332/*
2333 * Helper functions to be passed to wait_rcu_gp(), each of which
2334 * invokes invoke_crf_remote() to register a callback appropriately.
2335 */
2336static void __maybe_unused
2337call_rcu_preempt_remote(struct rcu_head *rhp,
2338 void (*func)(struct rcu_head *rhp))
2339{
2340 invoke_crf_remote(rhp, func, call_rcu);
2341}
2342static void call_rcu_bh_remote(struct rcu_head *rhp,
2343 void (*func)(struct rcu_head *rhp))
2344{
2345 invoke_crf_remote(rhp, func, call_rcu_bh);
2346}
2347static void call_rcu_sched_remote(struct rcu_head *rhp,
2348 void (*func)(struct rcu_head *rhp))
2349{
2350 invoke_crf_remote(rhp, func, call_rcu_sched);
2351}
2352
2353/*
2354 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2355 * callbacks queued by the corresponding no-CBs CPU.
2356 */
2357static int rcu_nocb_kthread(void *arg)
2358{
2359 int c, cl;
2360 struct rcu_head *list;
2361 struct rcu_head *next;
2362 struct rcu_head **tail;
2363 struct rcu_data *rdp = arg;
2364
2365 /* Each pass through this loop invokes one batch of callbacks */
2366 for (;;) {
2367 /* If not polling, wait for next batch of callbacks. */
2368 if (!rcu_nocb_poll)
2369 wait_event(rdp->nocb_wq, rdp->nocb_head);
2370 list = ACCESS_ONCE(rdp->nocb_head);
2371 if (!list) {
2372 schedule_timeout_interruptible(1);
2373 continue;
2374 }
2375
2376 /*
2377 * Extract queued callbacks, update counts, and wait
2378 * for a grace period to elapse.
2379 */
2380 ACCESS_ONCE(rdp->nocb_head) = NULL;
2381 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2382 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2383 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2384 ACCESS_ONCE(rdp->nocb_p_count) += c;
2385 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2386 wait_rcu_gp(rdp->rsp->call_remote);
2387
2388 /* Each pass through the following loop invokes a callback. */
2389 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2390 c = cl = 0;
2391 while (list) {
2392 next = list->next;
2393 /* Wait for enqueuing to complete, if needed. */
2394 while (next == NULL && &list->next != tail) {
2395 schedule_timeout_interruptible(1);
2396 next = list->next;
2397 }
2398 debug_rcu_head_unqueue(list);
2399 local_bh_disable();
2400 if (__rcu_reclaim(rdp->rsp->name, list))
2401 cl++;
2402 c++;
2403 local_bh_enable();
2404 list = next;
2405 }
2406 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2407 ACCESS_ONCE(rdp->nocb_p_count) -= c;
2408 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
2409 rdp->n_nocbs_invoked += c;
2410 }
2411 return 0;
2412}
2413
2414/* Initialize per-rcu_data variables for no-CBs CPUs. */
2415static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2416{
2417 rdp->nocb_tail = &rdp->nocb_head;
2418 init_waitqueue_head(&rdp->nocb_wq);
2419}
2420
2421/* Create a kthread for each RCU flavor for each no-CBs CPU. */
2422static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2423{
2424 int cpu;
2425 struct rcu_data *rdp;
2426 struct task_struct *t;
2427
2428 if (rcu_nocb_mask == NULL)
2429 return;
2430 for_each_cpu(cpu, rcu_nocb_mask) {
2431 rdp = per_cpu_ptr(rsp->rda, cpu);
2432 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
2433 BUG_ON(IS_ERR(t));
2434 ACCESS_ONCE(rdp->nocb_kthread) = t;
2435 }
2436}
2437
2438/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2439static void init_nocb_callback_list(struct rcu_data *rdp)
2440{
2441 if (rcu_nocb_mask == NULL ||
2442 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2443		return;
2444	rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2445}
2446
2447/* Initialize the ->call_remote fields in the rcu_state structures. */
2448static void __init rcu_init_nocb(void)
2449{
2450#ifdef CONFIG_PREEMPT_RCU
2451 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2452#endif /* #ifdef CONFIG_PREEMPT_RCU */
2453 rcu_bh_state.call_remote = call_rcu_bh_remote;
2454 rcu_sched_state.call_remote = call_rcu_sched_remote;
2455}
2456
2457#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2458
2459static bool is_nocb_cpu(int cpu)
2460{
2461 return false;
2462}
2463
2464static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2465 bool lazy)
2466{
2467 return 0;
2468}
2469
2470static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2471 struct rcu_data *rdp)
2472{
2473 return 0;
2474}
2475
2476static bool nocb_cpu_expendable(int cpu)
2477{
2478 return 1;
2479}
2480
2481static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2482{
2483}
2484
2485static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2486{
2487}
2488
2489static void init_nocb_callback_list(struct rcu_data *rdp)
2490{
2491}
2492
2493static void __init rcu_init_nocb(void)
2494{
2495}
2496
2497#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
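
[Editor's aside] rcu_nocb_kthread() above drains its CPU's callback list by atomically snatching the whole pending list in one operation and then walking it with interrupts and softirqs managed per callback. The following is a minimal userspace sketch of that "grab everything at once" hand-off, using C11 atomics rather than the kernel's ACCESS_ONCE()/xchg() head-and-tail discipline; struct cb, cb_push() and cb_drain() are invented names, and the push side here is LIFO rather than the kernel's FIFO queue, so treat it as an illustration of the idea only.

/*
 * Userspace sketch of the batched callback hand-off (NOT kernel code).
 * Producers push callbacks; the consumer takes the entire list at once
 * and invokes each entry, mirroring the batching in rcu_nocb_kthread().
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cb {
	struct cb *next;
	void (*func)(struct cb *);
};

static _Atomic(struct cb *) cb_head;	/* pending callbacks */

/* Producer side: push one callback (roughly analogous to queuing a callback). */
static void cb_push(struct cb *c)
{
	struct cb *old = atomic_load(&cb_head);

	do {
		c->next = old;
	} while (!atomic_compare_exchange_weak(&cb_head, &old, c));
}

/* Consumer side: atomically take *all* queued callbacks, then run them. */
static int cb_drain(void)
{
	struct cb *list = atomic_exchange(&cb_head, NULL);
	int n = 0;

	while (list) {
		struct cb *next = list->next;

		list->func(list);
		list = next;
		n++;
	}
	return n;
}

static void print_cb(struct cb *c)
{
	printf("invoked callback %p\n", (void *)c);
	free(c);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct cb *c = malloc(sizeof(*c));

		c->func = print_cb;
		cb_push(c);
	}
	printf("drained %d callbacks\n", cb_drain());
	return 0;
}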
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0d095dcaa67..3b0c0986afc 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,62 +46,13 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op)
53{
54 int ret = seq_open(file, op);
55 if (!ret) {
56 struct seq_file *m = (struct seq_file *)file->private_data;
57 m->private = inode->i_private;
58 }
59 return ret;
60}
61
62static void *r_start(struct seq_file *m, loff_t *pos)
63{
64 struct rcu_state *rsp = (struct rcu_state *)m->private;
65 *pos = cpumask_next(*pos - 1, cpu_possible_mask);
66 if ((*pos) < nr_cpu_ids)
67 return per_cpu_ptr(rsp->rda, *pos);
68 return NULL;
69}
70
71static void *r_next(struct seq_file *m, void *v, loff_t *pos)
72{
73 (*pos)++;
74 return r_start(m, pos);
75}
76
77static void r_stop(struct seq_file *m, void *v)
78{
79}
80
81static int show_rcubarrier(struct seq_file *m, void *v)
82{
83 struct rcu_state *rsp = (struct rcu_state *)m->private;
84 seq_printf(m, "bcc: %d nbd: %lu\n",
85 atomic_read(&rsp->barrier_cpu_count),
86 rsp->n_barrier_done);
87 return 0;
88}
89
90static int rcubarrier_open(struct inode *inode, struct file *file)
91{
92 return single_open(file, show_rcubarrier, inode->i_private);
93}
94
95static const struct file_operations rcubarrier_fops = {
96 .owner = THIS_MODULE,
97 .open = rcubarrier_open,
98 .read = seq_read,
99 .llseek = no_llseek,
100 .release = seq_release,
101};
102
103#ifdef CONFIG_RCU_BOOST 49#ifdef CONFIG_RCU_BOOST
104 50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
105static char convert_kthread_status(unsigned int kthread_status) 56static char convert_kthread_status(unsigned int kthread_status)
106{ 57{
107 if (kthread_status > RCU_KTHREAD_MAX) 58 if (kthread_status > RCU_KTHREAD_MAX)
@@ -113,26 +64,24 @@ static char convert_kthread_status(unsigned int kthread_status)
113 64
114static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 65static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
115{ 66{
116 long ql, qll;
117
118 if (!rdp->beenonline) 67 if (!rdp->beenonline)
119 return; 68 return;
120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", 69 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
121 rdp->cpu, 70 rdp->cpu,
122 cpu_is_offline(rdp->cpu) ? '!' : ' ', 71 cpu_is_offline(rdp->cpu) ? '!' : ' ',
123 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 72 rdp->completed, rdp->gpnum,
124 rdp->passed_quiesce, rdp->qs_pending); 73 rdp->passed_quiesc, rdp->passed_quiesc_completed,
125 seq_printf(m, " dt=%d/%llx/%d df=%lu", 74 rdp->qs_pending);
75#ifdef CONFIG_NO_HZ
76 seq_printf(m, " dt=%d/%d/%d df=%lu",
126 atomic_read(&rdp->dynticks->dynticks), 77 atomic_read(&rdp->dynticks->dynticks),
127 rdp->dynticks->dynticks_nesting, 78 rdp->dynticks->dynticks_nesting,
128 rdp->dynticks->dynticks_nmi_nesting, 79 rdp->dynticks->dynticks_nmi_nesting,
129 rdp->dynticks_fqs); 80 rdp->dynticks_fqs);
130 seq_printf(m, " of=%lu", rdp->offline_fqs); 81#endif /* #ifdef CONFIG_NO_HZ */
131 rcu_nocb_q_lengths(rdp, &ql, &qll); 82 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
132 qll += rdp->qlen_lazy; 83 seq_printf(m, " ql=%ld qs=%c%c%c%c",
133 ql += rdp->qlen; 84 rdp->qlen,
134 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
135 qll, ql,
136 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 85 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
137 rdp->nxttail[RCU_NEXT_TAIL]], 86 rdp->nxttail[RCU_NEXT_TAIL]],
138 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 87 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -141,81 +90,130 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
141 rdp->nxttail[RCU_WAIT_TAIL]], 90 rdp->nxttail[RCU_WAIT_TAIL]],
142 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); 91 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
143#ifdef CONFIG_RCU_BOOST 92#ifdef CONFIG_RCU_BOOST
144 seq_printf(m, " kt=%d/%c ktl=%x", 93 seq_printf(m, " kt=%d/%c/%d ktl=%x",
145 per_cpu(rcu_cpu_has_work, rdp->cpu), 94 per_cpu(rcu_cpu_has_work, rdp->cpu),
146 convert_kthread_status(per_cpu(rcu_cpu_kthread_status, 95 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
147 rdp->cpu)), 96 rdp->cpu)),
97 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
148 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); 98 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
149#endif /* #ifdef CONFIG_RCU_BOOST */ 99#endif /* #ifdef CONFIG_RCU_BOOST */
150 seq_printf(m, " b=%ld", rdp->blimit); 100 seq_printf(m, " b=%ld", rdp->blimit);
151 seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", 101 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
152 rdp->n_cbs_invoked, rdp->n_nocbs_invoked, 102 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
153 rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
154} 103}
155 104
156static int show_rcudata(struct seq_file *m, void *v) 105#define PRINT_RCU_DATA(name, func, m) \
106 do { \
107 int _p_r_d_i; \
108 \
109 for_each_possible_cpu(_p_r_d_i) \
110 func(m, &per_cpu(name, _p_r_d_i)); \
111 } while (0)
112
113static int show_rcudata(struct seq_file *m, void *unused)
157{ 114{
158 print_one_rcu_data(m, (struct rcu_data *)v); 115#ifdef CONFIG_TREE_PREEMPT_RCU
116 seq_puts(m, "rcu_preempt:\n");
117 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
118#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
119 seq_puts(m, "rcu_sched:\n");
120 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
121 seq_puts(m, "rcu_bh:\n");
122 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
159 return 0; 123 return 0;
160} 124}
161 125
162static const struct seq_operations rcudate_op = {
163 .start = r_start,
164 .next = r_next,
165 .stop = r_stop,
166 .show = show_rcudata,
167};
168
169static int rcudata_open(struct inode *inode, struct file *file) 126static int rcudata_open(struct inode *inode, struct file *file)
170{ 127{
171 return r_open(inode, file, &rcudate_op); 128 return single_open(file, show_rcudata, NULL);
172} 129}
173 130
174static const struct file_operations rcudata_fops = { 131static const struct file_operations rcudata_fops = {
175 .owner = THIS_MODULE, 132 .owner = THIS_MODULE,
176 .open = rcudata_open, 133 .open = rcudata_open,
177 .read = seq_read, 134 .read = seq_read,
178 .llseek = no_llseek, 135 .llseek = seq_lseek,
179 .release = seq_release, 136 .release = single_release,
180}; 137};
181 138
182static int show_rcuexp(struct seq_file *m, void *v) 139static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
183{ 140{
184 struct rcu_state *rsp = (struct rcu_state *)m->private; 141 if (!rdp->beenonline)
185 142 return;
186 seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", 143 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
187 atomic_long_read(&rsp->expedited_start), 144 rdp->cpu,
188 atomic_long_read(&rsp->expedited_done), 145 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
189 atomic_long_read(&rsp->expedited_wrap), 146 rdp->completed, rdp->gpnum,
190 atomic_long_read(&rsp->expedited_tryfail), 147 rdp->passed_quiesc, rdp->passed_quiesc_completed,
191 atomic_long_read(&rsp->expedited_workdone1), 148 rdp->qs_pending);
192 atomic_long_read(&rsp->expedited_workdone2), 149#ifdef CONFIG_NO_HZ
193 atomic_long_read(&rsp->expedited_normal), 150 seq_printf(m, ",%d,%d,%d,%lu",
194 atomic_long_read(&rsp->expedited_stoppedcpus), 151 atomic_read(&rdp->dynticks->dynticks),
195 atomic_long_read(&rsp->expedited_done_tries), 152 rdp->dynticks->dynticks_nesting,
196 atomic_long_read(&rsp->expedited_done_lost), 153 rdp->dynticks->dynticks_nmi_nesting,
197 atomic_long_read(&rsp->expedited_done_exit)); 154 rdp->dynticks_fqs);
155#endif /* #ifdef CONFIG_NO_HZ */
156 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
157 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
158 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
159 rdp->nxttail[RCU_NEXT_TAIL]],
160 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
161 rdp->nxttail[RCU_NEXT_READY_TAIL]],
162 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
163 rdp->nxttail[RCU_WAIT_TAIL]],
164 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
165#ifdef CONFIG_RCU_BOOST
166 seq_printf(m, ",%d,\"%c\"",
167 per_cpu(rcu_cpu_has_work, rdp->cpu),
168 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
169 rdp->cpu)));
170#endif /* #ifdef CONFIG_RCU_BOOST */
171 seq_printf(m, ",%ld", rdp->blimit);
172 seq_printf(m, ",%lu,%lu,%lu\n",
173 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
174}
175
176static int show_rcudata_csv(struct seq_file *m, void *unused)
177{
178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
179#ifdef CONFIG_NO_HZ
180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
181#endif /* #ifdef CONFIG_NO_HZ */
182 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
183#ifdef CONFIG_RCU_BOOST
184 seq_puts(m, "\"kt\",\"ktl\"");
185#endif /* #ifdef CONFIG_RCU_BOOST */
186 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
187#ifdef CONFIG_TREE_PREEMPT_RCU
188 seq_puts(m, "\"rcu_preempt:\"\n");
189 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
190#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
191 seq_puts(m, "\"rcu_sched:\"\n");
192 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
193 seq_puts(m, "\"rcu_bh:\"\n");
194 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
198 return 0; 195 return 0;
199} 196}
200 197
201static int rcuexp_open(struct inode *inode, struct file *file) 198static int rcudata_csv_open(struct inode *inode, struct file *file)
202{ 199{
203 return single_open(file, show_rcuexp, inode->i_private); 200 return single_open(file, show_rcudata_csv, NULL);
204} 201}
205 202
206static const struct file_operations rcuexp_fops = { 203static const struct file_operations rcudata_csv_fops = {
207 .owner = THIS_MODULE, 204 .owner = THIS_MODULE,
208 .open = rcuexp_open, 205 .open = rcudata_csv_open,
209 .read = seq_read, 206 .read = seq_read,
210 .llseek = no_llseek, 207 .llseek = seq_lseek,
211 .release = seq_release, 208 .release = single_release,
212}; 209};
213 210
214#ifdef CONFIG_RCU_BOOST 211#ifdef CONFIG_RCU_BOOST
215 212
216static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) 213static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
217{ 214{
218 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", 215 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
216 "j=%04x bt=%04x\n",
219 rnp->grplo, rnp->grphi, 217 rnp->grplo, rnp->grphi,
220 "T."[list_empty(&rnp->blkd_tasks)], 218 "T."[list_empty(&rnp->blkd_tasks)],
221 "N."[!rnp->gp_tasks], 219 "N."[!rnp->gp_tasks],
@@ -223,11 +221,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
223 "B."[!rnp->boost_tasks], 221 "B."[!rnp->boost_tasks],
224 convert_kthread_status(rnp->boost_kthread_status), 222 convert_kthread_status(rnp->boost_kthread_status),
225 rnp->n_tasks_boosted, rnp->n_exp_boosts, 223 rnp->n_tasks_boosted, rnp->n_exp_boosts,
226 rnp->n_normal_boosts); 224 rnp->n_normal_boosts,
227 seq_printf(m, "j=%04x bt=%04x\n",
228 (int)(jiffies & 0xffff), 225 (int)(jiffies & 0xffff),
229 (int)(rnp->boost_time & 0xffff)); 226 (int)(rnp->boost_time & 0xffff));
230 seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", 227 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
228 " balk",
231 rnp->n_balk_blkd_tasks, 229 rnp->n_balk_blkd_tasks,
232 rnp->n_balk_exp_gp_tasks, 230 rnp->n_balk_exp_gp_tasks,
233 rnp->n_balk_boost_tasks, 231 rnp->n_balk_boost_tasks,
@@ -254,11 +252,27 @@ static const struct file_operations rcu_node_boost_fops = {
254 .owner = THIS_MODULE, 252 .owner = THIS_MODULE,
255 .open = rcu_node_boost_open, 253 .open = rcu_node_boost_open,
256 .read = seq_read, 254 .read = seq_read,
257 .llseek = no_llseek, 255 .llseek = seq_lseek,
258 .release = single_release, 256 .release = single_release,
259}; 257};
260 258
261#endif /* #ifdef CONFIG_RCU_BOOST */ 259/*
260 * Create the rcuboost debugfs entry. Standard error return.
261 */
262static int rcu_boost_trace_create_file(struct dentry *rcudir)
263{
264 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
265 &rcu_node_boost_fops);
266}
267
268#else /* #ifdef CONFIG_RCU_BOOST */
269
270static int rcu_boost_trace_create_file(struct dentry *rcudir)
271{
272 return 0; /* There cannot be an error if we didn't create it! */
273}
274
275#endif /* #else #ifdef CONFIG_RCU_BOOST */
262 276
263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 277static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
264{ 278{
@@ -267,16 +281,15 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
267 struct rcu_node *rnp; 281 struct rcu_node *rnp;
268 282
269 gpnum = rsp->gpnum; 283 gpnum = rsp->gpnum;
270 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", 284 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
271 ulong2long(rsp->completed), ulong2long(gpnum), 285 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
272 rsp->fqs_state, 286 rsp->completed, gpnum, rsp->signaled,
273 (long)(rsp->jiffies_force_qs - jiffies), 287 (long)(rsp->jiffies_force_qs - jiffies),
274 (int)(jiffies & 0xffff)); 288 (int)(jiffies & 0xffff),
275 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
276 rsp->n_force_qs, rsp->n_force_qs_ngp, 289 rsp->n_force_qs, rsp->n_force_qs_ngp,
277 rsp->n_force_qs - rsp->n_force_qs_ngp, 290 rsp->n_force_qs - rsp->n_force_qs_ngp,
278 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); 291 rsp->n_force_qs_lh);
279 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { 292 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
280 if (rnp->level != level) { 293 if (rnp->level != level) {
281 seq_puts(m, "\n"); 294 seq_puts(m, "\n");
282 level = rnp->level; 295 level = rnp->level;
@@ -291,24 +304,30 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
291 seq_puts(m, "\n"); 304 seq_puts(m, "\n");
292} 305}
293 306
294static int show_rcuhier(struct seq_file *m, void *v) 307static int show_rcuhier(struct seq_file *m, void *unused)
295{ 308{
296 struct rcu_state *rsp = (struct rcu_state *)m->private; 309#ifdef CONFIG_TREE_PREEMPT_RCU
297 print_one_rcu_state(m, rsp); 310 seq_puts(m, "rcu_preempt:\n");
311 print_one_rcu_state(m, &rcu_preempt_state);
312#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
313 seq_puts(m, "rcu_sched:\n");
314 print_one_rcu_state(m, &rcu_sched_state);
315 seq_puts(m, "rcu_bh:\n");
316 print_one_rcu_state(m, &rcu_bh_state);
298 return 0; 317 return 0;
299} 318}
300 319
301static int rcuhier_open(struct inode *inode, struct file *file) 320static int rcuhier_open(struct inode *inode, struct file *file)
302{ 321{
303 return single_open(file, show_rcuhier, inode->i_private); 322 return single_open(file, show_rcuhier, NULL);
304} 323}
305 324
306static const struct file_operations rcuhier_fops = { 325static const struct file_operations rcuhier_fops = {
307 .owner = THIS_MODULE, 326 .owner = THIS_MODULE,
308 .open = rcuhier_open, 327 .open = rcuhier_open,
309 .read = seq_read, 328 .read = seq_read,
310 .llseek = no_llseek, 329 .llseek = seq_lseek,
311 .release = seq_release, 330 .release = single_release,
312}; 331};
313 332
314static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 333static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -321,81 +340,95 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
321 struct rcu_node *rnp = &rsp->node[0]; 340 struct rcu_node *rnp = &rsp->node[0];
322 341
323 raw_spin_lock_irqsave(&rnp->lock, flags); 342 raw_spin_lock_irqsave(&rnp->lock, flags);
324 completed = ACCESS_ONCE(rsp->completed); 343 completed = rsp->completed;
325 gpnum = ACCESS_ONCE(rsp->gpnum); 344 gpnum = rsp->gpnum;
326 if (completed == gpnum) 345 if (rsp->completed == rsp->gpnum)
327 gpage = 0; 346 gpage = 0;
328 else 347 else
329 gpage = jiffies - rsp->gp_start; 348 gpage = jiffies - rsp->gp_start;
330 gpmax = rsp->gp_max; 349 gpmax = rsp->gp_max;
331 raw_spin_unlock_irqrestore(&rnp->lock, flags); 350 raw_spin_unlock_irqrestore(&rnp->lock, flags);
332 seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", 351 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
333 ulong2long(completed), ulong2long(gpnum), gpage, gpmax); 352 rsp->name, completed, gpnum, gpage, gpmax);
334} 353}
335 354
336static int show_rcugp(struct seq_file *m, void *v) 355static int show_rcugp(struct seq_file *m, void *unused)
337{ 356{
338 struct rcu_state *rsp = (struct rcu_state *)m->private; 357#ifdef CONFIG_TREE_PREEMPT_RCU
339 show_one_rcugp(m, rsp); 358 show_one_rcugp(m, &rcu_preempt_state);
359#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
360 show_one_rcugp(m, &rcu_sched_state);
361 show_one_rcugp(m, &rcu_bh_state);
340 return 0; 362 return 0;
341} 363}
342 364
343static int rcugp_open(struct inode *inode, struct file *file) 365static int rcugp_open(struct inode *inode, struct file *file)
344{ 366{
345 return single_open(file, show_rcugp, inode->i_private); 367 return single_open(file, show_rcugp, NULL);
346} 368}
347 369
348static const struct file_operations rcugp_fops = { 370static const struct file_operations rcugp_fops = {
349 .owner = THIS_MODULE, 371 .owner = THIS_MODULE,
350 .open = rcugp_open, 372 .open = rcugp_open,
351 .read = seq_read, 373 .read = seq_read,
352 .llseek = no_llseek, 374 .llseek = seq_lseek,
353 .release = seq_release, 375 .release = single_release,
354}; 376};
355 377
356static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 378static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
357{ 379{
358 if (!rdp->beenonline) 380 seq_printf(m, "%3d%cnp=%ld "
359 return; 381 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
360 seq_printf(m, "%3d%cnp=%ld ", 382 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
361 rdp->cpu, 383 rdp->cpu,
362 cpu_is_offline(rdp->cpu) ? '!' : ' ', 384 cpu_is_offline(rdp->cpu) ? '!' : ' ',
363 rdp->n_rcu_pending); 385 rdp->n_rcu_pending,
364 seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
365 rdp->n_rp_qs_pending, 386 rdp->n_rp_qs_pending,
366 rdp->n_rp_report_qs, 387 rdp->n_rp_report_qs,
367 rdp->n_rp_cb_ready, 388 rdp->n_rp_cb_ready,
368 rdp->n_rp_cpu_needs_gp); 389 rdp->n_rp_cpu_needs_gp,
369 seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n",
370 rdp->n_rp_gp_completed, 390 rdp->n_rp_gp_completed,
371 rdp->n_rp_gp_started, 391 rdp->n_rp_gp_started,
392 rdp->n_rp_need_fqs,
372 rdp->n_rp_need_nothing); 393 rdp->n_rp_need_nothing);
373} 394}
374 395
375static int show_rcu_pending(struct seq_file *m, void *v) 396static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
376{ 397{
377 print_one_rcu_pending(m, (struct rcu_data *)v); 398 int cpu;
378 return 0; 399 struct rcu_data *rdp;
400
401 for_each_possible_cpu(cpu) {
402 rdp = per_cpu_ptr(rsp->rda, cpu);
403 if (rdp->beenonline)
404 print_one_rcu_pending(m, rdp);
405 }
379} 406}
380 407
381static const struct seq_operations rcu_pending_op = { 408static int show_rcu_pending(struct seq_file *m, void *unused)
382 .start = r_start, 409{
383 .next = r_next, 410#ifdef CONFIG_TREE_PREEMPT_RCU
384 .stop = r_stop, 411 seq_puts(m, "rcu_preempt:\n");
385 .show = show_rcu_pending, 412 print_rcu_pendings(m, &rcu_preempt_state);
386}; 413#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
414 seq_puts(m, "rcu_sched:\n");
415 print_rcu_pendings(m, &rcu_sched_state);
416 seq_puts(m, "rcu_bh:\n");
417 print_rcu_pendings(m, &rcu_bh_state);
418 return 0;
419}
387 420
388static int rcu_pending_open(struct inode *inode, struct file *file) 421static int rcu_pending_open(struct inode *inode, struct file *file)
389{ 422{
390 return r_open(inode, file, &rcu_pending_op); 423 return single_open(file, show_rcu_pending, NULL);
391} 424}
392 425
393static const struct file_operations rcu_pending_fops = { 426static const struct file_operations rcu_pending_fops = {
394 .owner = THIS_MODULE, 427 .owner = THIS_MODULE,
395 .open = rcu_pending_open, 428 .open = rcu_pending_open,
396 .read = seq_read, 429 .read = seq_read,
397 .llseek = no_llseek, 430 .llseek = seq_lseek,
398 .release = seq_release, 431 .release = single_release,
399}; 432};
400 433
401static int show_rcutorture(struct seq_file *m, void *unused) 434static int show_rcutorture(struct seq_file *m, void *unused)
@@ -425,58 +458,38 @@ static struct dentry *rcudir;
425 458
426static int __init rcutree_trace_init(void) 459static int __init rcutree_trace_init(void)
427{ 460{
428 struct rcu_state *rsp;
429 struct dentry *retval; 461 struct dentry *retval;
430 struct dentry *rspdir;
431 462
432 rcudir = debugfs_create_dir("rcu", NULL); 463 rcudir = debugfs_create_dir("rcu", NULL);
433 if (!rcudir) 464 if (!rcudir)
434 goto free_out; 465 goto free_out;
435 466
436 for_each_rcu_flavor(rsp) { 467 retval = debugfs_create_file("rcudata", 0444, rcudir,
437 rspdir = debugfs_create_dir(rsp->name, rcudir); 468 NULL, &rcudata_fops);
438 if (!rspdir) 469 if (!retval)
439 goto free_out; 470 goto free_out;
440
441 retval = debugfs_create_file("rcudata", 0444,
442 rspdir, rsp, &rcudata_fops);
443 if (!retval)
444 goto free_out;
445
446 retval = debugfs_create_file("rcuexp", 0444,
447 rspdir, rsp, &rcuexp_fops);
448 if (!retval)
449 goto free_out;
450 471
451 retval = debugfs_create_file("rcu_pending", 0444, 472 retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
452 rspdir, rsp, &rcu_pending_fops); 473 NULL, &rcudata_csv_fops);
453 if (!retval) 474 if (!retval)
454 goto free_out; 475 goto free_out;
455 476
456 retval = debugfs_create_file("rcubarrier", 0444, 477 if (rcu_boost_trace_create_file(rcudir))
457 rspdir, rsp, &rcubarrier_fops); 478 goto free_out;
458 if (!retval)
459 goto free_out;
460 479
461#ifdef CONFIG_RCU_BOOST 480 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
462 if (rsp == &rcu_preempt_state) { 481 if (!retval)
463 retval = debugfs_create_file("rcuboost", 0444, 482 goto free_out;
464 rspdir, NULL, &rcu_node_boost_fops);
465 if (!retval)
466 goto free_out;
467 }
468#endif
469 483
470 retval = debugfs_create_file("rcugp", 0444, 484 retval = debugfs_create_file("rcuhier", 0444, rcudir,
471 rspdir, rsp, &rcugp_fops); 485 NULL, &rcuhier_fops);
472 if (!retval) 486 if (!retval)
473 goto free_out; 487 goto free_out;
474 488
475 retval = debugfs_create_file("rcuhier", 0444, 489 retval = debugfs_create_file("rcu_pending", 0444, rcudir,
476 rspdir, rsp, &rcuhier_fops); 490 NULL, &rcu_pending_fops);
477 if (!retval) 491 if (!retval)
478 goto free_out; 492 goto free_out;
479 }
480 493
481 retval = debugfs_create_file("rcutorture", 0444, rcudir, 494 retval = debugfs_create_file("rcutorture", 0444, rcudir,
482 NULL, &rcutorture_fops); 495 NULL, &rcutorture_fops);
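
[Editor's aside] Every trace file touched in the rcutree_trace.c hunks above follows the same debugfs + seq_file recipe: a show() routine, an open() that calls single_open(), and a file_operations wired to seq_read(), seq_lseek() and single_release(). Below is a small, self-contained sketch of that recipe as a stand-alone module; it is not part of the patch, and the names ("example", example_show(), the "status" file) are invented for illustration.

/*
 * Minimal single_open()/seq_file debugfs example (sketch, not part of
 * this patch).  Creates /sys/kernel/debug/example/status, which prints
 * one line each time it is read.
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *example_dir;

static int example_show(struct seq_file *m, void *unused)
{
	/* Whatever was passed to single_open() ends up in m->private. */
	const char *label = m->private ? (const char *)m->private
				       : "no private data";

	seq_printf(m, "example debugfs file: %s\n", label);
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, inode->i_private);
}

static const struct file_operations example_fops = {
	.owner   = THIS_MODULE,
	.open    = example_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init example_init(void)
{
	example_dir = debugfs_create_dir("example", NULL);
	if (!example_dir)
		return -ENOMEM;
	if (!debugfs_create_file("status", 0444, example_dir, NULL,
				 &example_fops)) {
		debugfs_remove_recursive(example_dir);
		return -ENOMEM;
	}
	return 0;
}

static void __exit example_exit(void)
{
	debugfs_remove_recursive(example_dir);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");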
diff --git a/kernel/relay.c b/kernel/relay.c
index e8cd2027abb..859ea5a9605 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -15,7 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stddef.h> 16#include <linux/stddef.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/export.h> 18#include <linux/module.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/relay.h> 20#include <linux/relay.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
@@ -164,14 +164,10 @@ depopulate:
164 */ 164 */
165static struct rchan_buf *relay_create_buf(struct rchan *chan) 165static struct rchan_buf *relay_create_buf(struct rchan *chan)
166{ 166{
167 struct rchan_buf *buf; 167 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
168
169 if (chan->n_subbufs > UINT_MAX / sizeof(size_t *))
170 return NULL;
171
172 buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
173 if (!buf) 168 if (!buf)
174 return NULL; 169 return NULL;
170
175 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); 171 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
176 if (!buf->padding) 172 if (!buf->padding)
177 goto free_buf; 173 goto free_buf;
@@ -306,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
306 */ 302 */
307static struct dentry *create_buf_file_default_callback(const char *filename, 303static struct dentry *create_buf_file_default_callback(const char *filename,
308 struct dentry *parent, 304 struct dentry *parent,
309 umode_t mode, 305 int mode,
310 struct rchan_buf *buf, 306 struct rchan_buf *buf,
311 int *is_global) 307 int *is_global)
312{ 308{
@@ -578,8 +574,6 @@ struct rchan *relay_open(const char *base_filename,
578 574
579 if (!(subbuf_size && n_subbufs)) 575 if (!(subbuf_size && n_subbufs))
580 return NULL; 576 return NULL;
581 if (subbuf_size > UINT_MAX / n_subbufs)
582 return NULL;
583 577
584 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); 578 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
585 if (!chan) 579 if (!chan)
@@ -1235,7 +1229,6 @@ static ssize_t subbuf_splice_actor(struct file *in,
1235 struct splice_pipe_desc spd = { 1229 struct splice_pipe_desc spd = {
1236 .pages = pages, 1230 .pages = pages,
1237 .nr_pages = 0, 1231 .nr_pages = 0,
1238 .nr_pages_max = PIPE_DEF_BUFFERS,
1239 .partial = partial, 1232 .partial = partial,
1240 .flags = flags, 1233 .flags = flags,
1241 .ops = &relay_pipe_buf_ops, 1234 .ops = &relay_pipe_buf_ops,
@@ -1303,8 +1296,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1303 ret += padding; 1296 ret += padding;
1304 1297
1305out: 1298out:
1306 splice_shrink_spd(&spd); 1299 splice_shrink_spd(pipe, &spd);
1307 return ret; 1300 return ret;
1308} 1301}
1309 1302
1310static ssize_t relay_file_splice_read(struct file *in, 1303static ssize_t relay_file_splice_read(struct file *in,
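
[Editor's aside] The left-hand side of the relay.c hunks above carries overflow guards of the form "n > UINT_MAX / size" in front of the kmalloc(n * size) calls in relay_create_buf() and relay_open(); those guards disappear on the right-hand side. The idiom itself is worth spelling out. A hedged userspace sketch follows; alloc_array() is an invented name, not a relay or kernel API.

/*
 * Sketch of the multiplication-overflow guard removed above: refuse an
 * allocation whose element count times element size would wrap around.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

static void *alloc_array(size_t nmemb, size_t size)
{
	/* If nmemb * size would overflow size_t, fail instead of wrapping. */
	if (size != 0 && nmemb > SIZE_MAX / size)
		return NULL;
	return malloc(nmemb * size);
}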
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ff55247e704..34683efa2cc 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,104 +22,72 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
25int res_counter_charge_locked(struct res_counter *counter, unsigned long val, 25int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
26 bool force)
27{ 26{
28 int ret = 0;
29
30 if (counter->usage + val > counter->limit) { 27 if (counter->usage + val > counter->limit) {
31 counter->failcnt++; 28 counter->failcnt++;
32 ret = -ENOMEM; 29 return -ENOMEM;
33 if (!force)
34 return ret;
35 } 30 }
36 31
37 counter->usage += val; 32 counter->usage += val;
38 if (counter->usage > counter->max_usage) 33 if (counter->usage > counter->max_usage)
39 counter->max_usage = counter->usage; 34 counter->max_usage = counter->usage;
40 return ret; 35 return 0;
41} 36}
42 37
43static int __res_counter_charge(struct res_counter *counter, unsigned long val, 38int res_counter_charge(struct res_counter *counter, unsigned long val,
44 struct res_counter **limit_fail_at, bool force) 39 struct res_counter **limit_fail_at)
45{ 40{
46 int ret, r; 41 int ret;
47 unsigned long flags; 42 unsigned long flags;
48 struct res_counter *c, *u; 43 struct res_counter *c, *u;
49 44
50 r = ret = 0;
51 *limit_fail_at = NULL; 45 *limit_fail_at = NULL;
52 local_irq_save(flags); 46 local_irq_save(flags);
53 for (c = counter; c != NULL; c = c->parent) { 47 for (c = counter; c != NULL; c = c->parent) {
54 spin_lock(&c->lock); 48 spin_lock(&c->lock);
55 r = res_counter_charge_locked(c, val, force); 49 ret = res_counter_charge_locked(c, val);
56 spin_unlock(&c->lock); 50 spin_unlock(&c->lock);
57 if (r < 0 && !ret) { 51 if (ret < 0) {
58 ret = r;
59 *limit_fail_at = c; 52 *limit_fail_at = c;
60 if (!force) 53 goto undo;
61 break;
62 } 54 }
63 } 55 }
64 56 ret = 0;
65 if (ret < 0 && !force) { 57 goto done;
66 for (u = counter; u != c; u = u->parent) { 58undo:
67 spin_lock(&u->lock); 59 for (u = counter; u != c; u = u->parent) {
68 res_counter_uncharge_locked(u, val); 60 spin_lock(&u->lock);
69 spin_unlock(&u->lock); 61 res_counter_uncharge_locked(u, val);
70 } 62 spin_unlock(&u->lock);
71 } 63 }
64done:
72 local_irq_restore(flags); 65 local_irq_restore(flags);
73
74 return ret; 66 return ret;
75} 67}
76 68
77int res_counter_charge(struct res_counter *counter, unsigned long val, 69void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
78 struct res_counter **limit_fail_at)
79{
80 return __res_counter_charge(counter, val, limit_fail_at, false);
81}
82
83int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
84 struct res_counter **limit_fail_at)
85{
86 return __res_counter_charge(counter, val, limit_fail_at, true);
87}
88
89u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
90{ 70{
91 if (WARN_ON(counter->usage < val)) 71 if (WARN_ON(counter->usage < val))
92 val = counter->usage; 72 val = counter->usage;
93 73
94 counter->usage -= val; 74 counter->usage -= val;
95 return counter->usage;
96} 75}
97 76
98u64 res_counter_uncharge_until(struct res_counter *counter, 77void res_counter_uncharge(struct res_counter *counter, unsigned long val)
99 struct res_counter *top,
100 unsigned long val)
101{ 78{
102 unsigned long flags; 79 unsigned long flags;
103 struct res_counter *c; 80 struct res_counter *c;
104 u64 ret = 0;
105 81
106 local_irq_save(flags); 82 local_irq_save(flags);
107 for (c = counter; c != top; c = c->parent) { 83 for (c = counter; c != NULL; c = c->parent) {
108 u64 r;
109 spin_lock(&c->lock); 84 spin_lock(&c->lock);
110 r = res_counter_uncharge_locked(c, val); 85 res_counter_uncharge_locked(c, val);
111 if (c == counter)
112 ret = r;
113 spin_unlock(&c->lock); 86 spin_unlock(&c->lock);
114 } 87 }
115 local_irq_restore(flags); 88 local_irq_restore(flags);
116 return ret;
117} 89}
118 90
119u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
120{
121 return res_counter_uncharge_until(counter, NULL, val);
122}
123 91
124static inline unsigned long long * 92static inline unsigned long long *
125res_counter_member(struct res_counter *counter, int member) 93res_counter_member(struct res_counter *counter, int member)
@@ -191,10 +159,33 @@ int res_counter_memparse_write_strategy(const char *buf,
191 return 0; 159 return 0;
192 } 160 }
193 161
194 *res = memparse(buf, &end); 162 /* FIXME - make memparse() take const char* args */
163 *res = memparse((char *)buf, &end);
195 if (*end != '\0') 164 if (*end != '\0')
196 return -EINVAL; 165 return -EINVAL;
197 166
198 *res = PAGE_ALIGN(*res); 167 *res = PAGE_ALIGN(*res);
199 return 0; 168 return 0;
200} 169}
170
171int res_counter_write(struct res_counter *counter, int member,
172 const char *buf, write_strategy_fn write_strategy)
173{
174 char *end;
175 unsigned long flags;
176 unsigned long long tmp, *val;
177
178 if (write_strategy) {
179 if (write_strategy(buf, &tmp))
180 return -EINVAL;
181 } else {
182 tmp = simple_strtoull(buf, &end, 10);
183 if (*end != '\0')
184 return -EINVAL;
185 }
186 spin_lock_irqsave(&counter->lock, flags);
187 val = res_counter_member(counter, member);
188 *val = tmp;
189 spin_unlock_irqrestore(&counter->lock, flags);
190 return 0;
191}
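
[Editor's aside] Both versions of res_counter_charge() above charge every counter from the child up to the root and, if any ancestor is over its limit, walk back down uncharging whatever was already taken. The sketch below strips that charge-with-rollback pattern down to userspace C, without the spinlocks, the irq handling, or the force/nofail variant; struct counter and the function names are invented for illustration.

/*
 * Hierarchical charge with rollback (userspace sketch, single-threaded).
 */
#include <stdbool.h>
#include <stdio.h>

struct counter {
	unsigned long usage;
	unsigned long limit;
	struct counter *parent;
};

static bool charge_one(struct counter *c, unsigned long val)
{
	if (c->usage + val > c->limit)
		return false;
	c->usage += val;
	return true;
}

/* Charge every level from c up to the root, undoing everything on failure. */
static int charge_hierarchy(struct counter *c, unsigned long val)
{
	struct counter *pos, *undo;

	for (pos = c; pos; pos = pos->parent) {
		if (!charge_one(pos, val))
			goto rollback;
	}
	return 0;

rollback:
	for (undo = c; undo != pos; undo = undo->parent)
		undo->usage -= val;
	return -1;
}

int main(void)
{
	struct counter root  = { .limit = 100 };
	struct counter child = { .limit = 1000, .parent = &root };

	printf("charge 80: %d\n", charge_hierarchy(&child, 80));  /* succeeds */
	printf("charge 50: %d\n", charge_hierarchy(&child, 50));  /* root over limit */
	printf("child usage after rollback: %lu\n", child.usage); /* still 80 */
	return 0;
}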
diff --git a/kernel/resource.c b/kernel/resource.c
index 73f35d4b30b..c8dc249da5c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,9 +7,7 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10#include <linux/module.h>
11
12#include <linux/export.h>
13#include <linux/errno.h> 11#include <linux/errno.h>
14#include <linux/ioport.h> 12#include <linux/ioport.h>
15#include <linux/init.h> 13#include <linux/init.h>
@@ -517,8 +515,8 @@ out:
517 * @root: root resource descriptor 515 * @root: root resource descriptor
518 * @new: resource descriptor desired by caller 516 * @new: resource descriptor desired by caller
519 * @size: requested resource region size 517 * @size: requested resource region size
520 * @min: minimum boundary to allocate 518 * @min: minimum size to allocate
521 * @max: maximum boundary to allocate 519 * @max: maximum size to allocate
522 * @align: alignment requested, in bytes 520 * @align: alignment requested, in bytes
523 * @alignf: alignment function, optional, called if not NULL 521 * @alignf: alignment function, optional, called if not NULL
524 * @alignf_data: arbitrary data to pass to the @alignf function 522 * @alignf_data: arbitrary data to pass to the @alignf function
@@ -724,12 +722,14 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
724 722
725 write_lock(&resource_lock); 723 write_lock(&resource_lock);
726 724
727 if (!parent)
728 goto skip;
729
730 if ((start < parent->start) || (end > parent->end)) 725 if ((start < parent->start) || (end > parent->end))
731 goto out; 726 goto out;
732 727
728 for (tmp = res->child; tmp; tmp = tmp->sibling) {
729 if ((tmp->start < start) || (tmp->end > end))
730 goto out;
731 }
732
733 if (res->sibling && (res->sibling->start <= end)) 733 if (res->sibling && (res->sibling->start <= end))
734 goto out; 734 goto out;
735 735
@@ -741,11 +741,6 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
741 goto out; 741 goto out;
742 } 742 }
743 743
744skip:
745 for (tmp = res->child; tmp; tmp = tmp->sibling)
746 if ((tmp->start < start) || (tmp->end > end))
747 goto out;
748
749 res->start = start; 744 res->start = start;
750 res->end = end; 745 res->end = end;
751 result = 0; 746 result = 0;
@@ -754,7 +749,6 @@ skip:
754 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
755 return result; 750 return result;
756} 751}
757EXPORT_SYMBOL(adjust_resource);
758 752
759static void __init __reserve_region_with_split(struct resource *root, 753static void __init __reserve_region_with_split(struct resource *root,
760 resource_size_t start, resource_size_t end, 754 resource_size_t start, resource_size_t end,
@@ -763,7 +757,6 @@ static void __init __reserve_region_with_split(struct resource *root,
763 struct resource *parent = root; 757 struct resource *parent = root;
764 struct resource *conflict; 758 struct resource *conflict;
765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); 759 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
766 struct resource *next_res = NULL;
767 760
768 if (!res) 761 if (!res)
769 return; 762 return;
@@ -773,77 +766,34 @@ static void __init __reserve_region_with_split(struct resource *root,
773 res->end = end; 766 res->end = end;
774 res->flags = IORESOURCE_BUSY; 767 res->flags = IORESOURCE_BUSY;
775 768
776 while (1) { 769 conflict = __request_resource(parent, res);
777 770 if (!conflict)
778 conflict = __request_resource(parent, res); 771 return;
779 if (!conflict) {
780 if (!next_res)
781 break;
782 res = next_res;
783 next_res = NULL;
784 continue;
785 }
786 772
787 /* conflict covered whole area */ 773 /* failed, split and try again */
788 if (conflict->start <= res->start && 774 kfree(res);
789 conflict->end >= res->end) {
790 kfree(res);
791 WARN_ON(next_res);
792 break;
793 }
794 775
795 /* failed, split and try again */ 776 /* conflict covered whole area */
796 if (conflict->start > res->start) { 777 if (conflict->start <= start && conflict->end >= end)
797 end = res->end; 778 return;
798 res->end = conflict->start - 1;
799 if (conflict->end < end) {
800 next_res = kzalloc(sizeof(*next_res),
801 GFP_ATOMIC);
802 if (!next_res) {
803 kfree(res);
804 break;
805 }
806 next_res->name = name;
807 next_res->start = conflict->end + 1;
808 next_res->end = end;
809 next_res->flags = IORESOURCE_BUSY;
810 }
811 } else {
812 res->start = conflict->end + 1;
813 }
814 }
815 779
780 if (conflict->start > start)
781 __reserve_region_with_split(root, start, conflict->start-1, name);
782 if (conflict->end < end)
783 __reserve_region_with_split(root, conflict->end+1, end, name);
816} 784}
817 785
818void __init reserve_region_with_split(struct resource *root, 786void __init reserve_region_with_split(struct resource *root,
819 resource_size_t start, resource_size_t end, 787 resource_size_t start, resource_size_t end,
820 const char *name) 788 const char *name)
821{ 789{
822 int abort = 0;
823
824 write_lock(&resource_lock); 790 write_lock(&resource_lock);
825 if (root->start > start || root->end < end) { 791 __reserve_region_with_split(root, start, end, name);
826 pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
827 (unsigned long long)start, (unsigned long long)end,
828 root);
829 if (start > root->end || end < root->start)
830 abort = 1;
831 else {
832 if (end > root->end)
833 end = root->end;
834 if (start < root->start)
835 start = root->start;
836 pr_err("fixing request to [0x%llx-0x%llx]\n",
837 (unsigned long long)start,
838 (unsigned long long)end);
839 }
840 dump_stack();
841 }
842 if (!abort)
843 __reserve_region_with_split(root, start, end, name);
844 write_unlock(&resource_lock); 792 write_unlock(&resource_lock);
845} 793}
846 794
795EXPORT_SYMBOL(adjust_resource);
796
847/** 797/**
848 * resource_alignment - calculate resource's alignment 798 * resource_alignment - calculate resource's alignment
849 * @res: resource pointer 799 * @res: resource pointer
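
[Editor's aside] The right-hand version of __reserve_region_with_split() above handles a conflicting reservation by recursing into the gaps on either side of the conflict, whereas the left-hand version flattens the same idea into an iterative loop. A small userspace sketch of the split-and-recurse approach over a plain interval list follows; find_conflict() and reserve_with_split() are invented names, and the real code inserts struct resource nodes into the resource tree instead of printing.

/*
 * Recursive "split around the conflict" reservation (userspace sketch).
 */
#include <stdio.h>

struct region {
	unsigned long start, end;	/* inclusive bounds */
	struct region *next;
};

/* Return the first already-reserved region overlapping [start, end]. */
static struct region *find_conflict(struct region *reserved,
				    unsigned long start, unsigned long end)
{
	for (; reserved; reserved = reserved->next)
		if (reserved->start <= end && reserved->end >= start)
			return reserved;
	return NULL;
}

static void reserve_with_split(struct region *reserved,
			       unsigned long start, unsigned long end)
{
	struct region *conflict = find_conflict(reserved, start, end);

	if (!conflict) {
		printf("reserve [%lx-%lx]\n", start, end);
		return;
	}
	/* Conflict swallows the whole request: nothing left to reserve. */
	if (conflict->start <= start && conflict->end >= end)
		return;
	/* Otherwise recurse into the pieces on either side of the conflict. */
	if (conflict->start > start)
		reserve_with_split(reserved, start, conflict->start - 1);
	if (conflict->end < end)
		reserve_with_split(reserved, conflict->end + 1, end);
}

int main(void)
{
	struct region busy = { .start = 0x100, .end = 0x1ff };

	reserve_with_split(&busy, 0x000, 0x2ff);	/* prints two pieces */
	return 0;
}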
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 16502d3a71c..3c7cbc2c33b 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -18,7 +18,7 @@
18 */ 18 */
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/export.h> 21#include <linux/module.h>
22#include <linux/spinlock.h> 22#include <linux/spinlock.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
@@ -29,6 +29,61 @@
29 29
30#include "rtmutex_common.h" 30#include "rtmutex_common.h"
31 31
32# define TRACE_WARN_ON(x) WARN_ON(x)
33# define TRACE_BUG_ON(x) BUG_ON(x)
34
35# define TRACE_OFF() \
36do { \
37 if (rt_trace_on) { \
38 rt_trace_on = 0; \
39 console_verbose(); \
40 if (raw_spin_is_locked(&current->pi_lock)) \
41 raw_spin_unlock(&current->pi_lock); \
42 } \
43} while (0)
44
45# define TRACE_OFF_NOLOCK() \
46do { \
47 if (rt_trace_on) { \
48 rt_trace_on = 0; \
49 console_verbose(); \
50 } \
51} while (0)
52
53# define TRACE_BUG_LOCKED() \
54do { \
55 TRACE_OFF(); \
56 BUG(); \
57} while (0)
58
59# define TRACE_WARN_ON_LOCKED(c) \
60do { \
61 if (unlikely(c)) { \
62 TRACE_OFF(); \
63 WARN_ON(1); \
64 } \
65} while (0)
66
67# define TRACE_BUG_ON_LOCKED(c) \
68do { \
69 if (unlikely(c)) \
70 TRACE_BUG_LOCKED(); \
71} while (0)
72
73#ifdef CONFIG_SMP
74# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
75#else
76# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
77#endif
78
79/*
80 * deadlock detection flag. We turn it off when we detect
81 * the first problem because we dont want to recurse back
82 * into the tracing code when doing error printk or
83 * executing a BUG():
84 */
85static int rt_trace_on = 1;
86
32static void printk_task(struct task_struct *p) 87static void printk_task(struct task_struct *p)
33{ 88{
34 if (p) 89 if (p)
@@ -56,8 +111,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
56 111
57void rt_mutex_debug_task_free(struct task_struct *task) 112void rt_mutex_debug_task_free(struct task_struct *task)
58{ 113{
59 DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); 114 WARN_ON(!plist_head_empty(&task->pi_waiters));
60 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); 115 WARN_ON(task->pi_blocked_on);
61} 116}
62 117
63/* 118/*
@@ -70,7 +125,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
70{ 125{
71 struct task_struct *task; 126 struct task_struct *task;
72 127
73 if (!debug_locks || detect || !act_waiter) 128 if (!rt_trace_on || detect || !act_waiter)
74 return; 129 return;
75 130
76 task = rt_mutex_owner(act_waiter->lock); 131 task = rt_mutex_owner(act_waiter->lock);
@@ -84,7 +139,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
84{ 139{
85 struct task_struct *task; 140 struct task_struct *task;
86 141
87 if (!waiter->deadlock_lock || !debug_locks) 142 if (!waiter->deadlock_lock || !rt_trace_on)
88 return; 143 return;
89 144
90 rcu_read_lock(); 145 rcu_read_lock();
@@ -94,14 +149,10 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
94 return; 149 return;
95 } 150 }
96 151
97 if (!debug_locks_off()) { 152 TRACE_OFF_NOLOCK();
98 rcu_read_unlock();
99 return;
100 }
101 153
102 printk("\n============================================\n"); 154 printk("\n============================================\n");
103 printk( "[ BUG: circular locking deadlock detected! ]\n"); 155 printk( "[ BUG: circular locking deadlock detected! ]\n");
104 printk("%s\n", print_tainted());
105 printk( "--------------------------------------------\n"); 156 printk( "--------------------------------------------\n");
106 printk("%s/%d is deadlocking current task %s/%d\n\n", 157 printk("%s/%d is deadlocking current task %s/%d\n\n",
107 task->comm, task_pid_nr(task), 158 task->comm, task_pid_nr(task),
@@ -129,6 +180,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
129 180
130 printk("[ turning off deadlock detection." 181 printk("[ turning off deadlock detection."
131 "Please report this trace. ]\n\n"); 182 "Please report this trace. ]\n\n");
183 local_irq_disable();
132} 184}
133 185
134void debug_rt_mutex_lock(struct rt_mutex *lock) 186void debug_rt_mutex_lock(struct rt_mutex *lock)
@@ -137,7 +189,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock)
137 189
138void debug_rt_mutex_unlock(struct rt_mutex *lock) 190void debug_rt_mutex_unlock(struct rt_mutex *lock)
139{ 191{
140 DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); 192 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
141} 193}
142 194
143void 195void
@@ -147,7 +199,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
147 199
148void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) 200void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
149{ 201{
150 DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); 202 TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
151} 203}
152 204
153void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 205void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -161,8 +213,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
161void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 213void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
162{ 214{
163 put_pid(waiter->deadlock_task_pid); 215 put_pid(waiter->deadlock_task_pid);
164 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
165 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
166 memset(waiter, 0x22, sizeof(*waiter)); 218 memset(waiter, 0x22, sizeof(*waiter));
167} 219}
168 220
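
[Editor's aside] Both columns of the rtmutex-debug.c hunks above implement the same one-shot latch: report the first deadlock, then suppress all further reporting so the debug code does not recurse into itself (debug_locks_off() on one side, the rt_trace_on flag plus TRACE_OFF() on the other). The userspace sketch below shows that idiom in isolation; debug_off_once() and report_deadlock() are invented names.

/*
 * "Report once, then go quiet" latch (userspace sketch).
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int debug_on = 1;

/* Returns nonzero only for the caller that wins the race to disable debugging. */
static int debug_off_once(void)
{
	return atomic_exchange(&debug_on, 0);
}

static void report_deadlock(const char *what)
{
	if (!debug_off_once())
		return;		/* someone already reported; stay quiet */
	fprintf(stderr, "BUG: %s (further reports suppressed)\n", what);
}

int main(void)
{
	report_deadlock("circular locking deadlock");	/* prints */
	report_deadlock("circular locking deadlock");	/* silent */
	return 0;
}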
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 98ec4947546..5c9ccd38096 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,11 +6,11 @@
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> 6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 * 7 *
8 */ 8 */
9#include <linux/device.h>
10#include <linux/kthread.h> 9#include <linux/kthread.h>
11#include <linux/export.h> 10#include <linux/module.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/sysdev.h>
14#include <linux/timer.h> 14#include <linux/timer.h>
15#include <linux/freezer.h> 15#include <linux/freezer.h>
16 16
@@ -27,7 +27,7 @@ struct test_thread_data {
27 int opdata; 27 int opdata;
28 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
29 int event; 29 int event;
30 struct device dev; 30 struct sys_device sysdev;
31}; 31};
32 32
33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; 33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
@@ -271,7 +271,7 @@ static int test_func(void *data)
271 * 271 *
272 * opcode:data 272 * opcode:data
273 */ 273 */
274static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, 274static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
275 const char *buf, size_t count) 275 const char *buf, size_t count)
276{ 276{
277 struct sched_param schedpar; 277 struct sched_param schedpar;
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *a
279 char cmdbuf[32]; 279 char cmdbuf[32];
280 int op, dat, tid, ret; 280 int op, dat, tid, ret;
281 281
282 td = container_of(dev, struct test_thread_data, dev); 282 td = container_of(dev, struct test_thread_data, sysdev);
283 tid = td->dev.id; 283 tid = td->sysdev.id;
284 284
285 /* strings from sysfs write are not 0 terminated! */ 285 /* strings from sysfs write are not 0 terminated! */
286 if (count >= sizeof(cmdbuf)) 286 if (count >= sizeof(cmdbuf))
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *a
334 * @dev: thread to query 334 * @dev: thread to query
335 * @buf: char buffer to be filled with thread status info 335 * @buf: char buffer to be filled with thread status info
336 */ 336 */
337static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, 337static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
338 char *buf) 338 char *buf)
339{ 339{
340 struct test_thread_data *td; 340 struct test_thread_data *td;
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
342 char *curr = buf; 342 char *curr = buf;
343 int i; 343 int i;
344 344
345 td = container_of(dev, struct test_thread_data, dev); 345 td = container_of(dev, struct test_thread_data, sysdev);
346 tsk = threads[td->dev.id]; 346 tsk = threads[td->sysdev.id];
347 347
348 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
349 349
@@ -360,29 +360,28 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
360 spin_unlock(&rttest_lock); 360 spin_unlock(&rttest_lock);
361 361
362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk, 362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
363 mutexes[td->dev.id].owner); 363 mutexes[td->sysdev.id].owner);
364 364
365 return curr - buf; 365 return curr - buf;
366} 366}
367 367
368static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); 368static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
369static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); 369static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
370 370
371static struct bus_type rttest_subsys = { 371static struct sysdev_class rttest_sysclass = {
372 .name = "rttest", 372 .name = "rttest",
373 .dev_name = "rttest",
374}; 373};
375 374
376static int init_test_thread(int id) 375static int init_test_thread(int id)
377{ 376{
378 thread_data[id].dev.bus = &rttest_subsys; 377 thread_data[id].sysdev.cls = &rttest_sysclass;
379 thread_data[id].dev.id = id; 378 thread_data[id].sysdev.id = id;
380 379
381 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); 380 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
382 if (IS_ERR(threads[id])) 381 if (IS_ERR(threads[id]))
383 return PTR_ERR(threads[id]); 382 return PTR_ERR(threads[id]);
384 383
385 return device_register(&thread_data[id].dev); 384 return sysdev_register(&thread_data[id].sysdev);
386} 385}
387 386
388static int init_rttest(void) 387static int init_rttest(void)
@@ -394,7 +393,7 @@ static int init_rttest(void)
394 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) 393 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
395 rt_mutex_init(&mutexes[i]); 394 rt_mutex_init(&mutexes[i]);
396 395
397 ret = subsys_system_register(&rttest_subsys, NULL); 396 ret = sysdev_class_register(&rttest_sysclass);
398 if (ret) 397 if (ret)
399 return ret; 398 return ret;
400 399
@@ -402,10 +401,10 @@ static int init_rttest(void)
402 ret = init_test_thread(i); 401 ret = init_test_thread(i);
403 if (ret) 402 if (ret)
404 break; 403 break;
405 ret = device_create_file(&thread_data[i].dev, &dev_attr_status); 404 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
406 if (ret) 405 if (ret)
407 break; 406 break;
408 ret = device_create_file(&thread_data[i].dev, &dev_attr_command); 407 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
409 if (ret) 408 if (ret)
410 break; 409 break;
411 } 410 }
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a242e691c99..255e1662acd 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -11,7 +11,7 @@
11 * See Documentation/rt-mutex-design.txt for details. 11 * See Documentation/rt-mutex-design.txt for details.
12 */ 12 */
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/export.h> 14#include <linux/module.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/timer.h> 16#include <linux/timer.h>
17 17
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b3c6c3fcd84..9f48f3d82e9 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -7,9 +7,10 @@
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/export.h> 10#include <linux/module.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h>
13#include <linux/atomic.h> 14#include <linux/atomic.h>
14 15
15/* 16/*
@@ -116,16 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
116 117
117EXPORT_SYMBOL(down_read_nested); 118EXPORT_SYMBOL(down_read_nested);
118 119
119void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
120{
121 might_sleep();
122 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
123
124 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
125}
126
127EXPORT_SYMBOL(_down_write_nest_lock);
128
129void down_write_nested(struct rw_semaphore *sem, int subclass) 120void down_write_nested(struct rw_semaphore *sem, int subclass)
130{ 121{
131 might_sleep(); 122 might_sleep();
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
deleted file mode 100644
index f06d249e103..00000000000
--- a/kernel/sched/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
14obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
deleted file mode 100644
index 0984a21076a..00000000000
--- a/kernel/sched/auto_group.c
+++ /dev/null
@@ -1,258 +0,0 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include "sched.h"
4
5#include <linux/proc_fs.h>
6#include <linux/seq_file.h>
7#include <linux/kallsyms.h>
8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
11
12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
13static struct autogroup autogroup_default;
14static atomic_t autogroup_seq_nr;
15
16void __init autogroup_init(struct task_struct *init_task)
17{
18 autogroup_default.tg = &root_task_group;
19 kref_init(&autogroup_default.kref);
20 init_rwsem(&autogroup_default.lock);
21 init_task->signal->autogroup = &autogroup_default;
22}
23
24void autogroup_free(struct task_group *tg)
25{
26 kfree(tg->autogroup);
27}
28
29static inline void autogroup_destroy(struct kref *kref)
30{
31 struct autogroup *ag = container_of(kref, struct autogroup, kref);
32
33#ifdef CONFIG_RT_GROUP_SCHED
34 /* We've redirected RT tasks to the root task group... */
35 ag->tg->rt_se = NULL;
36 ag->tg->rt_rq = NULL;
37#endif
38 sched_destroy_group(ag->tg);
39}
40
41static inline void autogroup_kref_put(struct autogroup *ag)
42{
43 kref_put(&ag->kref, autogroup_destroy);
44}
45
46static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
47{
48 kref_get(&ag->kref);
49 return ag;
50}
51
52static inline struct autogroup *autogroup_task_get(struct task_struct *p)
53{
54 struct autogroup *ag;
55 unsigned long flags;
56
57 if (!lock_task_sighand(p, &flags))
58 return autogroup_kref_get(&autogroup_default);
59
60 ag = autogroup_kref_get(p->signal->autogroup);
61 unlock_task_sighand(p, &flags);
62
63 return ag;
64}
65
66static inline struct autogroup *autogroup_create(void)
67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
69 struct task_group *tg;
70
71 if (!ag)
72 goto out_fail;
73
74 tg = sched_create_group(&root_task_group);
75
76 if (IS_ERR(tg))
77 goto out_free;
78
79 kref_init(&ag->kref);
80 init_rwsem(&ag->lock);
81 ag->id = atomic_inc_return(&autogroup_seq_nr);
82 ag->tg = tg;
83#ifdef CONFIG_RT_GROUP_SCHED
84 /*
85 * Autogroup RT tasks are redirected to the root task group
86 * so we don't have to move tasks around upon policy change,
87 * or flail around trying to allocate bandwidth on the fly.
88 * A bandwidth exception in __sched_setscheduler() allows
89 * the policy change to proceed. Thereafter, task_group()
90 * returns &root_task_group, so zero bandwidth is required.
91 */
92 free_rt_sched_group(tg);
93 tg->rt_se = root_task_group.rt_se;
94 tg->rt_rq = root_task_group.rt_rq;
95#endif
96 tg->autogroup = ag;
97
98 return ag;
99
100out_free:
101 kfree(ag);
102out_fail:
103 if (printk_ratelimit()) {
104 printk(KERN_WARNING "autogroup_create: %s failure.\n",
105 ag ? "sched_create_group()" : "kmalloc()");
106 }
107
108 return autogroup_kref_get(&autogroup_default);
109}
110
111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112{
113 if (tg != &root_task_group)
114 return false;
115
116 if (p->sched_class != &fair_sched_class)
117 return false;
118
119 /*
120 * We can only assume the task group can't go away on us if
121 * autogroup_move_group() can see us on ->thread_group list.
122 */
123 if (p->flags & PF_EXITING)
124 return false;
125
126 return true;
127}
128
129static void
130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
131{
132 struct autogroup *prev;
133 struct task_struct *t;
134 unsigned long flags;
135
136 BUG_ON(!lock_task_sighand(p, &flags));
137
138 prev = p->signal->autogroup;
139 if (prev == ag) {
140 unlock_task_sighand(p, &flags);
141 return;
142 }
143
144 p->signal->autogroup = autogroup_kref_get(ag);
145
146 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
147 goto out;
148
149 t = p;
150 do {
151 sched_move_task(t);
152 } while_each_thread(p, t);
153
154out:
155 unlock_task_sighand(p, &flags);
156 autogroup_kref_put(prev);
157}
158
159/* Allocates GFP_KERNEL, cannot be called under any spinlock */
160void sched_autogroup_create_attach(struct task_struct *p)
161{
162 struct autogroup *ag = autogroup_create();
163
164 autogroup_move_group(p, ag);
165 /* drop extra reference added by autogroup_create() */
166 autogroup_kref_put(ag);
167}
168EXPORT_SYMBOL(sched_autogroup_create_attach);
169
170/* Cannot be called under siglock. Currently has no users */
171void sched_autogroup_detach(struct task_struct *p)
172{
173 autogroup_move_group(p, &autogroup_default);
174}
175EXPORT_SYMBOL(sched_autogroup_detach);
176
177void sched_autogroup_fork(struct signal_struct *sig)
178{
179 sig->autogroup = autogroup_task_get(current);
180}
181
182void sched_autogroup_exit(struct signal_struct *sig)
183{
184 autogroup_kref_put(sig->autogroup);
185}
186
187static int __init setup_autogroup(char *str)
188{
189 sysctl_sched_autogroup_enabled = 0;
190
191 return 1;
192}
193
194__setup("noautogroup", setup_autogroup);
195
196#ifdef CONFIG_PROC_FS
197
198int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
199{
200 static unsigned long next = INITIAL_JIFFIES;
201 struct autogroup *ag;
202 int err;
203
204 if (nice < -20 || nice > 19)
205 return -EINVAL;
206
207 err = security_task_setnice(current, nice);
208 if (err)
209 return err;
210
211 if (nice < 0 && !can_nice(current, nice))
212 return -EPERM;
213
214 /* this is a heavy operation taking global locks.. */
215 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
216 return -EAGAIN;
217
218 next = HZ / 10 + jiffies;
219 ag = autogroup_task_get(p);
220
221 down_write(&ag->lock);
222 err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
223 if (!err)
224 ag->nice = nice;
225 up_write(&ag->lock);
226
227 autogroup_kref_put(ag);
228
229 return err;
230}
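
A rough illustration of the weight lookup above, assuming the scheduler's standard prio_to_weight[] table (the exact values are not part of this patch):

/*
 * Example: prio_to_weight[nice + 20] means nice 0 selects index 20
 * (weight 1024), and each nice step changes the weight by roughly a
 * factor of 1.25, so setting the autogroup nice level scales its CFS
 * share the same way renicing a single task would.
 */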
231
232void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
233{
234 struct autogroup *ag = autogroup_task_get(p);
235
236 if (!task_group_is_autogroup(ag->tg))
237 goto out;
238
239 down_read(&ag->lock);
240 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
241 up_read(&ag->lock);
242
243out:
244 autogroup_kref_put(ag);
245}
246#endif /* CONFIG_PROC_FS */
247
248#ifdef CONFIG_SCHED_DEBUG
249int autogroup_path(struct task_group *tg, char *buf, int buflen)
250{
251 if (!task_group_is_autogroup(tg))
252 return 0;
253
254 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
255}
256#endif /* CONFIG_SCHED_DEBUG */
257
258#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
deleted file mode 100644
index 8bd04714281..00000000000
--- a/kernel/sched/auto_group.h
+++ /dev/null
@@ -1,64 +0,0 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
6struct autogroup {
7 /*
8	 * The reference count doesn't track how many threads are attached to
9	 * this autogroup right now; it only stands for the number of tasks
10	 * that could use this autogroup.
11 */
12 struct kref kref;
13 struct task_group *tg;
14 struct rw_semaphore lock;
15 unsigned long id;
16 int nice;
17};
18
19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
29static inline struct task_group *
30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
41
42#else /* !CONFIG_SCHED_AUTOGROUP */
43
44static inline void autogroup_init(struct task_struct *init_task) { }
45static inline void autogroup_free(struct task_group *tg) { }
46static inline bool task_group_is_autogroup(struct task_group *tg)
47{
48 return 0;
49}
50
51static inline struct task_group *
52autogroup_task_group(struct task_struct *p, struct task_group *tg)
53{
54 return tg;
55}
56
57#ifdef CONFIG_SCHED_DEBUG
58static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
59{
60 return 0;
61}
62#endif
63
64#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
deleted file mode 100644
index c685e31492d..00000000000
--- a/kernel/sched/clock.c
+++ /dev/null
@@ -1,350 +0,0 @@
1/*
2 * sched_clock for unstable cpu clocks
3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 * Updates and enhancements:
7 * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
8 *
9 * Based on code by:
10 * Ingo Molnar <mingo@redhat.com>
11 * Guillaume Chazarain <guichaz@gmail.com>
12 *
13 *
14 * What:
15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
18 * is monotonic for constant i. The timestamp returned is in nanoseconds.
19 *
20 * ######################### BIG FAT WARNING ##########################
21 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
22 * # go backwards !! #
23 * ####################################################################
24 *
25 * There is no strict promise about the base, although it tends to start
26 * at 0 on boot (but people really shouldn't rely on that).
27 *
28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu.
31 *
32 * How:
33 *
34 * The implementation either uses sched_clock() when
35 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
36 * sched_clock() is assumed to provide these properties (mostly it means
37 * the architecture provides a globally synchronized highres time source).
38 *
39 * Otherwise it tries to create a semi stable clock from a mixture of other
40 * clocks, including:
41 *
42  * - GTOD (clock monotonic)
43 * - sched_clock()
44 * - explicit idle events
45 *
46 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
47  * deltas are filtered to provide monotonicity while keeping it within an
48 * expected window.
49 *
50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
51 * that is otherwise invisible (TSC gets stopped).
52 *
53 *
54 * Notes:
55 *
56  * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59  * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
62 */
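
A minimal sketch of the usage rules above, assuming the usual kernel headers already included by this file; example_time_section() is a hypothetical helper, and preemption is disabled so both timestamps come from the same CPU, which is what the per-CPU monotonicity guarantee requires:

static void example_time_section(void)
{
	u64 t0, t1;

	preempt_disable();		/* keep both reads on one CPU */
	t0 = local_clock();
	/* ... short piece of work to be timed ... */
	t1 = local_clock();
	preempt_enable();

	pr_info("section took %llu ns\n", (unsigned long long)(t1 - t0));
}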
63#include <linux/spinlock.h>
64#include <linux/hardirq.h>
65#include <linux/export.h>
66#include <linux/percpu.h>
67#include <linux/ktime.h>
68#include <linux/sched.h>
69
70/*
71 * Scheduler clock - returns current time in nanosec units.
72  * This is the default implementation.
73 * Architectures and sub-architectures can override this.
74 */
75unsigned long long __attribute__((weak)) sched_clock(void)
76{
77 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
78 * (NSEC_PER_SEC / HZ);
79}
80EXPORT_SYMBOL_GPL(sched_clock);
81
82__read_mostly int sched_clock_running;
83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable;
86
87struct sched_clock_data {
88 u64 tick_raw;
89 u64 tick_gtod;
90 u64 clock;
91};
92
93static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
94
95static inline struct sched_clock_data *this_scd(void)
96{
97 return &__get_cpu_var(sched_clock_data);
98}
99
100static inline struct sched_clock_data *cpu_sdc(int cpu)
101{
102 return &per_cpu(sched_clock_data, cpu);
103}
104
105void sched_clock_init(void)
106{
107 u64 ktime_now = ktime_to_ns(ktime_get());
108 int cpu;
109
110 for_each_possible_cpu(cpu) {
111 struct sched_clock_data *scd = cpu_sdc(cpu);
112
113 scd->tick_raw = 0;
114 scd->tick_gtod = ktime_now;
115 scd->clock = ktime_now;
116 }
117
118 sched_clock_running = 1;
119}
120
121/*
122 * min, max except they take wrapping into account
123 */
124
125static inline u64 wrap_min(u64 x, u64 y)
126{
127 return (s64)(x - y) < 0 ? x : y;
128}
129
130static inline u64 wrap_max(u64 x, u64 y)
131{
132 return (s64)(x - y) > 0 ? x : y;
133}
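
A minimal self-test sketch of the wrapping-aware comparison, assuming ULLONG_MAX and WARN_ON() from the usual kernel headers; wrap_minmax_example() is a hypothetical name:

static void wrap_minmax_example(void)
{
	u64 early = ULLONG_MAX - 2;	/* just before the counter wraps */
	u64 late  = 5;			/* shortly after the wrap */

	/*
	 * early - late is -8 when viewed as s64, so both helpers treat
	 * 'late' as the more recent value even though late < early.
	 */
	WARN_ON(wrap_max(early, late) != late);
	WARN_ON(wrap_min(early, late) != early);
}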
134
135/*
136 * update the percpu scd from the raw @now value
137 *
138 * - filter out backward motion
139 * - use the GTOD tick value to create a window to filter crazy TSC values
140 */
141static u64 sched_clock_local(struct sched_clock_data *scd)
142{
143 u64 now, clock, old_clock, min_clock, max_clock;
144 s64 delta;
145
146again:
147 now = sched_clock();
148 delta = now - scd->tick_raw;
149 if (unlikely(delta < 0))
150 delta = 0;
151
152 old_clock = scd->clock;
153
154 /*
155 * scd->clock = clamp(scd->tick_gtod + delta,
156 * max(scd->tick_gtod, scd->clock),
157 * scd->tick_gtod + TICK_NSEC);
158 */
159
160 clock = scd->tick_gtod + delta;
161 min_clock = wrap_max(scd->tick_gtod, old_clock);
162 max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
163
164 clock = wrap_max(clock, min_clock);
165 clock = wrap_min(clock, max_clock);
166
167 if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
168 goto again;
169
170 return clock;
171}
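
A short walk-through of the clamp window above, with invented numbers:

/*
 * Example: tick_gtod = 1000, previous scd->clock = 1500, delta = 300,
 * TICK_NSEC = 1000000:
 *
 *	clock     = 1000 + 300         = 1300
 *	min_clock = max(1000, 1500)    = 1500
 *	max_clock = max(1500, 1001000) = 1001000
 *	result    = clamp(1300, 1500, 1001000) = 1500
 *
 * i.e. the filtered clock refuses to step backwards past the value
 * returned last time, and can advance at most one tick past tick_gtod.
 */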
172
173static u64 sched_clock_remote(struct sched_clock_data *scd)
174{
175 struct sched_clock_data *my_scd = this_scd();
176 u64 this_clock, remote_clock;
177 u64 *ptr, old_val, val;
178
179 sched_clock_local(my_scd);
180again:
181 this_clock = my_scd->clock;
182 remote_clock = scd->clock;
183
184 /*
185 * Use the opportunity that we have both locks
186 * taken to couple the two clocks: we take the
187 * larger time as the latest time for both
188 * runqueues. (this creates monotonic movement)
189 */
190 if (likely((s64)(remote_clock - this_clock) < 0)) {
191 ptr = &scd->clock;
192 old_val = remote_clock;
193 val = this_clock;
194 } else {
195 /*
196 * Should be rare, but possible:
197 */
198 ptr = &my_scd->clock;
199 old_val = this_clock;
200 val = remote_clock;
201 }
202
203 if (cmpxchg64(ptr, old_val, val) != old_val)
204 goto again;
205
206 return val;
207}
208
209/*
210 * Similar to cpu_clock(), but requires local IRQs to be disabled.
211 *
212 * See cpu_clock().
213 */
214u64 sched_clock_cpu(int cpu)
215{
216 struct sched_clock_data *scd;
217 u64 clock;
218
219 WARN_ON_ONCE(!irqs_disabled());
220
221 if (sched_clock_stable)
222 return sched_clock();
223
224 if (unlikely(!sched_clock_running))
225 return 0ull;
226
227 scd = cpu_sdc(cpu);
228
229 if (cpu != smp_processor_id())
230 clock = sched_clock_remote(scd);
231 else
232 clock = sched_clock_local(scd);
233
234 return clock;
235}
236
237void sched_clock_tick(void)
238{
239 struct sched_clock_data *scd;
240 u64 now, now_gtod;
241
242 if (sched_clock_stable)
243 return;
244
245 if (unlikely(!sched_clock_running))
246 return;
247
248 WARN_ON_ONCE(!irqs_disabled());
249
250 scd = this_scd();
251 now_gtod = ktime_to_ns(ktime_get());
252 now = sched_clock();
253
254 scd->tick_raw = now;
255 scd->tick_gtod = now_gtod;
256 sched_clock_local(scd);
257}
258
259/*
260 * We are going deep-idle (irqs are disabled):
261 */
262void sched_clock_idle_sleep_event(void)
263{
264 sched_clock_cpu(smp_processor_id());
265}
266EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
267
268/*
269 * We just idled delta nanoseconds (called with irqs disabled):
270 */
271void sched_clock_idle_wakeup_event(u64 delta_ns)
272{
273 if (timekeeping_suspended)
274 return;
275
276 sched_clock_tick();
277 touch_softlockup_watchdog();
278}
279EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
280
281/*
282  * As outlined at the top, cpu_clock() provides a fast, high resolution, nanosecond
283 * time source that is monotonic per cpu argument and has bounded drift
284 * between cpus.
285 *
286 * ######################### BIG FAT WARNING ##########################
287 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
288 * # go backwards !! #
289 * ####################################################################
290 */
291u64 cpu_clock(int cpu)
292{
293 u64 clock;
294 unsigned long flags;
295
296 local_irq_save(flags);
297 clock = sched_clock_cpu(cpu);
298 local_irq_restore(flags);
299
300 return clock;
301}
302
303/*
304 * Similar to cpu_clock() for the current cpu. Time will only be observed
305  * to be monotonic if care is taken to only compare timestamps taken on the
306 * same CPU.
307 *
308 * See cpu_clock().
309 */
310u64 local_clock(void)
311{
312 u64 clock;
313 unsigned long flags;
314
315 local_irq_save(flags);
316 clock = sched_clock_cpu(smp_processor_id());
317 local_irq_restore(flags);
318
319 return clock;
320}
321
322#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
323
324void sched_clock_init(void)
325{
326 sched_clock_running = 1;
327}
328
329u64 sched_clock_cpu(int cpu)
330{
331 if (unlikely(!sched_clock_running))
332 return 0;
333
334 return sched_clock();
335}
336
337u64 cpu_clock(int cpu)
338{
339 return sched_clock_cpu(cpu);
340}
341
342u64 local_clock(void)
343{
344 return sched_clock_cpu(0);
345}
346
347#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
348
349EXPORT_SYMBOL_GPL(cpu_clock);
350EXPORT_SYMBOL_GPL(local_clock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
deleted file mode 100644
index 257002c13bb..00000000000
--- a/kernel/sched/core.c
+++ /dev/null
@@ -1,8162 +0,0 @@
1/*
2 * kernel/sched/core.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz
27 */
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/unistd.h>
66#include <linux/pagemap.h>
67#include <linux/hrtimer.h>
68#include <linux/tick.h>
69#include <linux/debugfs.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
76
77#include <asm/switch_to.h>
78#include <asm/tlb.h>
79#include <asm/irq_regs.h>
80#include <asm/mutex.h>
81#ifdef CONFIG_PARAVIRT
82#include <asm/paravirt.h>
83#endif
84
85#include "sched.h"
86#include "../workqueue_sched.h"
87#include "../smpboot.h"
88
89#define CREATE_TRACE_POINTS
90#include <trace/events/sched.h>
91
92void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
93{
94 unsigned long delta;
95 ktime_t soft, hard, now;
96
97 for (;;) {
98 if (hrtimer_active(period_timer))
99 break;
100
101 now = hrtimer_cb_get_time(period_timer);
102 hrtimer_forward(period_timer, now, period);
103
104 soft = hrtimer_get_softexpires(period_timer);
105 hard = hrtimer_get_expires(period_timer);
106 delta = ktime_to_ns(ktime_sub(hard, soft));
107 __hrtimer_start_range_ns(period_timer, soft, delta,
108 HRTIMER_MODE_ABS_PINNED, 0);
109 }
110}
111
112DEFINE_MUTEX(sched_domains_mutex);
113DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
114
115static void update_rq_clock_task(struct rq *rq, s64 delta);
116
117void update_rq_clock(struct rq *rq)
118{
119 s64 delta;
120
121 if (rq->skip_clock_update > 0)
122 return;
123
124 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
125 rq->clock += delta;
126 update_rq_clock_task(rq, delta);
127}
128
129/*
130 * Debugging: various feature bits
131 */
132
133#define SCHED_FEAT(name, enabled) \
134 (1UL << __SCHED_FEAT_##name) * enabled |
135
136const_debug unsigned int sysctl_sched_features =
137#include "features.h"
138 0;
139
140#undef SCHED_FEAT
141
142#ifdef CONFIG_SCHED_DEBUG
143#define SCHED_FEAT(name, enabled) \
144 #name ,
145
146static const char * const sched_feat_names[] = {
147#include "features.h"
148};
149
150#undef SCHED_FEAT
151
152static int sched_feat_show(struct seq_file *m, void *v)
153{
154 int i;
155
156 for (i = 0; i < __SCHED_FEAT_NR; i++) {
157 if (!(sysctl_sched_features & (1UL << i)))
158 seq_puts(m, "NO_");
159 seq_printf(m, "%s ", sched_feat_names[i]);
160 }
161 seq_puts(m, "\n");
162
163 return 0;
164}
165
166#ifdef HAVE_JUMP_LABEL
167
168#define jump_label_key__true STATIC_KEY_INIT_TRUE
169#define jump_label_key__false STATIC_KEY_INIT_FALSE
170
171#define SCHED_FEAT(name, enabled) \
172 jump_label_key__##enabled ,
173
174struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
175#include "features.h"
176};
177
178#undef SCHED_FEAT
179
180static void sched_feat_disable(int i)
181{
182 if (static_key_enabled(&sched_feat_keys[i]))
183 static_key_slow_dec(&sched_feat_keys[i]);
184}
185
186static void sched_feat_enable(int i)
187{
188 if (!static_key_enabled(&sched_feat_keys[i]))
189 static_key_slow_inc(&sched_feat_keys[i]);
190}
191#else
192static void sched_feat_disable(int i) { };
193static void sched_feat_enable(int i) { };
194#endif /* HAVE_JUMP_LABEL */
195
196static int sched_feat_set(char *cmp)
197{
198 int i;
199 int neg = 0;
200
201 if (strncmp(cmp, "NO_", 3) == 0) {
202 neg = 1;
203 cmp += 3;
204 }
205
206 for (i = 0; i < __SCHED_FEAT_NR; i++) {
207 if (strcmp(cmp, sched_feat_names[i]) == 0) {
208 if (neg) {
209 sysctl_sched_features &= ~(1UL << i);
210 sched_feat_disable(i);
211 } else {
212 sysctl_sched_features |= (1UL << i);
213 sched_feat_enable(i);
214 }
215 break;
216 }
217 }
218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
240 if (i == __SCHED_FEAT_NR)
241 return -EINVAL;
242
243 *ppos += cnt;
244
245 return cnt;
246}
247
248static int sched_feat_open(struct inode *inode, struct file *filp)
249{
250 return single_open(filp, sched_feat_show, NULL);
251}
252
253static const struct file_operations sched_feat_fops = {
254 .open = sched_feat_open,
255 .write = sched_feat_write,
256 .read = seq_read,
257 .llseek = seq_lseek,
258 .release = single_release,
259};
260
261static __init int sched_init_debug(void)
262{
263 debugfs_create_file("sched_features", 0644, NULL, NULL,
264 &sched_feat_fops);
265
266 return 0;
267}
268late_initcall(sched_init_debug);
269#endif /* CONFIG_SCHED_DEBUG */
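
Usage note: once debugfs is mounted, the sched_features file created above accepts the names printed by sched_feat_show(); writing a bare feature name such as TTWU_QUEUE enables it, and the same name prefixed with NO_ disables it, exactly as parsed by sched_feat_set().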
270
271/*
272 * Number of tasks to iterate in a single balance run.
273 * Limited because this is done with IRQs disabled.
274 */
275const_debug unsigned int sysctl_sched_nr_migrate = 32;
276
277/*
278 * period over which we average the RT time consumption, measured
279 * in ms.
280 *
281 * default: 1s
282 */
283const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
284
285/*
286 * period over which we measure -rt task cpu usage in us.
287 * default: 1s
288 */
289unsigned int sysctl_sched_rt_period = 1000000;
290
291__read_mostly int scheduler_running;
292
293/*
294 * part of the period that we allow rt tasks to run in us.
295 * default: 0.95s
296 */
297int sysctl_sched_rt_runtime = 950000;
298
299
300
301/*
302 * __task_rq_lock - lock the rq @p resides on.
303 */
304static inline struct rq *__task_rq_lock(struct task_struct *p)
305 __acquires(rq->lock)
306{
307 struct rq *rq;
308
309 lockdep_assert_held(&p->pi_lock);
310
311 for (;;) {
312 rq = task_rq(p);
313 raw_spin_lock(&rq->lock);
314 if (likely(rq == task_rq(p)))
315 return rq;
316 raw_spin_unlock(&rq->lock);
317 }
318}
319
320/*
321 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
322 */
323static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
324 __acquires(p->pi_lock)
325 __acquires(rq->lock)
326{
327 struct rq *rq;
328
329 for (;;) {
330 raw_spin_lock_irqsave(&p->pi_lock, *flags);
331 rq = task_rq(p);
332 raw_spin_lock(&rq->lock);
333 if (likely(rq == task_rq(p)))
334 return rq;
335 raw_spin_unlock(&rq->lock);
336 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
337 }
338}
339
340static void __task_rq_unlock(struct rq *rq)
341 __releases(rq->lock)
342{
343 raw_spin_unlock(&rq->lock);
344}
345
346static inline void
347task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
348 __releases(rq->lock)
349 __releases(p->pi_lock)
350{
351 raw_spin_unlock(&rq->lock);
352 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
353}
354
355/*
356 * this_rq_lock - lock this runqueue and disable interrupts.
357 */
358static struct rq *this_rq_lock(void)
359 __acquires(rq->lock)
360{
361 struct rq *rq;
362
363 local_irq_disable();
364 rq = this_rq();
365 raw_spin_lock(&rq->lock);
366
367 return rq;
368}
369
370#ifdef CONFIG_SCHED_HRTICK
371/*
372 * Use HR-timers to deliver accurate preemption points.
373 *
374  * It's all a bit involved since we cannot program an hrt while holding the
375  * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
376 * reschedule event.
377 *
378 * When we get rescheduled we reprogram the hrtick_timer outside of the
379 * rq->lock.
380 */
381
382static void hrtick_clear(struct rq *rq)
383{
384 if (hrtimer_active(&rq->hrtick_timer))
385 hrtimer_cancel(&rq->hrtick_timer);
386}
387
388/*
389 * High-resolution timer tick.
390 * Runs from hardirq context with interrupts disabled.
391 */
392static enum hrtimer_restart hrtick(struct hrtimer *timer)
393{
394 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
395
396 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
397
398 raw_spin_lock(&rq->lock);
399 update_rq_clock(rq);
400 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
401 raw_spin_unlock(&rq->lock);
402
403 return HRTIMER_NORESTART;
404}
405
406#ifdef CONFIG_SMP
407/*
408 * called from hardirq (IPI) context
409 */
410static void __hrtick_start(void *arg)
411{
412 struct rq *rq = arg;
413
414 raw_spin_lock(&rq->lock);
415 hrtimer_restart(&rq->hrtick_timer);
416 rq->hrtick_csd_pending = 0;
417 raw_spin_unlock(&rq->lock);
418}
419
420/*
421 * Called to set the hrtick timer state.
422 *
423 * called with rq->lock held and irqs disabled
424 */
425void hrtick_start(struct rq *rq, u64 delay)
426{
427 struct hrtimer *timer = &rq->hrtick_timer;
428 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
429
430 hrtimer_set_expires(timer, time);
431
432 if (rq == this_rq()) {
433 hrtimer_restart(timer);
434 } else if (!rq->hrtick_csd_pending) {
435 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
436 rq->hrtick_csd_pending = 1;
437 }
438}
439
440static int
441hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
442{
443 int cpu = (int)(long)hcpu;
444
445 switch (action) {
446 case CPU_UP_CANCELED:
447 case CPU_UP_CANCELED_FROZEN:
448 case CPU_DOWN_PREPARE:
449 case CPU_DOWN_PREPARE_FROZEN:
450 case CPU_DEAD:
451 case CPU_DEAD_FROZEN:
452 hrtick_clear(cpu_rq(cpu));
453 return NOTIFY_OK;
454 }
455
456 return NOTIFY_DONE;
457}
458
459static __init void init_hrtick(void)
460{
461 hotcpu_notifier(hotplug_hrtick, 0);
462}
463#else
464/*
465 * Called to set the hrtick timer state.
466 *
467 * called with rq->lock held and irqs disabled
468 */
469void hrtick_start(struct rq *rq, u64 delay)
470{
471 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
472 HRTIMER_MODE_REL_PINNED, 0);
473}
474
475static inline void init_hrtick(void)
476{
477}
478#endif /* CONFIG_SMP */
479
480static void init_rq_hrtick(struct rq *rq)
481{
482#ifdef CONFIG_SMP
483 rq->hrtick_csd_pending = 0;
484
485 rq->hrtick_csd.flags = 0;
486 rq->hrtick_csd.func = __hrtick_start;
487 rq->hrtick_csd.info = rq;
488#endif
489
490 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
491 rq->hrtick_timer.function = hrtick;
492}
493#else /* CONFIG_SCHED_HRTICK */
494static inline void hrtick_clear(struct rq *rq)
495{
496}
497
498static inline void init_rq_hrtick(struct rq *rq)
499{
500}
501
502static inline void init_hrtick(void)
503{
504}
505#endif /* CONFIG_SCHED_HRTICK */
506
507/*
508 * resched_task - mark a task 'to be rescheduled now'.
509 *
510 * On UP this means the setting of the need_resched flag, on SMP it
511 * might also involve a cross-CPU call to trigger the scheduler on
512 * the target CPU.
513 */
514#ifdef CONFIG_SMP
515
516#ifndef tsk_is_polling
517#define tsk_is_polling(t) 0
518#endif
519
520void resched_task(struct task_struct *p)
521{
522 int cpu;
523
524 assert_raw_spin_locked(&task_rq(p)->lock);
525
526 if (test_tsk_need_resched(p))
527 return;
528
529 set_tsk_need_resched(p);
530
531 cpu = task_cpu(p);
532 if (cpu == smp_processor_id())
533 return;
534
535 /* NEED_RESCHED must be visible before we test polling */
536 smp_mb();
537 if (!tsk_is_polling(p))
538 smp_send_reschedule(cpu);
539}
540
541void resched_cpu(int cpu)
542{
543 struct rq *rq = cpu_rq(cpu);
544 unsigned long flags;
545
546 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
547 return;
548 resched_task(cpu_curr(cpu));
549 raw_spin_unlock_irqrestore(&rq->lock, flags);
550}
551
552#ifdef CONFIG_NO_HZ
553/*
554 * In the semi idle case, use the nearest busy cpu for migrating timers
555 * from an idle cpu. This is good for power-savings.
556 *
557 * We don't do similar optimization for completely idle system, as
558 * selecting an idle cpu will add more delays to the timers than intended
559 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
560 */
561int get_nohz_timer_target(void)
562{
563 int cpu = smp_processor_id();
564 int i;
565 struct sched_domain *sd;
566
567 rcu_read_lock();
568 for_each_domain(cpu, sd) {
569 for_each_cpu(i, sched_domain_span(sd)) {
570 if (!idle_cpu(i)) {
571 cpu = i;
572 goto unlock;
573 }
574 }
575 }
576unlock:
577 rcu_read_unlock();
578 return cpu;
579}
580/*
581 * When add_timer_on() enqueues a timer into the timer wheel of an
582 * idle CPU then this timer might expire before the next timer event
583 * which is scheduled to wake up that CPU. In case of a completely
584 * idle system the next event might even be infinite time into the
585 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
586 * leaves the inner idle loop so the newly added timer is taken into
587 * account when the CPU goes back to idle and evaluates the timer
588 * wheel for the next timer event.
589 */
590void wake_up_idle_cpu(int cpu)
591{
592 struct rq *rq = cpu_rq(cpu);
593
594 if (cpu == smp_processor_id())
595 return;
596
597 /*
598 * This is safe, as this function is called with the timer
599 * wheel base lock of (cpu) held. When the CPU is on the way
600 * to idle and has not yet set rq->curr to idle then it will
601 * be serialized on the timer wheel base lock and take the new
602 * timer into account automatically.
603 */
604 if (rq->curr != rq->idle)
605 return;
606
607 /*
608 * We can set TIF_RESCHED on the idle task of the other CPU
609 * lockless. The worst case is that the other CPU runs the
610 * idle task through an additional NOOP schedule()
611 */
612 set_tsk_need_resched(rq->idle);
613
614 /* NEED_RESCHED must be visible before we test polling */
615 smp_mb();
616 if (!tsk_is_polling(rq->idle))
617 smp_send_reschedule(cpu);
618}
619
620static inline bool got_nohz_idle_kick(void)
621{
622 int cpu = smp_processor_id();
623 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
624}
625
626#else /* CONFIG_NO_HZ */
627
628static inline bool got_nohz_idle_kick(void)
629{
630 return false;
631}
632
633#endif /* CONFIG_NO_HZ */
634
635void sched_avg_update(struct rq *rq)
636{
637 s64 period = sched_avg_period();
638
639 while ((s64)(rq->clock - rq->age_stamp) > period) {
640 /*
641 * Inline assembly required to prevent the compiler
642 * optimising this loop into a divmod call.
643 * See __iter_div_u64_rem() for another example of this.
644 */
645 asm("" : "+rm" (rq->age_stamp));
646 rq->age_stamp += period;
647 rq->rt_avg /= 2;
648 }
649}
650
651#else /* !CONFIG_SMP */
652void resched_task(struct task_struct *p)
653{
654 assert_raw_spin_locked(&task_rq(p)->lock);
655 set_tsk_need_resched(p);
656}
657#endif /* CONFIG_SMP */
658
659#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
660 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
661/*
662 * Iterate task_group tree rooted at *from, calling @down when first entering a
663 * node and @up when leaving it for the final time.
664 *
665 * Caller must hold rcu_lock or sufficient equivalent.
666 */
667int walk_tg_tree_from(struct task_group *from,
668 tg_visitor down, tg_visitor up, void *data)
669{
670 struct task_group *parent, *child;
671 int ret;
672
673 parent = from;
674
675down:
676 ret = (*down)(parent, data);
677 if (ret)
678 goto out;
679 list_for_each_entry_rcu(child, &parent->children, siblings) {
680 parent = child;
681 goto down;
682
683up:
684 continue;
685 }
686 ret = (*up)(parent, data);
687 if (ret || parent == from)
688 goto out;
689
690 child = parent;
691 parent = parent->parent;
692 if (parent)
693 goto up;
694out:
695 return ret;
696}
697
698int tg_nop(struct task_group *tg, void *data)
699{
700 return 0;
701}
702#endif
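
A minimal sketch of how a caller might use walk_tg_tree_from() with a down/up visitor pair; tg_count_one() and example_count_groups() are hypothetical names, and tg_nop() above serves as the no-op up callback:

static int tg_count_one(struct task_group *tg, void *data)
{
	(*(int *)data)++;
	return 0;	/* returning non-zero would abort the walk */
}

static int example_count_groups(struct task_group *from)
{
	int count = 0;

	rcu_read_lock();	/* the walk requires rcu_lock or equivalent */
	walk_tg_tree_from(from, tg_count_one, tg_nop, &count);
	rcu_read_unlock();

	return count;
}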
703
704static void set_load_weight(struct task_struct *p)
705{
706 int prio = p->static_prio - MAX_RT_PRIO;
707 struct load_weight *load = &p->se.load;
708
709 /*
710 * SCHED_IDLE tasks get minimal weight:
711 */
712 if (p->policy == SCHED_IDLE) {
713 load->weight = scale_load(WEIGHT_IDLEPRIO);
714 load->inv_weight = WMULT_IDLEPRIO;
715 return;
716 }
717
718 load->weight = scale_load(prio_to_weight[prio]);
719 load->inv_weight = prio_to_wmult[prio];
720}
721
722static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
723{
724 update_rq_clock(rq);
725 sched_info_queued(p);
726 p->sched_class->enqueue_task(rq, p, flags);
727}
728
729static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
730{
731 update_rq_clock(rq);
732 sched_info_dequeued(p);
733 p->sched_class->dequeue_task(rq, p, flags);
734}
735
736void activate_task(struct rq *rq, struct task_struct *p, int flags)
737{
738 if (task_contributes_to_load(p))
739 rq->nr_uninterruptible--;
740
741 enqueue_task(rq, p, flags);
742}
743
744void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
745{
746 if (task_contributes_to_load(p))
747 rq->nr_uninterruptible++;
748
749 dequeue_task(rq, p, flags);
750}
751
752static void update_rq_clock_task(struct rq *rq, s64 delta)
753{
754/*
755  * In theory, the compiler should just see 0 here, and optimize out the call
756 * to sched_rt_avg_update. But I don't trust it...
757 */
758#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
759 s64 steal = 0, irq_delta = 0;
760#endif
761#ifdef CONFIG_IRQ_TIME_ACCOUNTING
762 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
763
764 /*
765 * Since irq_time is only updated on {soft,}irq_exit, we might run into
766 * this case when a previous update_rq_clock() happened inside a
767 * {soft,}irq region.
768 *
769 * When this happens, we stop ->clock_task and only update the
770 * prev_irq_time stamp to account for the part that fit, so that a next
771 * update will consume the rest. This ensures ->clock_task is
772 * monotonic.
773 *
774  * It does however cause some slight misattribution of {soft,}irq
775 * time, a more accurate solution would be to update the irq_time using
776 * the current rq->clock timestamp, except that would require using
777 * atomic ops.
778 */
779 if (irq_delta > delta)
780 irq_delta = delta;
781
782 rq->prev_irq_time += irq_delta;
783 delta -= irq_delta;
784#endif
785#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
786 if (static_key_false((&paravirt_steal_rq_enabled))) {
787 u64 st;
788
789 steal = paravirt_steal_clock(cpu_of(rq));
790 steal -= rq->prev_steal_time_rq;
791
792 if (unlikely(steal > delta))
793 steal = delta;
794
795 st = steal_ticks(steal);
796 steal = st * TICK_NSEC;
797
798 rq->prev_steal_time_rq += steal;
799
800 delta -= steal;
801 }
802#endif
803
804 rq->clock_task += delta;
805
806#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
807 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
808 sched_rt_avg_update(rq, irq_delta + steal);
809#endif
810}
811
812void sched_set_stop_task(int cpu, struct task_struct *stop)
813{
814 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
815 struct task_struct *old_stop = cpu_rq(cpu)->stop;
816
817 if (stop) {
818 /*
819	 * Make it appear like a SCHED_FIFO task, it's something
820 * userspace knows about and won't get confused about.
821 *
822 * Also, it will make PI more or less work without too
823 * much confusion -- but then, stop work should not
824 * rely on PI working anyway.
825 */
826 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
827
828 stop->sched_class = &stop_sched_class;
829 }
830
831 cpu_rq(cpu)->stop = stop;
832
833 if (old_stop) {
834 /*
835 * Reset it back to a normal scheduling class so that
836 * it can die in pieces.
837 */
838 old_stop->sched_class = &rt_sched_class;
839 }
840}
841
842/*
843 * __normal_prio - return the priority that is based on the static prio
844 */
845static inline int __normal_prio(struct task_struct *p)
846{
847 return p->static_prio;
848}
849
850/*
851 * Calculate the expected normal priority: i.e. priority
852 * without taking RT-inheritance into account. Might be
853 * boosted by interactivity modifiers. Changes upon fork,
854 * setprio syscalls, and whenever the interactivity
855 * estimator recalculates.
856 */
857static inline int normal_prio(struct task_struct *p)
858{
859 int prio;
860
861 if (task_has_rt_policy(p))
862 prio = MAX_RT_PRIO-1 - p->rt_priority;
863 else
864 prio = __normal_prio(p);
865 return prio;
866}
867
868/*
869 * Calculate the current priority, i.e. the priority
870 * taken into account by the scheduler. This value might
871 * be boosted by RT tasks, or might be boosted by
872 * interactivity modifiers. Will be RT if the task got
873 * RT-boosted. If not then it returns p->normal_prio.
874 */
875static int effective_prio(struct task_struct *p)
876{
877 p->normal_prio = normal_prio(p);
878 /*
879 * If we are RT tasks or we were boosted to RT priority,
880 * keep the priority unchanged. Otherwise, update priority
881 * to the normal priority:
882 */
883 if (!rt_prio(p->prio))
884 return p->normal_prio;
885 return p->prio;
886}
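
A worked example of the priority mapping above, assuming the usual MAX_RT_PRIO of 100 and a nice-0 static priority of 120:

/*
 *  - SCHED_FIFO task with rt_priority 50:
 *	normal_prio() = MAX_RT_PRIO - 1 - 50 = 49 (an RT priority)
 *  - SCHED_NORMAL task at nice 0:
 *	normal_prio() = __normal_prio() = static_prio = 120
 *  - the same nice-0 task while PI-boosted to RT priority 49:
 *	effective_prio() keeps p->prio at 49 until the boost is dropped.
 */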
887
888/**
889 * task_curr - is this task currently executing on a CPU?
890 * @p: the task in question.
891 */
892inline int task_curr(const struct task_struct *p)
893{
894 return cpu_curr(task_cpu(p)) == p;
895}
896
897static inline void check_class_changed(struct rq *rq, struct task_struct *p,
898 const struct sched_class *prev_class,
899 int oldprio)
900{
901 if (prev_class != p->sched_class) {
902 if (prev_class->switched_from)
903 prev_class->switched_from(rq, p);
904 p->sched_class->switched_to(rq, p);
905 } else if (oldprio != p->prio)
906 p->sched_class->prio_changed(rq, p, oldprio);
907}
908
909void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
910{
911 const struct sched_class *class;
912
913 if (p->sched_class == rq->curr->sched_class) {
914 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
915 } else {
916 for_each_class(class) {
917 if (class == rq->curr->sched_class)
918 break;
919 if (class == p->sched_class) {
920 resched_task(rq->curr);
921 break;
922 }
923 }
924 }
925
926 /*
927 * A queue event has occurred, and we're going to schedule. In
928 * this case, we can save a useless back to back clock update.
929 */
930 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
931 rq->skip_clock_update = 1;
932}
933
934static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
935
936void register_task_migration_notifier(struct notifier_block *n)
937{
938 atomic_notifier_chain_register(&task_migration_notifier, n);
939}
940
941#ifdef CONFIG_SMP
942void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
943{
944#ifdef CONFIG_SCHED_DEBUG
945 /*
946 * We should never call set_task_cpu() on a blocked task,
947 * ttwu() will sort out the placement.
948 */
949 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
950 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
951
952#ifdef CONFIG_LOCKDEP
953 /*
954 * The caller should hold either p->pi_lock or rq->lock, when changing
955 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
956 *
957 * sched_move_task() holds both and thus holding either pins the cgroup,
958 * see task_group().
959 *
960 * Furthermore, all task_rq users should acquire both locks, see
961 * task_rq_lock().
962 */
963 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
964 lockdep_is_held(&task_rq(p)->lock)));
965#endif
966#endif
967
968 trace_sched_migrate_task(p, new_cpu);
969
970 if (task_cpu(p) != new_cpu) {
971 struct task_migration_notifier tmn;
972
973 if (p->sched_class->migrate_task_rq)
974 p->sched_class->migrate_task_rq(p, new_cpu);
975 p->se.nr_migrations++;
976 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
977
978 tmn.task = p;
979 tmn.from_cpu = task_cpu(p);
980 tmn.to_cpu = new_cpu;
981
982 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
983 }
984
985 __set_task_cpu(p, new_cpu);
986}
987
988struct migration_arg {
989 struct task_struct *task;
990 int dest_cpu;
991};
992
993static int migration_cpu_stop(void *data);
994
995/*
996 * wait_task_inactive - wait for a thread to unschedule.
997 *
998 * If @match_state is nonzero, it's the @p->state value just checked and
999 * not expected to change. If it changes, i.e. @p might have woken up,
1000 * then return zero. When we succeed in waiting for @p to be off its CPU,
1001 * we return a positive number (its total switch count). If a second call
1002 * a short while later returns the same number, the caller can be sure that
1003 * @p has remained unscheduled the whole time.
1004 *
1005 * The caller must ensure that the task *will* unschedule sometime soon,
1006 * else this function might spin for a *long* time. This function can't
1007 * be called with interrupts off, or it may introduce deadlock with
1008 * smp_call_function() if an IPI is sent by the same process we are
1009 * waiting to become inactive.
1010 */
1011unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1012{
1013 unsigned long flags;
1014 int running, on_rq;
1015 unsigned long ncsw;
1016 struct rq *rq;
1017
1018 for (;;) {
1019 /*
1020 * We do the initial early heuristics without holding
1021 * any task-queue locks at all. We'll only try to get
1022 * the runqueue lock when things look like they will
1023 * work out!
1024 */
1025 rq = task_rq(p);
1026
1027 /*
1028 * If the task is actively running on another CPU
1029 * still, just relax and busy-wait without holding
1030 * any locks.
1031 *
1032 * NOTE! Since we don't hold any locks, it's not
1033 * even sure that "rq" stays as the right runqueue!
1034 * But we don't care, since "task_running()" will
1035 * return false if the runqueue has changed and p
1036 * is actually now running somewhere else!
1037 */
1038 while (task_running(rq, p)) {
1039 if (match_state && unlikely(p->state != match_state))
1040 return 0;
1041 cpu_relax();
1042 }
1043
1044 /*
1045 * Ok, time to look more closely! We need the rq
1046 * lock now, to be *sure*. If we're wrong, we'll
1047 * just go back and repeat.
1048 */
1049 rq = task_rq_lock(p, &flags);
1050 trace_sched_wait_task(p);
1051 running = task_running(rq, p);
1052 on_rq = p->on_rq;
1053 ncsw = 0;
1054 if (!match_state || p->state == match_state)
1055 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1056 task_rq_unlock(rq, p, &flags);
1057
1058 /*
1059 * If it changed from the expected state, bail out now.
1060 */
1061 if (unlikely(!ncsw))
1062 break;
1063
1064 /*
1065 * Was it really running after all now that we
1066 * checked with the proper locks actually held?
1067 *
1068 * Oops. Go back and try again..
1069 */
1070 if (unlikely(running)) {
1071 cpu_relax();
1072 continue;
1073 }
1074
1075 /*
1076 * It's not enough that it's not actively running,
1077 * it must be off the runqueue _entirely_, and not
1078 * preempted!
1079 *
1080 * So if it was still runnable (but just not actively
1081 * running right now), it's preempted, and we should
1082 * yield - it could be a while.
1083 */
1084 if (unlikely(on_rq)) {
1085 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1086
1087 set_current_state(TASK_UNINTERRUPTIBLE);
1088 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1089 continue;
1090 }
1091
1092 /*
1093 * Ahh, all good. It wasn't running, and it wasn't
1094 * runnable, which means that it will never become
1095 * running in the future either. We're all done!
1096 */
1097 break;
1098 }
1099
1100 return ncsw;
1101}
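
A minimal sketch of the double-sample pattern described in the comment above; example_stayed_off_cpu() is a hypothetical caller:

static bool example_stayed_off_cpu(struct task_struct *p)
{
	unsigned long ncsw;

	ncsw = wait_task_inactive(p, TASK_UNINTERRUPTIBLE);
	if (!ncsw)
		return false;	/* @p changed state, give up */

	/* ... act on @p while it is expected to stay blocked ... */

	return wait_task_inactive(p, TASK_UNINTERRUPTIBLE) == ncsw;
}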
1102
1103/***
1104 * kick_process - kick a running thread to enter/exit the kernel
1105 * @p: the to-be-kicked thread
1106 *
1107 * Cause a process which is running on another CPU to enter
1108 * kernel-mode, without any delay. (to get signals handled.)
1109 *
1110 * NOTE: this function doesn't have to take the runqueue lock,
1111 * because all it wants to ensure is that the remote task enters
1112 * the kernel. If the IPI races and the task has been migrated
1113 * to another CPU then no harm is done and the purpose has been
1114 * achieved as well.
1115 */
1116void kick_process(struct task_struct *p)
1117{
1118 int cpu;
1119
1120 preempt_disable();
1121 cpu = task_cpu(p);
1122 if ((cpu != smp_processor_id()) && task_curr(p))
1123 smp_send_reschedule(cpu);
1124 preempt_enable();
1125}
1126EXPORT_SYMBOL_GPL(kick_process);
1127#endif /* CONFIG_SMP */
1128
1129#ifdef CONFIG_SMP
1130/*
1131 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
1132 */
1133static int select_fallback_rq(int cpu, struct task_struct *p)
1134{
1135 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1136 enum { cpuset, possible, fail } state = cpuset;
1137 int dest_cpu;
1138
1139 /* Look for allowed, online CPU in same node. */
1140 for_each_cpu(dest_cpu, nodemask) {
1141 if (!cpu_online(dest_cpu))
1142 continue;
1143 if (!cpu_active(dest_cpu))
1144 continue;
1145 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1146 return dest_cpu;
1147 }
1148
1149 for (;;) {
1150 /* Any allowed, online CPU? */
1151 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1152 if (!cpu_online(dest_cpu))
1153 continue;
1154 if (!cpu_active(dest_cpu))
1155 continue;
1156 goto out;
1157 }
1158
1159 switch (state) {
1160 case cpuset:
1161 /* No more Mr. Nice Guy. */
1162 cpuset_cpus_allowed_fallback(p);
1163 state = possible;
1164 break;
1165
1166 case possible:
1167 do_set_cpus_allowed(p, cpu_possible_mask);
1168 state = fail;
1169 break;
1170
1171 case fail:
1172 BUG();
1173 break;
1174 }
1175 }
1176
1177out:
1178 if (state != cpuset) {
1179 /*
1180 * Don't tell them about moving exiting tasks or
1181 * kernel threads (both mm NULL), since they never
1182 * leave kernel.
1183 */
1184 if (p->mm && printk_ratelimit()) {
1185 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1186 task_pid_nr(p), p->comm, cpu);
1187 }
1188 }
1189
1190 return dest_cpu;
1191}
1192
1193/*
1194 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1195 */
1196static inline
1197int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1198{
1199 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1200
1201 /*
1202 * In order not to call set_task_cpu() on a blocking task we need
1203 * to rely on ttwu() to place the task on a valid ->cpus_allowed
1204 * cpu.
1205 *
1206 * Since this is common to all placement strategies, this lives here.
1207 *
1208 * [ this allows ->select_task() to simply return task_cpu(p) and
1209 * not worry about this generic constraint ]
1210 */
1211 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1212 !cpu_online(cpu)))
1213 cpu = select_fallback_rq(task_cpu(p), p);
1214
1215 return cpu;
1216}
1217
1218static void update_avg(u64 *avg, u64 sample)
1219{
1220 s64 diff = sample - *avg;
1221 *avg += diff >> 3;
1222}
1223#endif
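
update_avg() above is an exponential moving average with weight 1/8; for example:

/*
 * With *avg == 1000 and sample == 1800: diff == 800 and the new
 * average is 1000 + (800 >> 3) == 1100, i.e. each sample pulls the
 * average one eighth of the way towards itself.
 */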
1224
1225static void
1226ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1227{
1228#ifdef CONFIG_SCHEDSTATS
1229 struct rq *rq = this_rq();
1230
1231#ifdef CONFIG_SMP
1232 int this_cpu = smp_processor_id();
1233
1234 if (cpu == this_cpu) {
1235 schedstat_inc(rq, ttwu_local);
1236 schedstat_inc(p, se.statistics.nr_wakeups_local);
1237 } else {
1238 struct sched_domain *sd;
1239
1240 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1241 rcu_read_lock();
1242 for_each_domain(this_cpu, sd) {
1243 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1244 schedstat_inc(sd, ttwu_wake_remote);
1245 break;
1246 }
1247 }
1248 rcu_read_unlock();
1249 }
1250
1251 if (wake_flags & WF_MIGRATED)
1252 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1253
1254#endif /* CONFIG_SMP */
1255
1256 schedstat_inc(rq, ttwu_count);
1257 schedstat_inc(p, se.statistics.nr_wakeups);
1258
1259 if (wake_flags & WF_SYNC)
1260 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1261
1262#endif /* CONFIG_SCHEDSTATS */
1263}
1264
1265static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1266{
1267 activate_task(rq, p, en_flags);
1268 p->on_rq = 1;
1269
1270 /* if a worker is waking up, notify workqueue */
1271 if (p->flags & PF_WQ_WORKER)
1272 wq_worker_waking_up(p, cpu_of(rq));
1273}
1274
1275/*
1276 * Mark the task runnable and perform wakeup-preemption.
1277 */
1278static void
1279ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1280{
1281 trace_sched_wakeup(p, true);
1282 check_preempt_curr(rq, p, wake_flags);
1283
1284 p->state = TASK_RUNNING;
1285#ifdef CONFIG_SMP
1286 if (p->sched_class->task_woken)
1287 p->sched_class->task_woken(rq, p);
1288
1289 if (rq->idle_stamp) {
1290 u64 delta = rq->clock - rq->idle_stamp;
1291 u64 max = 2*sysctl_sched_migration_cost;
1292
1293 if (delta > max)
1294 rq->avg_idle = max;
1295 else
1296 update_avg(&rq->avg_idle, delta);
1297 rq->idle_stamp = 0;
1298 }
1299#endif
1300}
1301
1302static void
1303ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1304{
1305#ifdef CONFIG_SMP
1306 if (p->sched_contributes_to_load)
1307 rq->nr_uninterruptible--;
1308#endif
1309
1310 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1311 ttwu_do_wakeup(rq, p, wake_flags);
1312}
1313
1314/*
1315  * Called in case the task @p isn't fully descheduled from its runqueue;
1316  * in this case we must do a remote wakeup. It's a 'light' wakeup though,
1317 * since all we need to do is flip p->state to TASK_RUNNING, since
1318 * the task is still ->on_rq.
1319 */
1320static int ttwu_remote(struct task_struct *p, int wake_flags)
1321{
1322 struct rq *rq;
1323 int ret = 0;
1324
1325 rq = __task_rq_lock(p);
1326 if (p->on_rq) {
1327 ttwu_do_wakeup(rq, p, wake_flags);
1328 ret = 1;
1329 }
1330 __task_rq_unlock(rq);
1331
1332 return ret;
1333}
1334
1335#ifdef CONFIG_SMP
1336static void sched_ttwu_pending(void)
1337{
1338 struct rq *rq = this_rq();
1339 struct llist_node *llist = llist_del_all(&rq->wake_list);
1340 struct task_struct *p;
1341
1342 raw_spin_lock(&rq->lock);
1343
1344 while (llist) {
1345 p = llist_entry(llist, struct task_struct, wake_entry);
1346 llist = llist_next(llist);
1347 ttwu_do_activate(rq, p, 0);
1348 }
1349
1350 raw_spin_unlock(&rq->lock);
1351}
1352
1353void scheduler_ipi(void)
1354{
1355 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1356 return;
1357
1358 /*
1359 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1360 * traditionally all their work was done from the interrupt return
1361 * path. Now that we actually do some work, we need to make sure
1362 * we do call them.
1363 *
1364 * Some archs already do call them, luckily irq_enter/exit nest
1365 * properly.
1366 *
1367 * Arguably we should visit all archs and update all handlers,
1368 * however a fair share of IPIs are still resched only so this would
1369 * somewhat pessimize the simple resched case.
1370 */
1371 irq_enter();
1372 sched_ttwu_pending();
1373
1374 /*
1375 * Check if someone kicked us for doing the nohz idle load balance.
1376 */
1377 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1378 this_rq()->idle_balance = 1;
1379 raise_softirq_irqoff(SCHED_SOFTIRQ);
1380 }
1381 irq_exit();
1382}
1383
1384static void ttwu_queue_remote(struct task_struct *p, int cpu)
1385{
1386 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1387 smp_send_reschedule(cpu);
1388}
1389
1390bool cpus_share_cache(int this_cpu, int that_cpu)
1391{
1392 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1393}
1394#endif /* CONFIG_SMP */
1395
1396static void ttwu_queue(struct task_struct *p, int cpu)
1397{
1398 struct rq *rq = cpu_rq(cpu);
1399
1400#if defined(CONFIG_SMP)
1401 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1402 sched_clock_cpu(cpu); /* sync clocks x-cpu */
1403 ttwu_queue_remote(p, cpu);
1404 return;
1405 }
1406#endif
1407
1408 raw_spin_lock(&rq->lock);
1409 ttwu_do_activate(rq, p, 0);
1410 raw_spin_unlock(&rq->lock);
1411}
1412
1413/**
1414 * try_to_wake_up - wake up a thread
1415 * @p: the thread to be awakened
1416 * @state: the mask of task states that can be woken
1417 * @wake_flags: wake modifier flags (WF_*)
1418 *
1419 * Put it on the run-queue if it's not already there. The "current"
1420 * thread is always on the run-queue (except when the actual
1421 * re-schedule is in progress), and as such you're allowed to do
1422 * the simpler "current->state = TASK_RUNNING" to mark yourself
1423 * runnable without the overhead of this.
1424 *
1425 * Returns %true if @p was woken up, %false if it was already running
1426 * or @state didn't match @p's state.
1427 */
1428static int
1429try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1430{
1431 unsigned long flags;
1432 int cpu, success = 0;
1433
1434 smp_wmb();
1435 raw_spin_lock_irqsave(&p->pi_lock, flags);
1436 if (!(p->state & state))
1437 goto out;
1438
1439 success = 1; /* we're going to change ->state */
1440 cpu = task_cpu(p);
1441
1442 if (p->on_rq && ttwu_remote(p, wake_flags))
1443 goto stat;
1444
1445#ifdef CONFIG_SMP
1446 /*
1447 * If the owning (remote) cpu is still in the middle of schedule() with
1448	 * this task as prev, wait until it's done referencing the task.
1449 */
1450 while (p->on_cpu)
1451 cpu_relax();
1452 /*
1453 * Pairs with the smp_wmb() in finish_lock_switch().
1454 */
1455 smp_rmb();
1456
1457 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1458 p->state = TASK_WAKING;
1459
1460 if (p->sched_class->task_waking)
1461 p->sched_class->task_waking(p);
1462
1463 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1464 if (task_cpu(p) != cpu) {
1465 wake_flags |= WF_MIGRATED;
1466 set_task_cpu(p, cpu);
1467 }
1468#endif /* CONFIG_SMP */
1469
1470 ttwu_queue(p, cpu);
1471stat:
1472 ttwu_stat(p, cpu, wake_flags);
1473out:
1474 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1475
1476 return success;
1477}
1478
1479/**
1480 * try_to_wake_up_local - try to wake up a local task with rq lock held
1481 * @p: the thread to be awakened
1482 *
1483 * Put @p on the run-queue if it's not already there. The caller must
1484 * ensure that this_rq() is locked, @p is bound to this_rq() and not
1485 * the current task.
1486 */
1487static void try_to_wake_up_local(struct task_struct *p)
1488{
1489 struct rq *rq = task_rq(p);
1490
1491 BUG_ON(rq != this_rq());
1492 BUG_ON(p == current);
1493 lockdep_assert_held(&rq->lock);
1494
1495 if (!raw_spin_trylock(&p->pi_lock)) {
1496 raw_spin_unlock(&rq->lock);
1497 raw_spin_lock(&p->pi_lock);
1498 raw_spin_lock(&rq->lock);
1499 }
1500
1501 if (!(p->state & TASK_NORMAL))
1502 goto out;
1503
1504 if (!p->on_rq)
1505 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1506
1507 ttwu_do_wakeup(rq, p, 0);
1508 ttwu_stat(p, smp_processor_id(), 0);
1509out:
1510 raw_spin_unlock(&p->pi_lock);
1511}
1512
1513/**
1514 * wake_up_process - Wake up a specific process
1515 * @p: The process to be woken up.
1516 *
1517 * Attempt to wake up the nominated process and move it to the set of runnable
1518 * processes. Returns 1 if the process was woken up, 0 if it was already
1519 * running.
1520 *
1521 * It may be assumed that this function implies a write memory barrier before
1522 * changing the task state if and only if any tasks are woken up.
1523 */
1524int wake_up_process(struct task_struct *p)
1525{
1526 return try_to_wake_up(p, TASK_ALL, 0);
1527}
1528EXPORT_SYMBOL(wake_up_process);
1529
1530int wake_up_state(struct task_struct *p, unsigned int state)
1531{
1532 return try_to_wake_up(p, state, 0);
1533}
1534
1535/*
1536 * Perform scheduler related setup for a newly forked process p.
1537 * p is forked by current.
1538 *
1539 * __sched_fork() is basic setup used by init_idle() too:
1540 */
1541static void __sched_fork(struct task_struct *p)
1542{
1543 p->on_rq = 0;
1544
1545 p->se.on_rq = 0;
1546 p->se.exec_start = 0;
1547 p->se.sum_exec_runtime = 0;
1548 p->se.prev_sum_exec_runtime = 0;
1549 p->se.nr_migrations = 0;
1550 p->se.vruntime = 0;
1551 INIT_LIST_HEAD(&p->se.group_node);
1552
1553/*
1554  * Load tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below may be
1555  * removed when it becomes useful for applications beyond shares distribution (e.g.
1556 * load-balance).
1557 */
1558#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1559 p->se.avg.runnable_avg_period = 0;
1560 p->se.avg.runnable_avg_sum = 0;
1561#endif
1562#ifdef CONFIG_SCHEDSTATS
1563 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1564#endif
1565
1566 INIT_LIST_HEAD(&p->rt.run_list);
1567
1568#ifdef CONFIG_PREEMPT_NOTIFIERS
1569 INIT_HLIST_HEAD(&p->preempt_notifiers);
1570#endif
1571
1572#ifdef CONFIG_NUMA_BALANCING
1573 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1574 p->mm->numa_next_scan = jiffies;
1575 p->mm->numa_next_reset = jiffies;
1576 p->mm->numa_scan_seq = 0;
1577 }
1578
1579 p->node_stamp = 0ULL;
1580 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1581 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1582 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1583 p->numa_work.next = &p->numa_work;
1584#endif /* CONFIG_NUMA_BALANCING */
1585}
1586
1587#ifdef CONFIG_NUMA_BALANCING
1588#ifdef CONFIG_SCHED_DEBUG
1589void set_numabalancing_state(bool enabled)
1590{
1591 if (enabled)
1592 sched_feat_set("NUMA");
1593 else
1594 sched_feat_set("NO_NUMA");
1595}
1596#else
1597__read_mostly bool numabalancing_enabled;
1598
1599void set_numabalancing_state(bool enabled)
1600{
1601 numabalancing_enabled = enabled;
1602}
1603#endif /* CONFIG_SCHED_DEBUG */
1604#endif /* CONFIG_NUMA_BALANCING */
1605
1606/*
1607 * fork()/clone()-time setup:
1608 */
1609void sched_fork(struct task_struct *p)
1610{
1611 unsigned long flags;
1612 int cpu = get_cpu();
1613
1614 __sched_fork(p);
1615 /*
1616 * We mark the process as running here. This guarantees that
1617 * nobody will actually run it, and a signal or other external
1618 * event cannot wake it up and insert it on the runqueue either.
1619 */
1620 p->state = TASK_RUNNING;
1621
1622 /*
1623 * Make sure we do not leak PI boosting priority to the child.
1624 */
1625 p->prio = current->normal_prio;
1626
1627 /*
1628 * Revert to default priority/policy on fork if requested.
1629 */
1630 if (unlikely(p->sched_reset_on_fork)) {
1631 if (task_has_rt_policy(p)) {
1632 p->policy = SCHED_NORMAL;
1633 p->static_prio = NICE_TO_PRIO(0);
1634 p->rt_priority = 0;
1635 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1636 p->static_prio = NICE_TO_PRIO(0);
1637
1638 p->prio = p->normal_prio = __normal_prio(p);
1639 set_load_weight(p);
1640
1641 /*
1642 * We don't need the reset flag anymore after the fork. It has
1643 * fulfilled its duty:
1644 */
1645 p->sched_reset_on_fork = 0;
1646 }
1647
1648 if (!rt_prio(p->prio))
1649 p->sched_class = &fair_sched_class;
1650
1651 if (p->sched_class->task_fork)
1652 p->sched_class->task_fork(p);
1653
1654 /*
1655 * The child is not yet in the pid-hash so no cgroup attach races,
1656 * and the cgroup is pinned to this child because cgroup_fork()
1657 * runs before sched_fork().
1658 *
1659 * Silence PROVE_RCU.
1660 */
1661 raw_spin_lock_irqsave(&p->pi_lock, flags);
1662 set_task_cpu(p, cpu);
1663 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1664
1665#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1666 if (likely(sched_info_on()))
1667 memset(&p->sched_info, 0, sizeof(p->sched_info));
1668#endif
1669#if defined(CONFIG_SMP)
1670 p->on_cpu = 0;
1671#endif
1672#ifdef CONFIG_PREEMPT_COUNT
1673 /* Want to start with kernel preemption disabled. */
1674 task_thread_info(p)->preempt_count = 1;
1675#endif
1676#ifdef CONFIG_SMP
1677 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1678#endif
1679
1680 put_cpu();
1681}
1682
1683/*
1684 * wake_up_new_task - wake up a newly created task for the first time.
1685 *
1686 * This function will do some initial scheduler statistics housekeeping
1687 * that must be done for every newly created context, then puts the task
1688 * on the runqueue and wakes it.
1689 */
1690void wake_up_new_task(struct task_struct *p)
1691{
1692 unsigned long flags;
1693 struct rq *rq;
1694
1695 raw_spin_lock_irqsave(&p->pi_lock, flags);
1696#ifdef CONFIG_SMP
1697 /*
1698 * Fork balancing, do it here and not earlier because:
1699 * - cpus_allowed can change in the fork path
1700 * - any previously selected cpu might disappear through hotplug
1701 */
1702 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1703#endif
1704
1705 rq = __task_rq_lock(p);
1706 activate_task(rq, p, 0);
1707 p->on_rq = 1;
1708 trace_sched_wakeup_new(p, true);
1709 check_preempt_curr(rq, p, WF_FORK);
1710#ifdef CONFIG_SMP
1711 if (p->sched_class->task_woken)
1712 p->sched_class->task_woken(rq, p);
1713#endif
1714 task_rq_unlock(rq, p, &flags);
1715}
1716
1717#ifdef CONFIG_PREEMPT_NOTIFIERS
1718
1719/**
1720 * preempt_notifier_register - tell me when current is being preempted & rescheduled
1721 * @notifier: notifier struct to register
1722 */
1723void preempt_notifier_register(struct preempt_notifier *notifier)
1724{
1725 hlist_add_head(&notifier->link, &current->preempt_notifiers);
1726}
1727EXPORT_SYMBOL_GPL(preempt_notifier_register);
1728
1729/**
1730 * preempt_notifier_unregister - no longer interested in preemption notifications
1731 * @notifier: notifier struct to unregister
1732 *
1733 * This is safe to call from within a preemption notifier.
1734 */
1735void preempt_notifier_unregister(struct preempt_notifier *notifier)
1736{
1737 hlist_del(&notifier->link);
1738}
1739EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
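
/*
 * A minimal usage sketch, not part of this file: a caller supplies its own
 * preempt_ops and registers a notifier for the current task (the my_*
 * names below are placeholders; preempt_notifier_init() is the
 * <linux/preempt.h> helper that points the notifier at its ops):
 *
 *	static void my_sched_in(struct preempt_notifier *pn, int cpu) { ... }
 *	static void my_sched_out(struct preempt_notifier *pn,
 *				 struct task_struct *next) { ... }
 *	static struct preempt_ops my_ops = {
 *		.sched_in	= my_sched_in,
 *		.sched_out	= my_sched_out,
 *	};
 *
 *	preempt_notifier_init(&my_notifier, &my_ops);
 *	preempt_notifier_register(&my_notifier);
 *
 * This mirrors how KVM tracks vcpu load/put across preemption.
 */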
1740
1741static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1742{
1743 struct preempt_notifier *notifier;
1744 struct hlist_node *node;
1745
1746 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1747 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1748}
1749
1750static void
1751fire_sched_out_preempt_notifiers(struct task_struct *curr,
1752 struct task_struct *next)
1753{
1754 struct preempt_notifier *notifier;
1755 struct hlist_node *node;
1756
1757 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1758 notifier->ops->sched_out(notifier, next);
1759}
1760
1761#else /* !CONFIG_PREEMPT_NOTIFIERS */
1762
1763static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1764{
1765}
1766
1767static void
1768fire_sched_out_preempt_notifiers(struct task_struct *curr,
1769 struct task_struct *next)
1770{
1771}
1772
1773#endif /* CONFIG_PREEMPT_NOTIFIERS */
1774
1775/**
1776 * prepare_task_switch - prepare to switch tasks
1777 * @rq: the runqueue preparing to switch
1778 * @prev: the current task that is being switched out
1779 * @next: the task we are going to switch to.
1780 *
1781 * This is called with the rq lock held and interrupts off. It must
1782 * be paired with a subsequent finish_task_switch after the context
1783 * switch.
1784 *
1785 * prepare_task_switch sets up locking and calls architecture specific
1786 * hooks.
1787 */
1788static inline void
1789prepare_task_switch(struct rq *rq, struct task_struct *prev,
1790 struct task_struct *next)
1791{
1792 trace_sched_switch(prev, next);
1793 sched_info_switch(prev, next);
1794 perf_event_task_sched_out(prev, next);
1795 fire_sched_out_preempt_notifiers(prev, next);
1796 prepare_lock_switch(rq, next);
1797 prepare_arch_switch(next);
1798}
1799
1800/**
1801 * finish_task_switch - clean up after a task-switch
1802 * @rq: runqueue associated with task-switch
1803 * @prev: the thread we just switched away from.
1804 *
1805 * finish_task_switch must be called after the context switch, paired
1806 * with a prepare_task_switch call before the context switch.
1807 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1808 * and do any other architecture-specific cleanup actions.
1809 *
1810 * Note that we may have delayed dropping an mm in context_switch(). If
1811 * so, we finish that here outside of the runqueue lock. (Doing it
1812 * with the lock held can cause deadlocks; see schedule() for
1813 * details.)
1814 */
1815static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1816 __releases(rq->lock)
1817{
1818 struct mm_struct *mm = rq->prev_mm;
1819 long prev_state;
1820
1821 rq->prev_mm = NULL;
1822
1823 /*
1824 * A task struct has one reference for the use as "current".
1825 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1826 * schedule one last time. The schedule call will never return, and
1827 * the scheduled task must drop that reference.
1828 * The test for TASK_DEAD must occur while the runqueue locks are
1829 * still held, otherwise prev could be scheduled on another cpu, die
1830 * there before we look at prev->state, and then the reference would
1831 * be dropped twice.
1832 * Manfred Spraul <manfred@colorfullife.com>
1833 */
1834 prev_state = prev->state;
1835 vtime_task_switch(prev);
1836 finish_arch_switch(prev);
1837 perf_event_task_sched_in(prev, current);
1838 finish_lock_switch(rq, prev);
1839 finish_arch_post_lock_switch();
1840
1841 fire_sched_in_preempt_notifiers(current);
1842 if (mm)
1843 mmdrop(mm);
1844 if (unlikely(prev_state == TASK_DEAD)) {
1845 /*
1846 * Remove function-return probe instances associated with this
1847 * task and put them back on the free list.
1848 */
1849 kprobe_flush_task(prev);
1850 put_task_struct(prev);
1851 }
1852}
1853
1854#ifdef CONFIG_SMP
1855
1856/* assumes rq->lock is held */
1857static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1858{
1859 if (prev->sched_class->pre_schedule)
1860 prev->sched_class->pre_schedule(rq, prev);
1861}
1862
1863/* rq->lock is NOT held, but preemption is disabled */
1864static inline void post_schedule(struct rq *rq)
1865{
1866 if (rq->post_schedule) {
1867 unsigned long flags;
1868
1869 raw_spin_lock_irqsave(&rq->lock, flags);
1870 if (rq->curr->sched_class->post_schedule)
1871 rq->curr->sched_class->post_schedule(rq);
1872 raw_spin_unlock_irqrestore(&rq->lock, flags);
1873
1874 rq->post_schedule = 0;
1875 }
1876}
1877
1878#else
1879
1880static inline void pre_schedule(struct rq *rq, struct task_struct *p)
1881{
1882}
1883
1884static inline void post_schedule(struct rq *rq)
1885{
1886}
1887
1888#endif
1889
1890/**
1891 * schedule_tail - first thing a freshly forked thread must call.
1892 * @prev: the thread we just switched away from.
1893 */
1894asmlinkage void schedule_tail(struct task_struct *prev)
1895 __releases(rq->lock)
1896{
1897 struct rq *rq = this_rq();
1898
1899 finish_task_switch(rq, prev);
1900
1901 /*
1902 * FIXME: do we need to worry about rq being invalidated by the
1903 * task_switch?
1904 */
1905 post_schedule(rq);
1906
1907#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1908 /* In this case, finish_task_switch does not reenable preemption */
1909 preempt_enable();
1910#endif
1911 if (current->set_child_tid)
1912 put_user(task_pid_vnr(current), current->set_child_tid);
1913}
1914
1915/*
1916 * context_switch - switch to the new MM and the new
1917 * thread's register state.
1918 */
1919static inline void
1920context_switch(struct rq *rq, struct task_struct *prev,
1921 struct task_struct *next)
1922{
1923 struct mm_struct *mm, *oldmm;
1924
1925 prepare_task_switch(rq, prev, next);
1926
1927 mm = next->mm;
1928 oldmm = prev->active_mm;
1929 /*
1930 * For paravirt, this is coupled with an exit in switch_to to
1931 * combine the page table reload and the switch backend into
1932 * one hypercall.
1933 */
1934 arch_start_context_switch(prev);
1935
1936 if (!mm) {
1937 next->active_mm = oldmm;
1938 atomic_inc(&oldmm->mm_count);
1939 enter_lazy_tlb(oldmm, next);
1940 } else
1941 switch_mm(oldmm, mm, next);
1942
1943 if (!prev->mm) {
1944 prev->active_mm = NULL;
1945 rq->prev_mm = oldmm;
1946 }
1947 /*
1948 * The runqueue lock will be released by the next
1949 * task (which is an invalid locking op, but in the case
1950 * of the scheduler it's an obvious special case), so we
1951 * do an early lockdep release here:
1952 */
1953#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1954 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1955#endif
1956
1957 context_tracking_task_switch(prev, next);
1958 /* Here we just switch the register state and the stack. */
1959 switch_to(prev, next, prev);
1960
1961 barrier();
1962 /*
1963 * this_rq must be evaluated again because prev may have moved
1964 * CPUs since it called schedule(), thus the 'rq' on its stack
1965 * frame will be invalid.
1966 */
1967 finish_task_switch(this_rq(), prev);
1968}
1969
1970/*
1971 * nr_running, nr_uninterruptible and nr_context_switches:
1972 *
1973 * externally visible scheduler statistics: current number of runnable
1974 * threads, current number of uninterruptible-sleeping threads, total
1975 * number of context switches performed since bootup.
1976 */
1977unsigned long nr_running(void)
1978{
1979 unsigned long i, sum = 0;
1980
1981 for_each_online_cpu(i)
1982 sum += cpu_rq(i)->nr_running;
1983
1984 return sum;
1985}
1986
1987unsigned long nr_uninterruptible(void)
1988{
1989 unsigned long i, sum = 0;
1990
1991 for_each_possible_cpu(i)
1992 sum += cpu_rq(i)->nr_uninterruptible;
1993
1994 /*
1995 * Since we read the counters lockless, it might be slightly
1996 * inaccurate. Do not allow it to go below zero though:
1997 */
1998 if (unlikely((long)sum < 0))
1999 sum = 0;
2000
2001 return sum;
2002}
2003
2004unsigned long long nr_context_switches(void)
2005{
2006 int i;
2007 unsigned long long sum = 0;
2008
2009 for_each_possible_cpu(i)
2010 sum += cpu_rq(i)->nr_switches;
2011
2012 return sum;
2013}
2014
2015unsigned long nr_iowait(void)
2016{
2017 unsigned long i, sum = 0;
2018
2019 for_each_possible_cpu(i)
2020 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2021
2022 return sum;
2023}
2024
2025unsigned long nr_iowait_cpu(int cpu)
2026{
2027 struct rq *this = cpu_rq(cpu);
2028 return atomic_read(&this->nr_iowait);
2029}
2030
2031unsigned long this_cpu_load(void)
2032{
2033 struct rq *this = this_rq();
2034 return this->cpu_load[0];
2035}
2036
2037
2038/*
2039 * Global load-average calculations
2040 *
2041 * We take a distributed and async approach to calculating the global load-avg
2042 * in order to minimize overhead.
2043 *
2044 * The global load average is an exponentially decaying average of nr_running +
2045 * nr_uninterruptible.
2046 *
2047 * Once every LOAD_FREQ:
2048 *
2049 * nr_active = 0;
2050 * for_each_possible_cpu(cpu)
2051 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2052 *
2053 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2054 *
2055 * For a number of reasons the above turns into the mess below:
2056 *
2057 * - for_each_possible_cpu() is prohibitively expensive on machines with
2058 * a serious number of cpus, therefore we need to take a distributed approach
2059 * to calculating nr_active.
2060 *
2061 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2062 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2063 *
2064 * So assuming nr_active := 0 when we start out -- true by definition -- we
2065 * can simply take per-cpu deltas and fold those into a global accumulate
2066 * to obtain the same result. See calc_load_fold_active().
2067 *
2068 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2069 * across the machine, we assume 10 ticks is sufficient time for every
2070 * cpu to have completed this task.
2071 *
2072 * This places an upper-bound on the IRQ-off latency of the machine. Then
2073 * again, being late doesn't lose the delta, it just wrecks the sample.
2074 *
2075 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2076 * this would add another cross-cpu cacheline miss and atomic operation
2077 * to the wakeup path. Instead we increment on whatever cpu the task ran
2078 * when it went into uninterruptible state and decrement on whatever cpu
2079 * did the wakeup. This means that only the sum of nr_uninterruptible over
2080 * all cpus yields the correct result.
2081 *
2082 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2083 */
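
/*
 * A worked instance of the folding identity above (illustrative only):
 * a cpu whose nr_active goes 0 -> 2 -> 5 -> 3 over successive LOAD_FREQ
 * samples contributes the deltas +2, +3, -2. Summing the deltas into the
 * global accumulator yields 3, exactly that cpu's current contribution,
 * without iterating all cpus at sample time.
 */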
2084
2085/* Variables and functions for calc_load */
2086static atomic_long_t calc_load_tasks;
2087static unsigned long calc_load_update;
2088unsigned long avenrun[3];
2089EXPORT_SYMBOL(avenrun); /* should be removed */
2090
2091/**
2092 * get_avenrun - get the load average array
2093 * @loads: pointer to dest load array
2094 * @offset: offset to add
2095 * @shift: shift count to shift the result left
2096 *
2097 * These values are estimates at best, so no need for locking.
2098 */
2099void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2100{
2101 loads[0] = (avenrun[0] + offset) << shift;
2102 loads[1] = (avenrun[1] + offset) << shift;
2103 loads[2] = (avenrun[2] + offset) << shift;
2104}
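
/*
 * A usage sketch (mirrors fs/proc/loadavg.c, shown here for illustration):
 * passing offset = FIXED_1/200 rounds to the nearest 1/100 before the
 * fixed-point value is split with the LOAD_INT()/LOAD_FRAC() helpers:
 *
 *	unsigned long avnrun[3];
 *
 *	get_avenrun(avnrun, FIXED_1/200, 0);
 *	pr_info("load: %lu.%02lu %lu.%02lu %lu.%02lu\n",
 *		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
 *		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
 *		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
 */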
2105
2106static long calc_load_fold_active(struct rq *this_rq)
2107{
2108 long nr_active, delta = 0;
2109
2110 nr_active = this_rq->nr_running;
2111 nr_active += (long) this_rq->nr_uninterruptible;
2112
2113 if (nr_active != this_rq->calc_load_active) {
2114 delta = nr_active - this_rq->calc_load_active;
2115 this_rq->calc_load_active = nr_active;
2116 }
2117
2118 return delta;
2119}
2120
2121/*
2122 * a1 = a0 * e + a * (1 - e)
2123 */
2124static unsigned long
2125calc_load(unsigned long load, unsigned long exp, unsigned long active)
2126{
2127 load *= exp;
2128 load += active * (FIXED_1 - exp);
2129 load += 1UL << (FSHIFT - 1);
2130 return load >> FSHIFT;
2131}
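
/*
 * A worked example of the update above (illustrative only), using the
 * <linux/sched.h> constants FSHIFT = 11, FIXED_1 = 2048 and EXP_1 = 1884
 * (1-minute average, LOAD_FREQ ~= 5s). Starting from load = 0 with two
 * runnable tasks, active = 2 * FIXED_1 = 4096 and:
 *
 *	load = (0 * 1884 + 4096 * (2048 - 1884) + 1024) >> 11
 *	     = 672768 >> 11 = 328
 *
 * i.e. 328/2048 ~= 0.16, matching the continuous form 2 * (1 - e^(-5/60)).
 */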
2132
2133#ifdef CONFIG_NO_HZ
2134/*
2135 * Handle NO_HZ for the global load-average.
2136 *
2137 * Since the above described distributed algorithm to compute the global
2138 * load-average relies on per-cpu sampling from the tick, it is affected by
2139 * NO_HZ.
2140 *
2141 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2142 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2143 * when we read the global state.
2144 *
2145 * Obviously reality has to ruin such a delightfully simple scheme:
2146 *
2147 * - When we go NO_HZ idle during the window, we can negate our sample
2148 * contribution, causing under-accounting.
2149 *
2150 * We avoid this by keeping two idle-delta counters and flipping them
2151 * when the window starts, thus separating old and new NO_HZ load.
2152 *
2153 * The only trick is the slight shift in index flip for read vs write.
2154 *
2155 * 0s 5s 10s 15s
2156 * +10 +10 +10 +10
2157 * |-|-----------|-|-----------|-|-----------|-|
2158 * r:0 0 1 1 0 0 1 1 0
2159 * w:0 1 1 0 0 1 1 0 0
2160 *
2161 * This ensures we'll fold the old idle contribution in this window while
2162 * accumulating the new one.
2163 *
2164 * - When we wake up from NO_HZ idle during the window, we push up our
2165 * contribution, since we effectively move our sample point to a known
2166 * busy state.
2167 *
2168 * This is solved by pushing the window forward, and thus skipping the
2169 * sample, for this cpu (effectively using the idle-delta for this cpu which
2170 * was in effect at the time the window opened). This also solves the issue
2171 * of having to deal with a cpu having been in NOHZ idle for multiple
2172 * LOAD_FREQ intervals.
2173 *
2174 * When making the ILB scale, we should try to pull this in as well.
2175 */
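
/*
 * Illustrative trace of the flip (not part of the original comment): with
 * calc_load_idx == 0, a cpu that goes idle after the window has started
 * (jiffies >= calc_load_update) writes its delta into calc_load_idle[1],
 * while calc_global_load() still folds and clears calc_load_idle[0] for
 * the window being closed. Once calc_global_nohz() increments
 * calc_load_idx, readers and writers agree on slot 1 until the next window.
 */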
2176static atomic_long_t calc_load_idle[2];
2177static int calc_load_idx;
2178
2179static inline int calc_load_write_idx(void)
2180{
2181 int idx = calc_load_idx;
2182
2183 /*
2184 * See calc_global_nohz(): if we observe the new index, we also
2185 * need to observe the new update time.
2186 */
2187 smp_rmb();
2188
2189 /*
2190 * If the folding window started, make sure we start writing in the
2191 * next idle-delta.
2192 */
2193 if (!time_before(jiffies, calc_load_update))
2194 idx++;
2195
2196 return idx & 1;
2197}
2198
2199static inline int calc_load_read_idx(void)
2200{
2201 return calc_load_idx & 1;
2202}
2203
2204void calc_load_enter_idle(void)
2205{
2206 struct rq *this_rq = this_rq();
2207 long delta;
2208
2209 /*
2210 * We're going into NOHZ mode, if there's any pending delta, fold it
2211 * into the pending idle delta.
2212 */
2213 delta = calc_load_fold_active(this_rq);
2214 if (delta) {
2215 int idx = calc_load_write_idx();
2216 atomic_long_add(delta, &calc_load_idle[idx]);
2217 }
2218}
2219
2220void calc_load_exit_idle(void)
2221{
2222 struct rq *this_rq = this_rq();
2223
2224 /*
2225 * If we're still before the sample window, we're done.
2226 */
2227 if (time_before(jiffies, this_rq->calc_load_update))
2228 return;
2229
2230 /*
2231 * We woke up inside or after the sample window, which means we're already
2232 * accounted through the nohz accounting, so skip the entire deal and
2233 * sync up for the next window.
2234 */
2235 this_rq->calc_load_update = calc_load_update;
2236 if (time_before(jiffies, this_rq->calc_load_update + 10))
2237 this_rq->calc_load_update += LOAD_FREQ;
2238}
2239
2240static long calc_load_fold_idle(void)
2241{
2242 int idx = calc_load_read_idx();
2243 long delta = 0;
2244
2245 if (atomic_long_read(&calc_load_idle[idx]))
2246 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2247
2248 return delta;
2249}
2250
2251/**
2252 * fixed_power_int - compute: x^n, in O(log n) time
2253 *
2254 * @x: base of the power
2255 * @frac_bits: fractional bits of @x
2256 * @n: power to raise @x to.
2257 *
2258 * By exploiting the relation between the definition of the natural power
2259 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
2260 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
2261 * (where: n_i \elem {0, 1}, the binary vector representing n),
2262 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
2263 * of course trivially computable in O(log_2 n), the length of our binary
2264 * vector.
2265 */
2266static unsigned long
2267fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2268{
2269 unsigned long result = 1UL << frac_bits;
2270
2271 if (n) for (;;) {
2272 if (n & 1) {
2273 result *= x;
2274 result += 1UL << (frac_bits - 1);
2275 result >>= frac_bits;
2276 }
2277 n >>= 1;
2278 if (!n)
2279 break;
2280 x *= x;
2281 x += 1UL << (frac_bits - 1);
2282 x >>= frac_bits;
2283 }
2284
2285 return result;
2286}
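
/*
 * A worked example (rounding steps omitted for clarity): for n = 5,
 * binary 101, the loop multiplies the result by x (bit 0 set), squares x
 * to x^2 and skips it (bit 1 clear), then squares again to x^4 and
 * multiplies it in (bit 2 set), giving x * x^4 = x^5 in three iterations
 * instead of five multiplications.
 */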
2287
2288/*
2289 * a1 = a0 * e + a * (1 - e)
2290 *
2291 * a2 = a1 * e + a * (1 - e)
2292 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2293 * = a0 * e^2 + a * (1 - e) * (1 + e)
2294 *
2295 * a3 = a2 * e + a * (1 - e)
2296 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2297 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2298 *
2299 * ...
2300 *
2301 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1]
2302 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2303 * = a0 * e^n + a * (1 - e^n)
2304 *
2305 * [1] application of the geometric series:
2306 *
2307 *
2308 * S_n := \Sum_{i=0}^{n} x^i = (1 - x^(n+1)) / (1 - x)
2309 *
2310 */
2311static unsigned long
2312calc_load_n(unsigned long load, unsigned long exp,
2313 unsigned long active, unsigned int n)
2314{
2315
2316 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2317}
2318
2319/*
2320 * NO_HZ can leave us missing all per-cpu ticks calling
2321 * calc_load_account_active(), but since an idle CPU folds its delta into
2322 * calc_load_idle[] via calc_load_enter_idle(), all we need to do is fold
2323 * in the pending idle delta if our idle period crossed a load cycle boundary.
2324 *
2325 * Once we've updated the global active value, we need to apply the exponential
2326 * weights adjusted to the number of cycles missed.
2327 */
2328static void calc_global_nohz(void)
2329{
2330 long delta, active, n;
2331
2332 if (!time_before(jiffies, calc_load_update + 10)) {
2333 /*
2334 * Catch-up, fold however many we are behind still
2335 */
2336 delta = jiffies - calc_load_update - 10;
2337 n = 1 + (delta / LOAD_FREQ);
2338
2339 active = atomic_long_read(&calc_load_tasks);
2340 active = active > 0 ? active * FIXED_1 : 0;
2341
2342 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2343 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2344 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2345
2346 calc_load_update += n * LOAD_FREQ;
2347 }
2348
2349 /*
2350 * Flip the idle index...
2351 *
2352 * Make sure we first write the new time then flip the index, so that
2353 * calc_load_write_idx() will see the new time when it reads the new
2354 * index, this avoids a double flip messing things up.
2355 */
2356 smp_wmb();
2357 calc_load_idx++;
2358}
2359#else /* !CONFIG_NO_HZ */
2360
2361static inline long calc_load_fold_idle(void) { return 0; }
2362static inline void calc_global_nohz(void) { }
2363
2364#endif /* CONFIG_NO_HZ */
2365
2366/*
2367 * calc_global_load - update the avenrun load estimates 10 ticks after the
2368 * CPUs have updated calc_load_tasks.
2369 */
2370void calc_global_load(unsigned long ticks)
2371{
2372 long active, delta;
2373
2374 if (time_before(jiffies, calc_load_update + 10))
2375 return;
2376
2377 /*
2378 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2379 */
2380 delta = calc_load_fold_idle();
2381 if (delta)
2382 atomic_long_add(delta, &calc_load_tasks);
2383
2384 active = atomic_long_read(&calc_load_tasks);
2385 active = active > 0 ? active * FIXED_1 : 0;
2386
2387 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2388 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2389 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2390
2391 calc_load_update += LOAD_FREQ;
2392
2393 /*
2394 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2395 */
2396 calc_global_nohz();
2397}
2398
2399/*
2400 * Called from update_cpu_load() to periodically update this CPU's
2401 * active count.
2402 */
2403static void calc_load_account_active(struct rq *this_rq)
2404{
2405 long delta;
2406
2407 if (time_before(jiffies, this_rq->calc_load_update))
2408 return;
2409
2410 delta = calc_load_fold_active(this_rq);
2411 if (delta)
2412 atomic_long_add(delta, &calc_load_tasks);
2413
2414 this_rq->calc_load_update += LOAD_FREQ;
2415}
2416
2417/*
2418 * End of global load-average stuff
2419 */
2420
2421/*
2422 * The exact cpuload at various idx values, calculated at every tick would be
2423 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2424 *
2425 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
2426 * on the nth tick, when the cpu may be busy, then we have:
2427 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2428 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2429 *
2430 * decay_load_missed() below does efficient calculation of
2431 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2432 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2433 *
2434 * The calculation is approximated on a 128 point scale.
2435 * degrade_zero_ticks is the number of ticks after which load at any
2436 * particular idx is approximated to be zero.
2437 * degrade_factor is a precomputed table, a row for each load idx.
2438 * Each column corresponds to the degradation factor for a power-of-two
2439 * number of ticks, based on the 128 point scale.
2440 * Example:
2441 * row 2, col 3 (=12) says that the degradation at load idx 2 after
2442 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2443 *
2444 * With these power-of-2 load factors, we can degrade the load n times
2445 * by looking at the 1 bits in n and doing as many mult/shifts instead of
2446 * the n mult/shifts needed by the exact degradation.
2447 */
2448#define DEGRADE_SHIFT 7
2449static const unsigned char
2450 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2451static const unsigned char
2452 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2453 {0, 0, 0, 0, 0, 0, 0, 0},
2454 {64, 32, 8, 0, 0, 0, 0, 0},
2455 {96, 72, 40, 12, 1, 0, 0},
2456 {112, 98, 75, 43, 15, 1, 0},
2457 {120, 112, 98, 76, 45, 16, 2} };
2458
2459/*
2460 * Update cpu_load for any missed ticks due to tickless idle. The backlog
2461 * only builds up while the CPU is idle, so we just decay the old load
2462 * without adding any new load.
2463 */
2464static unsigned long
2465decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2466{
2467 int j = 0;
2468
2469 if (!missed_updates)
2470 return load;
2471
2472 if (missed_updates >= degrade_zero_ticks[idx])
2473 return 0;
2474
2475 if (idx == 1)
2476 return load >> missed_updates;
2477
2478 while (missed_updates) {
2479 if (missed_updates % 2)
2480 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2481
2482 missed_updates >>= 1;
2483 j++;
2484 }
2485 return load;
2486}
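
/*
 * A worked example (illustrative only): idx = 2, missed_updates = 10
 * (binary 1010). The loop applies degrade_factor[2][1] = 72 for the '2'
 * bit and degrade_factor[2][3] = 12 for the '8' bit, so
 * load ~= load * 72/128 * 12/128 ~= 0.053 * load, close to the exact
 * (3/4)^10 ~= 0.056 while using only two multiply/shift pairs.
 */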
2487
2488/*
2489 * Update rq->cpu_load[] statistics. This function is usually called every
2490 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2491 * every tick. We fix it up based on jiffies.
2492 */
2493static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2494 unsigned long pending_updates)
2495{
2496 int i, scale;
2497
2498 this_rq->nr_load_updates++;
2499
2500 /* Update our load: */
2501 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2502 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2503 unsigned long old_load, new_load;
2504
2505 /* scale is effectively 1 << i now, and >> i divides by scale */
2506
2507 old_load = this_rq->cpu_load[i];
2508 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2509 new_load = this_load;
2510 /*
2511 * Round up the averaging division if load is increasing. This
2512 * prevents us from getting stuck on 9 if the load is 10, for
2513 * example.
2514 */
2515 if (new_load > old_load)
2516 new_load += scale - 1;
2517
2518 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2519 }
2520
2521 sched_avg_update(this_rq);
2522}
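
/*
 * A worked example of the rounding above (illustrative only): for i = 2
 * (scale = 4), old_load = 9 and this_load = 10, the increase bumps
 * new_load to 13 and cpu_load[2] = (9 * 3 + 13) >> 2 = 10, whereas
 * without the "scale - 1" bump the result would stay stuck at
 * (9 * 3 + 10) >> 2 = 9.
 */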
2523
2524#ifdef CONFIG_NO_HZ
2525/*
2526 * There is no sane way to deal with nohz on smp when using jiffies because the
2527 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading,
2528 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2529 *
2530 * Therefore we cannot use the delta approach from the regular tick since that
2531 * would seriously skew the load calculation. However we'll make do for those
2532 * updates happening while idle (nohz_idle_balance) or coming out of idle
2533 * (tick_nohz_idle_exit).
2534 *
2535 * This means we might still be one tick off for nohz periods.
2536 */
2537
2538/*
2539 * Called from nohz_idle_balance() to update the load ratings before doing the
2540 * idle balance.
2541 */
2542void update_idle_cpu_load(struct rq *this_rq)
2543{
2544 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2545 unsigned long load = this_rq->load.weight;
2546 unsigned long pending_updates;
2547
2548 /*
2549 * bail if there's load or we're actually up-to-date.
2550 */
2551 if (load || curr_jiffies == this_rq->last_load_update_tick)
2552 return;
2553
2554 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2555 this_rq->last_load_update_tick = curr_jiffies;
2556
2557 __update_cpu_load(this_rq, load, pending_updates);
2558}
2559
2560/*
2561 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2562 */
2563void update_cpu_load_nohz(void)
2564{
2565 struct rq *this_rq = this_rq();
2566 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2567 unsigned long pending_updates;
2568
2569 if (curr_jiffies == this_rq->last_load_update_tick)
2570 return;
2571
2572 raw_spin_lock(&this_rq->lock);
2573 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2574 if (pending_updates) {
2575 this_rq->last_load_update_tick = curr_jiffies;
2576 /*
2577 * We were idle, which means a load of 0; the current load might be
2578 * !0 due to remote wakeups and the like.
2579 */
2580 __update_cpu_load(this_rq, 0, pending_updates);
2581 }
2582 raw_spin_unlock(&this_rq->lock);
2583}
2584#endif /* CONFIG_NO_HZ */
2585
2586/*
2587 * Called from scheduler_tick()
2588 */
2589static void update_cpu_load_active(struct rq *this_rq)
2590{
2591 /*
2592 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2593 */
2594 this_rq->last_load_update_tick = jiffies;
2595 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2596
2597 calc_load_account_active(this_rq);
2598}
2599
2600#ifdef CONFIG_SMP
2601
2602/*
2603 * sched_exec - execve() is a valuable balancing opportunity, because at
2604 * this point the task has the smallest effective memory and cache footprint.
2605 */
2606void sched_exec(void)
2607{
2608 struct task_struct *p = current;
2609 unsigned long flags;
2610 int dest_cpu;
2611
2612 raw_spin_lock_irqsave(&p->pi_lock, flags);
2613 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2614 if (dest_cpu == smp_processor_id())
2615 goto unlock;
2616
2617 if (likely(cpu_active(dest_cpu))) {
2618 struct migration_arg arg = { p, dest_cpu };
2619
2620 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2621 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2622 return;
2623 }
2624unlock:
2625 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2626}
2627
2628#endif
2629
2630DEFINE_PER_CPU(struct kernel_stat, kstat);
2631DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2632
2633EXPORT_PER_CPU_SYMBOL(kstat);
2634EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2635
2636/*
2637 * Return any ns on the sched_clock that have not yet been accounted in
2638 * @p in case that task is currently running.
2639 *
2640 * Called with task_rq_lock() held on @rq.
2641 */
2642static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2643{
2644 u64 ns = 0;
2645
2646 if (task_current(rq, p)) {
2647 update_rq_clock(rq);
2648 ns = rq->clock_task - p->se.exec_start;
2649 if ((s64)ns < 0)
2650 ns = 0;
2651 }
2652
2653 return ns;
2654}
2655
2656unsigned long long task_delta_exec(struct task_struct *p)
2657{
2658 unsigned long flags;
2659 struct rq *rq;
2660 u64 ns = 0;
2661
2662 rq = task_rq_lock(p, &flags);
2663 ns = do_task_delta_exec(p, rq);
2664 task_rq_unlock(rq, p, &flags);
2665
2666 return ns;
2667}
2668
2669/*
2670 * Return accounted runtime for the task.
2671 * In case the task is currently running, return the runtime plus current's
2672 * pending runtime that has not been accounted yet.
2673 */
2674unsigned long long task_sched_runtime(struct task_struct *p)
2675{
2676 unsigned long flags;
2677 struct rq *rq;
2678 u64 ns = 0;
2679
2680 rq = task_rq_lock(p, &flags);
2681 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2682 task_rq_unlock(rq, p, &flags);
2683
2684 return ns;
2685}
2686
2687/*
2688 * This function gets called by the timer code, with HZ frequency.
2689 * We call it with interrupts disabled.
2690 */
2691void scheduler_tick(void)
2692{
2693 int cpu = smp_processor_id();
2694 struct rq *rq = cpu_rq(cpu);
2695 struct task_struct *curr = rq->curr;
2696
2697 sched_clock_tick();
2698
2699 raw_spin_lock(&rq->lock);
2700 update_rq_clock(rq);
2701 update_cpu_load_active(rq);
2702 curr->sched_class->task_tick(rq, curr, 0);
2703 raw_spin_unlock(&rq->lock);
2704
2705 perf_event_task_tick();
2706
2707#ifdef CONFIG_SMP
2708 rq->idle_balance = idle_cpu(cpu);
2709 trigger_load_balance(rq, cpu);
2710#endif
2711}
2712
2713notrace unsigned long get_parent_ip(unsigned long addr)
2714{
2715 if (in_lock_functions(addr)) {
2716 addr = CALLER_ADDR2;
2717 if (in_lock_functions(addr))
2718 addr = CALLER_ADDR3;
2719 }
2720 return addr;
2721}
2722
2723#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2724 defined(CONFIG_PREEMPT_TRACER))
2725
2726void __kprobes add_preempt_count(int val)
2727{
2728#ifdef CONFIG_DEBUG_PREEMPT
2729 /*
2730 * Underflow?
2731 */
2732 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2733 return;
2734#endif
2735 preempt_count() += val;
2736#ifdef CONFIG_DEBUG_PREEMPT
2737 /*
2738 * Spinlock count overflowing soon?
2739 */
2740 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2741 PREEMPT_MASK - 10);
2742#endif
2743 if (preempt_count() == val)
2744 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2745}
2746EXPORT_SYMBOL(add_preempt_count);
2747
2748void __kprobes sub_preempt_count(int val)
2749{
2750#ifdef CONFIG_DEBUG_PREEMPT
2751 /*
2752 * Underflow?
2753 */
2754 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2755 return;
2756 /*
2757 * Is the spinlock portion underflowing?
2758 */
2759 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2760 !(preempt_count() & PREEMPT_MASK)))
2761 return;
2762#endif
2763
2764 if (preempt_count() == val)
2765 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2766 preempt_count() -= val;
2767}
2768EXPORT_SYMBOL(sub_preempt_count);
2769
2770#endif
2771
2772/*
2773 * Print scheduling while atomic bug:
2774 */
2775static noinline void __schedule_bug(struct task_struct *prev)
2776{
2777 if (oops_in_progress)
2778 return;
2779
2780 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2781 prev->comm, prev->pid, preempt_count());
2782
2783 debug_show_held_locks(prev);
2784 print_modules();
2785 if (irqs_disabled())
2786 print_irqtrace_events(prev);
2787 dump_stack();
2788 add_taint(TAINT_WARN);
2789}
2790
2791/*
2792 * Various schedule()-time debugging checks and statistics:
2793 */
2794static inline void schedule_debug(struct task_struct *prev)
2795{
2796 /*
2797 * Test if we are atomic. Since do_exit() needs to call into
2798 * schedule() atomically, we ignore that path for now.
2799 * Otherwise, whine if we are scheduling when we should not be.
2800 */
2801 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
2802 __schedule_bug(prev);
2803 rcu_sleep_check();
2804
2805 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2806
2807 schedstat_inc(this_rq(), sched_count);
2808}
2809
2810static void put_prev_task(struct rq *rq, struct task_struct *prev)
2811{
2812 if (prev->on_rq || rq->skip_clock_update < 0)
2813 update_rq_clock(rq);
2814 prev->sched_class->put_prev_task(rq, prev);
2815}
2816
2817/*
2818 * Pick up the highest-prio task:
2819 */
2820static inline struct task_struct *
2821pick_next_task(struct rq *rq)
2822{
2823 const struct sched_class *class;
2824 struct task_struct *p;
2825
2826 /*
2827 * Optimization: we know that if all tasks are in
2828 * the fair class we can call that function directly:
2829 */
2830 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
2831 p = fair_sched_class.pick_next_task(rq);
2832 if (likely(p))
2833 return p;
2834 }
2835
2836 for_each_class(class) {
2837 p = class->pick_next_task(rq);
2838 if (p)
2839 return p;
2840 }
2841
2842 BUG(); /* the idle class will always have a runnable task */
2843}
2844
2845/*
2846 * __schedule() is the main scheduler function.
2847 *
2848 * The main means of driving the scheduler and thus entering this function are:
2849 *
2850 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
2851 *
2852 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
2853 * paths. For example, see arch/x86/entry_64.S.
2854 *
2855 * To drive preemption between tasks, the scheduler sets the flag in timer
2856 * interrupt handler scheduler_tick().
2857 *
2858 * 3. Wakeups don't really cause entry into schedule(). They add a
2859 * task to the run-queue and that's it.
2860 *
2861 * Now, if the new task added to the run-queue preempts the current
2862 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
2863 * called on the nearest possible occasion:
2864 *
2865 * - If the kernel is preemptible (CONFIG_PREEMPT=y):
2866 *
2867 * - in syscall or exception context, at the next outermost
2868 * preempt_enable(). (this might be as soon as the wake_up()'s
2869 * spin_unlock()!)
2870 *
2871 * - in IRQ context, return from interrupt-handler to
2872 * preemptible context
2873 *
2874 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
2875 * then at the next:
2876 *
2877 * - cond_resched() call
2878 * - explicit schedule() call
2879 * - return from syscall or exception to user-space
2880 * - return from interrupt-handler to user-space
2881 */
2882static void __sched __schedule(void)
2883{
2884 struct task_struct *prev, *next;
2885 unsigned long *switch_count;
2886 struct rq *rq;
2887 int cpu;
2888
2889need_resched:
2890 preempt_disable();
2891 cpu = smp_processor_id();
2892 rq = cpu_rq(cpu);
2893 rcu_note_context_switch(cpu);
2894 prev = rq->curr;
2895
2896 schedule_debug(prev);
2897
2898 if (sched_feat(HRTICK))
2899 hrtick_clear(rq);
2900
2901 raw_spin_lock_irq(&rq->lock);
2902
2903 switch_count = &prev->nivcsw;
2904 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2905 if (unlikely(signal_pending_state(prev->state, prev))) {
2906 prev->state = TASK_RUNNING;
2907 } else {
2908 deactivate_task(rq, prev, DEQUEUE_SLEEP);
2909 prev->on_rq = 0;
2910
2911 /*
2912 * If a worker went to sleep, notify and ask workqueue
2913 * whether it wants to wake up a task to maintain
2914 * concurrency.
2915 */
2916 if (prev->flags & PF_WQ_WORKER) {
2917 struct task_struct *to_wakeup;
2918
2919 to_wakeup = wq_worker_sleeping(prev, cpu);
2920 if (to_wakeup)
2921 try_to_wake_up_local(to_wakeup);
2922 }
2923 }
2924 switch_count = &prev->nvcsw;
2925 }
2926
2927 pre_schedule(rq, prev);
2928
2929 if (unlikely(!rq->nr_running))
2930 idle_balance(cpu, rq);
2931
2932 put_prev_task(rq, prev);
2933 next = pick_next_task(rq);
2934 clear_tsk_need_resched(prev);
2935 rq->skip_clock_update = 0;
2936
2937 if (likely(prev != next)) {
2938 rq->nr_switches++;
2939 rq->curr = next;
2940 ++*switch_count;
2941
2942 context_switch(rq, prev, next); /* unlocks the rq */
2943 /*
2944 * The context switch has flipped the stack from under us
2945 * and restored the local variables which were saved when
2946 * this task called schedule() in the past. prev == current
2947 * is still correct, but it can be moved to another cpu/rq.
2948 */
2949 cpu = smp_processor_id();
2950 rq = cpu_rq(cpu);
2951 } else
2952 raw_spin_unlock_irq(&rq->lock);
2953
2954 post_schedule(rq);
2955
2956 sched_preempt_enable_no_resched();
2957 if (need_resched())
2958 goto need_resched;
2959}
2960
2961static inline void sched_submit_work(struct task_struct *tsk)
2962{
2963 if (!tsk->state || tsk_is_pi_blocked(tsk))
2964 return;
2965 /*
2966 * If we are going to sleep and we have plugged IO queued,
2967 * make sure to submit it to avoid deadlocks.
2968 */
2969 if (blk_needs_flush_plug(tsk))
2970 blk_schedule_flush_plug(tsk);
2971}
2972
2973asmlinkage void __sched schedule(void)
2974{
2975 struct task_struct *tsk = current;
2976
2977 sched_submit_work(tsk);
2978 __schedule();
2979}
2980EXPORT_SYMBOL(schedule);
2981
2982#ifdef CONFIG_CONTEXT_TRACKING
2983asmlinkage void __sched schedule_user(void)
2984{
2985 /*
2986 * If we come here after a random call to set_need_resched(),
2987 * or we have been woken up remotely but the IPI has not yet arrived,
2988 * we haven't yet exited the RCU idle mode. Do it here manually until
2989 * we find a better solution.
2990 */
2991 user_exit();
2992 schedule();
2993 user_enter();
2994}
2995#endif
2996
2997/**
2998 * schedule_preempt_disabled - called with preemption disabled
2999 *
3000 * Returns with preemption disabled. Note: preempt_count must be 1
3001 */
3002void __sched schedule_preempt_disabled(void)
3003{
3004 sched_preempt_enable_no_resched();
3005 schedule();
3006 preempt_disable();
3007}
3008
3009#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3010
3011static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3012{
3013 if (lock->owner != owner)
3014 return false;
3015
3016 /*
3017 * Ensure we emit the owner->on_cpu dereference _after_ checking that
3018 * lock->owner still matches owner. If that fails, owner might
3019 * point to free()d memory; if it still matches, the rcu_read_lock()
3020 * ensures the memory stays valid.
3021 */
3022 barrier();
3023
3024 return owner->on_cpu;
3025}
3026
3027/*
3028 * Look out! "owner" is an entirely speculative pointer
3029 * access and not reliable.
3030 */
3031int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3032{
3033 if (!sched_feat(OWNER_SPIN))
3034 return 0;
3035
3036 rcu_read_lock();
3037 while (owner_running(lock, owner)) {
3038 if (need_resched())
3039 break;
3040
3041 arch_mutex_cpu_relax();
3042 }
3043 rcu_read_unlock();
3044
3045 /*
3046 * We break out of the loop above on need_resched() and when the
3047 * owner changed, which is a sign of heavy contention. Return
3048 * success only when lock->owner is NULL.
3049 */
3050 return lock->owner == NULL;
3051}
3052#endif
3053
3054#ifdef CONFIG_PREEMPT
3055/*
3056 * This is the entry point to schedule() from in-kernel preemption
3057 * off of preempt_enable(). Kernel preemption off the return-from-interrupt
3058 * path happens in preempt_schedule_irq() instead, which calls the scheduler directly.
3059 */
3060asmlinkage void __sched notrace preempt_schedule(void)
3061{
3062 struct thread_info *ti = current_thread_info();
3063
3064 /*
3065 * If there is a non-zero preempt_count or interrupts are disabled,
3066 * we do not want to preempt the current task. Just return..
3067 */
3068 if (likely(ti->preempt_count || irqs_disabled()))
3069 return;
3070
3071 do {
3072 add_preempt_count_notrace(PREEMPT_ACTIVE);
3073 __schedule();
3074 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3075
3076 /*
3077 * Check again in case we missed a preemption opportunity
3078 * between schedule and now.
3079 */
3080 barrier();
3081 } while (need_resched());
3082}
3083EXPORT_SYMBOL(preempt_schedule);
3084
3085/*
3086 * This is the entry point to schedule() from kernel preemption
3087 * off of irq context.
3088 * Note that this is called and returns with irqs disabled. This
3089 * protects us against recursive calls from irq context.
3090 */
3091asmlinkage void __sched preempt_schedule_irq(void)
3092{
3093 struct thread_info *ti = current_thread_info();
3094
3095 /* Catch callers which need to be fixed */
3096 BUG_ON(ti->preempt_count || !irqs_disabled());
3097
3098 user_exit();
3099 do {
3100 add_preempt_count(PREEMPT_ACTIVE);
3101 local_irq_enable();
3102 __schedule();
3103 local_irq_disable();
3104 sub_preempt_count(PREEMPT_ACTIVE);
3105
3106 /*
3107 * Check again in case we missed a preemption opportunity
3108 * between schedule and now.
3109 */
3110 barrier();
3111 } while (need_resched());
3112}
3113
3114#endif /* CONFIG_PREEMPT */
3115
3116int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3117 void *key)
3118{
3119 return try_to_wake_up(curr->private, mode, wake_flags);
3120}
3121EXPORT_SYMBOL(default_wake_function);
3122
3123/*
3124 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3125 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3126 * number) then we wake all the non-exclusive tasks and one exclusive task.
3127 *
3128 * There are circumstances in which we can try to wake a task which has already
3129 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3130 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3131 */
3132static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3133 int nr_exclusive, int wake_flags, void *key)
3134{
3135 wait_queue_t *curr, *next;
3136
3137 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3138 unsigned flags = curr->flags;
3139
3140 if (curr->func(curr, mode, wake_flags, key) &&
3141 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3142 break;
3143 }
3144}
3145
3146/**
3147 * __wake_up - wake up threads blocked on a waitqueue.
3148 * @q: the waitqueue
3149 * @mode: which threads
3150 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3151 * @key: is directly passed to the wakeup function
3152 *
3153 * It may be assumed that this function implies a write memory barrier before
3154 * changing the task state if and only if any tasks are woken up.
3155 */
3156void __wake_up(wait_queue_head_t *q, unsigned int mode,
3157 int nr_exclusive, void *key)
3158{
3159 unsigned long flags;
3160
3161 spin_lock_irqsave(&q->lock, flags);
3162 __wake_up_common(q, mode, nr_exclusive, 0, key);
3163 spin_unlock_irqrestore(&q->lock, flags);
3164}
3165EXPORT_SYMBOL(__wake_up);
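
/*
 * A usage sketch, for illustration only: __wake_up() is normally reached
 * through the wake_up*() macros in <linux/wait.h>. A typical pairing, with
 * 'my_wq' and 'data_ready' as placeholder names:
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *
 *	// consumer
 *	wait_event_interruptible(my_wq, data_ready);
 *
 *	// producer
 *	data_ready = true;
 *	wake_up(&my_wq);	// __wake_up(&my_wq, TASK_NORMAL, 1, NULL)
 */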
3166
3167/*
3168 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3169 */
3170void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3171{
3172 __wake_up_common(q, mode, nr, 0, NULL);
3173}
3174EXPORT_SYMBOL_GPL(__wake_up_locked);
3175
3176void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3177{
3178 __wake_up_common(q, mode, 1, 0, key);
3179}
3180EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3181
3182/**
3183 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3184 * @q: the waitqueue
3185 * @mode: which threads
3186 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3187 * @key: opaque value to be passed to wakeup targets
3188 *
3189 * The sync wakeup differs in that the waker knows that it will schedule
3190 * away soon, so while the target thread will be woken up, it will not
3191 * be migrated to another CPU - ie. the two threads are 'synchronized'
3192 * with each other. This can prevent needless bouncing between CPUs.
3193 *
3194 * On UP it can prevent extra preemption.
3195 *
3196 * It may be assumed that this function implies a write memory barrier before
3197 * changing the task state if and only if any tasks are woken up.
3198 */
3199void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3200 int nr_exclusive, void *key)
3201{
3202 unsigned long flags;
3203 int wake_flags = WF_SYNC;
3204
3205 if (unlikely(!q))
3206 return;
3207
3208 if (unlikely(!nr_exclusive))
3209 wake_flags = 0;
3210
3211 spin_lock_irqsave(&q->lock, flags);
3212 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3213 spin_unlock_irqrestore(&q->lock, flags);
3214}
3215EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3216
3217/*
3218 * __wake_up_sync - see __wake_up_sync_key()
3219 */
3220void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3221{
3222 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3223}
3224EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3225
3226/**
3227 * complete: - signals a single thread waiting on this completion
3228 * @x: holds the state of this particular completion
3229 *
3230 * This will wake up a single thread waiting on this completion. Threads will be
3231 * awakened in the same order in which they were queued.
3232 *
3233 * See also complete_all(), wait_for_completion() and related routines.
3234 *
3235 * It may be assumed that this function implies a write memory barrier before
3236 * changing the task state if and only if any tasks are woken up.
3237 */
3238void complete(struct completion *x)
3239{
3240 unsigned long flags;
3241
3242 spin_lock_irqsave(&x->wait.lock, flags);
3243 x->done++;
3244 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3245 spin_unlock_irqrestore(&x->wait.lock, flags);
3246}
3247EXPORT_SYMBOL(complete);
3248
3249/**
3250 * complete_all: - signals all threads waiting on this completion
3251 * @x: holds the state of this particular completion
3252 *
3253 * This will wake up all threads waiting on this particular completion event.
3254 *
3255 * It may be assumed that this function implies a write memory barrier before
3256 * changing the task state if and only if any tasks are woken up.
3257 */
3258void complete_all(struct completion *x)
3259{
3260 unsigned long flags;
3261
3262 spin_lock_irqsave(&x->wait.lock, flags);
3263 x->done += UINT_MAX/2;
3264 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3265 spin_unlock_irqrestore(&x->wait.lock, flags);
3266}
3267EXPORT_SYMBOL(complete_all);
3268
3269static inline long __sched
3270do_wait_for_common(struct completion *x, long timeout, int state)
3271{
3272 if (!x->done) {
3273 DECLARE_WAITQUEUE(wait, current);
3274
3275 __add_wait_queue_tail_exclusive(&x->wait, &wait);
3276 do {
3277 if (signal_pending_state(state, current)) {
3278 timeout = -ERESTARTSYS;
3279 break;
3280 }
3281 __set_current_state(state);
3282 spin_unlock_irq(&x->wait.lock);
3283 timeout = schedule_timeout(timeout);
3284 spin_lock_irq(&x->wait.lock);
3285 } while (!x->done && timeout);
3286 __remove_wait_queue(&x->wait, &wait);
3287 if (!x->done)
3288 return timeout;
3289 }
3290 x->done--;
3291 return timeout ?: 1;
3292}
3293
3294static long __sched
3295wait_for_common(struct completion *x, long timeout, int state)
3296{
3297 might_sleep();
3298
3299 spin_lock_irq(&x->wait.lock);
3300 timeout = do_wait_for_common(x, timeout, state);
3301 spin_unlock_irq(&x->wait.lock);
3302 return timeout;
3303}
3304
3305/**
3306 * wait_for_completion: - waits for completion of a task
3307 * @x: holds the state of this particular completion
3308 *
3309 * This waits to be signaled for completion of a specific task. It is NOT
3310 * interruptible and there is no timeout.
3311 *
3312 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
3313 * and interrupt capability. Also see complete().
3314 */
3315void __sched wait_for_completion(struct completion *x)
3316{
3317 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3318}
3319EXPORT_SYMBOL(wait_for_completion);
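
/*
 * A usage sketch, for illustration only ('done' and start_my_work() are
 * placeholders): the usual pairing of complete() with wait_for_completion()
 * when waiting for asynchronous work to finish:
 *
 *	DECLARE_COMPLETION_ONSTACK(done);
 *
 *	start_my_work(&done);		// worker calls complete(&done) when finished
 *	wait_for_completion(&done);	// sleeps in TASK_UNINTERRUPTIBLE until then
 */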
3320
3321/**
3322 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
3323 * @x: holds the state of this particular completion
3324 * @timeout: timeout value in jiffies
3325 *
3326 * This waits for either a completion of a specific task to be signaled or for a
3327 * specified timeout to expire. The timeout is in jiffies. It is not
3328 * interruptible.
3329 *
3330 * The return value is 0 if timed out, and positive (at least 1, or number of
3331 * jiffies left till timeout) if completed.
3332 */
3333unsigned long __sched
3334wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3335{
3336 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3337}
3338EXPORT_SYMBOL(wait_for_completion_timeout);
3339
3340/**
3341 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3342 * @x: holds the state of this particular completion
3343 *
3344 * This waits for completion of a specific task to be signaled. It is
3345 * interruptible.
3346 *
3347 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3348 */
3349int __sched wait_for_completion_interruptible(struct completion *x)
3350{
3351 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3352 if (t == -ERESTARTSYS)
3353 return t;
3354 return 0;
3355}
3356EXPORT_SYMBOL(wait_for_completion_interruptible);
3357
3358/**
3359 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
3360 * @x: holds the state of this particular completion
3361 * @timeout: timeout value in jiffies
3362 *
3363 * This waits for either a completion of a specific task to be signaled or for a
3364 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3365 *
3366 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3367 * positive (at least 1, or number of jiffies left till timeout) if completed.
3368 */
3369long __sched
3370wait_for_completion_interruptible_timeout(struct completion *x,
3371 unsigned long timeout)
3372{
3373 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3374}
3375EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3376
3377/**
3378 * wait_for_completion_killable: - waits for completion of a task (killable)
3379 * @x: holds the state of this particular completion
3380 *
3381 * This waits to be signaled for completion of a specific task. It can be
3382 * interrupted by a kill signal.
3383 *
3384 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3385 */
3386int __sched wait_for_completion_killable(struct completion *x)
3387{
3388 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3389 if (t == -ERESTARTSYS)
3390 return t;
3391 return 0;
3392}
3393EXPORT_SYMBOL(wait_for_completion_killable);
3394
3395/**
3396 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
3397 * @x: holds the state of this particular completion
3398 * @timeout: timeout value in jiffies
3399 *
3400 * This waits for either a completion of a specific task to be
3401 * signaled or for a specified timeout to expire. It can be
3402 * interrupted by a kill signal. The timeout is in jiffies.
3403 *
3404 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3405 * positive (at least 1, or number of jiffies left till timeout) if completed.
3406 */
3407long __sched
3408wait_for_completion_killable_timeout(struct completion *x,
3409 unsigned long timeout)
3410{
3411 return wait_for_common(x, timeout, TASK_KILLABLE);
3412}
3413EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3414
3415/**
3416 * try_wait_for_completion - try to decrement a completion without blocking
3417 * @x: completion structure
3418 *
3419 * Returns: 0 if a decrement cannot be done without blocking
3420 * 1 if a decrement succeeded.
3421 *
3422 * If a completion is being used as a counting completion,
3423 * attempt to decrement the counter without blocking. This
3424 * enables us to avoid waiting if the resource the completion
3425 * is protecting is not available.
3426 */
3427bool try_wait_for_completion(struct completion *x)
3428{
3429 unsigned long flags;
3430 int ret = 1;
3431
3432 spin_lock_irqsave(&x->wait.lock, flags);
3433 if (!x->done)
3434 ret = 0;
3435 else
3436 x->done--;
3437 spin_unlock_irqrestore(&x->wait.lock, flags);
3438 return ret;
3439}
3440EXPORT_SYMBOL(try_wait_for_completion);
3441
3442/**
3443 * completion_done - Test to see if a completion has any waiters
3444 * @x: completion structure
3445 *
3446 * Returns: 0 if there are waiters (wait_for_completion() in progress)
3447 * 1 if there are no waiters.
3448 *
3449 */
3450bool completion_done(struct completion *x)
3451{
3452 unsigned long flags;
3453 int ret = 1;
3454
3455 spin_lock_irqsave(&x->wait.lock, flags);
3456 if (!x->done)
3457 ret = 0;
3458 spin_unlock_irqrestore(&x->wait.lock, flags);
3459 return ret;
3460}
3461EXPORT_SYMBOL(completion_done);
3462
3463static long __sched
3464sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3465{
3466 unsigned long flags;
3467 wait_queue_t wait;
3468
3469 init_waitqueue_entry(&wait, current);
3470
3471 __set_current_state(state);
3472
3473 spin_lock_irqsave(&q->lock, flags);
3474 __add_wait_queue(q, &wait);
3475 spin_unlock(&q->lock);
3476 timeout = schedule_timeout(timeout);
3477 spin_lock_irq(&q->lock);
3478 __remove_wait_queue(q, &wait);
3479 spin_unlock_irqrestore(&q->lock, flags);
3480
3481 return timeout;
3482}
3483
3484void __sched interruptible_sleep_on(wait_queue_head_t *q)
3485{
3486 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3487}
3488EXPORT_SYMBOL(interruptible_sleep_on);
3489
3490long __sched
3491interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3492{
3493 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3494}
3495EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3496
3497void __sched sleep_on(wait_queue_head_t *q)
3498{
3499 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3500}
3501EXPORT_SYMBOL(sleep_on);
3502
3503long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3504{
3505 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3506}
3507EXPORT_SYMBOL(sleep_on_timeout);
3508
3509#ifdef CONFIG_RT_MUTEXES
3510
3511/*
3512 * rt_mutex_setprio - set the current priority of a task
3513 * @p: task
3514 * @prio: prio value (kernel-internal form)
3515 *
3516 * This function changes the 'effective' priority of a task. It does
3517 * not touch ->normal_prio like __setscheduler().
3518 *
3519 * Used by the rt_mutex code to implement priority inheritance logic.
3520 */
3521void rt_mutex_setprio(struct task_struct *p, int prio)
3522{
3523 int oldprio, on_rq, running;
3524 struct rq *rq;
3525 const struct sched_class *prev_class;
3526
3527 BUG_ON(prio < 0 || prio > MAX_PRIO);
3528
3529 rq = __task_rq_lock(p);
3530
3531 /*
3532 * Idle task boosting is a no-no in general. There is one
3533 * exception, when PREEMPT_RT and NOHZ is active:
3534 *
3535 * The idle task calls get_next_timer_interrupt() and holds
3536 * the timer wheel base->lock on the CPU and another CPU wants
3537 * to access the timer (probably to cancel it). We can safely
3538 * ignore the boosting request, as the idle CPU runs this code
3539 * with interrupts disabled and will complete the lock
3540 * protected section without being interrupted. So there is no
3541 * real need to boost.
3542 */
3543 if (unlikely(p == rq->idle)) {
3544 WARN_ON(p != rq->curr);
3545 WARN_ON(p->pi_blocked_on);
3546 goto out_unlock;
3547 }
3548
3549 trace_sched_pi_setprio(p, prio);
3550 oldprio = p->prio;
3551 prev_class = p->sched_class;
3552 on_rq = p->on_rq;
3553 running = task_current(rq, p);
3554 if (on_rq)
3555 dequeue_task(rq, p, 0);
3556 if (running)
3557 p->sched_class->put_prev_task(rq, p);
3558
3559 if (rt_prio(prio))
3560 p->sched_class = &rt_sched_class;
3561 else
3562 p->sched_class = &fair_sched_class;
3563
3564 p->prio = prio;
3565
3566 if (running)
3567 p->sched_class->set_curr_task(rq);
3568 if (on_rq)
3569 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3570
3571 check_class_changed(rq, p, prev_class, oldprio);
3572out_unlock:
3573 __task_rq_unlock(rq);
3574}
3575#endif
3576void set_user_nice(struct task_struct *p, long nice)
3577{
3578 int old_prio, delta, on_rq;
3579 unsigned long flags;
3580 struct rq *rq;
3581
3582 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3583 return;
3584 /*
3585 * We have to be careful, if called from sys_setpriority(),
3586 * the task might be in the middle of scheduling on another CPU.
3587 */
3588 rq = task_rq_lock(p, &flags);
3589 /*
3590 * The RT priorities are set via sched_setscheduler(), but we still
3591 * allow the 'normal' nice value to be set - but as expected
3592 * it won't have any effect on scheduling while the task remains
3593 * SCHED_FIFO/SCHED_RR:
3594 */
3595 if (task_has_rt_policy(p)) {
3596 p->static_prio = NICE_TO_PRIO(nice);
3597 goto out_unlock;
3598 }
3599 on_rq = p->on_rq;
3600 if (on_rq)
3601 dequeue_task(rq, p, 0);
3602
3603 p->static_prio = NICE_TO_PRIO(nice);
3604 set_load_weight(p);
3605 old_prio = p->prio;
3606 p->prio = effective_prio(p);
3607 delta = p->prio - old_prio;
3608
3609 if (on_rq) {
3610 enqueue_task(rq, p, 0);
3611 /*
3612 * If the task increased its priority or is running and
3613 * lowered its priority, then reschedule its CPU:
3614 */
3615 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3616 resched_task(rq->curr);
3617 }
3618out_unlock:
3619 task_rq_unlock(rq, p, &flags);
3620}
3621EXPORT_SYMBOL(set_user_nice);
3622
3623/*
3624 * can_nice - check if a task can reduce its nice value
3625 * @p: task
3626 * @nice: nice value
3627 */
3628int can_nice(const struct task_struct *p, const int nice)
3629{
3630 /* convert nice value [19,-20] to rlimit style value [1,40] */
3631 int nice_rlim = 20 - nice;
3632
3633 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3634 capable(CAP_SYS_NICE));
3635}
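/*
 * Worked example for the conversion above (editorial addition): a request
 * for nice -5 gives nice_rlim = 20 - (-5) = 25, so the change is permitted
 * only if RLIMIT_NICE is at least 25 or the task has CAP_SYS_NICE; the most
 * favourable value, nice -20, therefore needs RLIMIT_NICE = 40.
 */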
3636
3637#ifdef __ARCH_WANT_SYS_NICE
3638
3639/*
3640 * sys_nice - change the priority of the current process.
3641 * @increment: priority increment
3642 *
3643 * sys_setpriority is a more generic, but much slower function that
3644 * does similar things.
3645 */
3646SYSCALL_DEFINE1(nice, int, increment)
3647{
3648 long nice, retval;
3649
3650 /*
3651 * Setpriority might change our priority at the same moment.
3652 * We don't have to worry. Conceptually one call occurs first
3653 * and we have a single winner.
3654 */
3655 if (increment < -40)
3656 increment = -40;
3657 if (increment > 40)
3658 increment = 40;
3659
3660 nice = TASK_NICE(current) + increment;
3661 if (nice < -20)
3662 nice = -20;
3663 if (nice > 19)
3664 nice = 19;
3665
3666 if (increment < 0 && !can_nice(current, nice))
3667 return -EPERM;
3668
3669 retval = security_task_setnice(current, nice);
3670 if (retval)
3671 return retval;
3672
3673 set_user_nice(current, nice);
3674 return 0;
3675}
3676
3677#endif
3678
3679/**
3680 * task_prio - return the priority value of a given task.
3681 * @p: the task in question.
3682 *
3683 * This is the priority value as seen by users in /proc.
3684 * Normal tasks map to the range 0..39 (nice -20..19); RT tasks map
3685 * to negative values, -2 down to -100.
3686 */
3687int task_prio(const struct task_struct *p)
3688{
3689 return p->prio - MAX_RT_PRIO;
3690}
3691
3692/**
3693 * task_nice - return the nice value of a given task.
3694 * @p: the task in question.
3695 */
3696int task_nice(const struct task_struct *p)
3697{
3698 return TASK_NICE(p);
3699}
3700EXPORT_SYMBOL(task_nice);
3701
3702/**
3703 * idle_cpu - is a given cpu idle currently?
3704 * @cpu: the processor in question.
3705 */
3706int idle_cpu(int cpu)
3707{
3708 struct rq *rq = cpu_rq(cpu);
3709
3710 if (rq->curr != rq->idle)
3711 return 0;
3712
3713 if (rq->nr_running)
3714 return 0;
3715
3716#ifdef CONFIG_SMP
3717 if (!llist_empty(&rq->wake_list))
3718 return 0;
3719#endif
3720
3721 return 1;
3722}
3723
3724/**
3725 * idle_task - return the idle task for a given cpu.
3726 * @cpu: the processor in question.
3727 */
3728struct task_struct *idle_task(int cpu)
3729{
3730 return cpu_rq(cpu)->idle;
3731}
3732
3733/**
3734 * find_process_by_pid - find a process with a matching PID value.
3735 * @pid: the pid in question.
3736 */
3737static struct task_struct *find_process_by_pid(pid_t pid)
3738{
3739 return pid ? find_task_by_vpid(pid) : current;
3740}
3741
3742/* Actually do priority change: must hold rq lock. */
3743static void
3744__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3745{
3746 p->policy = policy;
3747 p->rt_priority = prio;
3748 p->normal_prio = normal_prio(p);
3749 /* we are holding p->pi_lock already */
3750 p->prio = rt_mutex_getprio(p);
3751 if (rt_prio(p->prio))
3752 p->sched_class = &rt_sched_class;
3753 else
3754 p->sched_class = &fair_sched_class;
3755 set_load_weight(p);
3756}
3757
3758/*
3759 * Check whether the target process has a UID that matches the current process's.
3760 */
3761static bool check_same_owner(struct task_struct *p)
3762{
3763 const struct cred *cred = current_cred(), *pcred;
3764 bool match;
3765
3766 rcu_read_lock();
3767 pcred = __task_cred(p);
3768 match = (uid_eq(cred->euid, pcred->euid) ||
3769 uid_eq(cred->euid, pcred->uid));
3770 rcu_read_unlock();
3771 return match;
3772}
3773
3774static int __sched_setscheduler(struct task_struct *p, int policy,
3775 const struct sched_param *param, bool user)
3776{
3777 int retval, oldprio, oldpolicy = -1, on_rq, running;
3778 unsigned long flags;
3779 const struct sched_class *prev_class;
3780 struct rq *rq;
3781 int reset_on_fork;
3782
3783 /* may grab non-irq protected spin_locks */
3784 BUG_ON(in_interrupt());
3785recheck:
3786 /* double check policy once rq lock held */
3787 if (policy < 0) {
3788 reset_on_fork = p->sched_reset_on_fork;
3789 policy = oldpolicy = p->policy;
3790 } else {
3791 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
3792 policy &= ~SCHED_RESET_ON_FORK;
3793
3794 if (policy != SCHED_FIFO && policy != SCHED_RR &&
3795 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3796 policy != SCHED_IDLE)
3797 return -EINVAL;
3798 }
3799
3800 /*
3801 * Valid priorities for SCHED_FIFO and SCHED_RR are
3802 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3803 * SCHED_BATCH and SCHED_IDLE is 0.
3804 */
3805 if (param->sched_priority < 0 ||
3806 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3807 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3808 return -EINVAL;
3809 if (rt_policy(policy) != (param->sched_priority != 0))
3810 return -EINVAL;
3811
3812 /*
3813 * Allow unprivileged RT tasks to decrease priority:
3814 */
3815 if (user && !capable(CAP_SYS_NICE)) {
3816 if (rt_policy(policy)) {
3817 unsigned long rlim_rtprio =
3818 task_rlimit(p, RLIMIT_RTPRIO);
3819
3820 /* can't set/change the rt policy */
3821 if (policy != p->policy && !rlim_rtprio)
3822 return -EPERM;
3823
3824 /* can't increase priority */
3825 if (param->sched_priority > p->rt_priority &&
3826 param->sched_priority > rlim_rtprio)
3827 return -EPERM;
3828 }
3829
3830 /*
3831 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3832 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3833 */
3834 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3835 if (!can_nice(p, TASK_NICE(p)))
3836 return -EPERM;
3837 }
3838
3839 /* can't change other user's priorities */
3840 if (!check_same_owner(p))
3841 return -EPERM;
3842
3843 /* Normal users shall not reset the sched_reset_on_fork flag */
3844 if (p->sched_reset_on_fork && !reset_on_fork)
3845 return -EPERM;
3846 }
3847
3848 if (user) {
3849 retval = security_task_setscheduler(p);
3850 if (retval)
3851 return retval;
3852 }
3853
3854 /*
3855 * make sure no PI-waiters arrive (or leave) while we are
3856 * changing the priority of the task:
3857 *
3858 * To be able to change p->policy safely, the appropriate
3859 * runqueue lock must be held.
3860 */
3861 rq = task_rq_lock(p, &flags);
3862
3863 /*
3864 * Changing the policy of the stop thread is a very bad idea.
3865 */
3866 if (p == rq->stop) {
3867 task_rq_unlock(rq, p, &flags);
3868 return -EINVAL;
3869 }
3870
3871 /*
3872 * If not changing anything there's no need to proceed further:
3873 */
3874 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
3875 param->sched_priority == p->rt_priority))) {
3876 task_rq_unlock(rq, p, &flags);
3877 return 0;
3878 }
3879
3880#ifdef CONFIG_RT_GROUP_SCHED
3881 if (user) {
3882 /*
3883 * Do not allow realtime tasks into groups that have no runtime
3884 * assigned.
3885 */
3886 if (rt_bandwidth_enabled() && rt_policy(policy) &&
3887 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3888 !task_group_is_autogroup(task_group(p))) {
3889 task_rq_unlock(rq, p, &flags);
3890 return -EPERM;
3891 }
3892 }
3893#endif
3894
3895 /* recheck policy now with rq lock held */
3896 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3897 policy = oldpolicy = -1;
3898 task_rq_unlock(rq, p, &flags);
3899 goto recheck;
3900 }
3901 on_rq = p->on_rq;
3902 running = task_current(rq, p);
3903 if (on_rq)
3904 dequeue_task(rq, p, 0);
3905 if (running)
3906 p->sched_class->put_prev_task(rq, p);
3907
3908 p->sched_reset_on_fork = reset_on_fork;
3909
3910 oldprio = p->prio;
3911 prev_class = p->sched_class;
3912 __setscheduler(rq, p, policy, param->sched_priority);
3913
3914 if (running)
3915 p->sched_class->set_curr_task(rq);
3916 if (on_rq)
3917 enqueue_task(rq, p, 0);
3918
3919 check_class_changed(rq, p, prev_class, oldprio);
3920 task_rq_unlock(rq, p, &flags);
3921
3922 rt_mutex_adjust_pi(p);
3923
3924 return 0;
3925}
3926
3927/**
3928 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3929 * @p: the task in question.
3930 * @policy: new policy.
3931 * @param: structure containing the new RT priority.
3932 *
3933 * NOTE that the task may already be dead.
3934 */
3935int sched_setscheduler(struct task_struct *p, int policy,
3936 const struct sched_param *param)
3937{
3938 return __sched_setscheduler(p, policy, param, true);
3939}
3940EXPORT_SYMBOL_GPL(sched_setscheduler);
3941
3942/**
3943 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3944 * @p: the task in question.
3945 * @policy: new policy.
3946 * @param: structure containing the new RT priority.
3947 *
3948 * Just like sched_setscheduler, only don't bother checking if the
3949 * current context has permission. For example, this is needed in
3950 * stop_machine(): we create temporary high priority worker threads,
3951 * but our caller might not have that capability.
3952 */
3953int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3954 const struct sched_param *param)
3955{
3956 return __sched_setscheduler(p, policy, param, false);
3957}
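/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * kernel code that spawns a helper thread and must make it RT regardless of
 * the caller's credentials would use the _nocheck variant roughly like this
 * ('my_fn' and the chosen priority are assumptions for the example):
 *
 *	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 *	struct task_struct *tsk = kthread_create(my_fn, NULL, "my-rt-helper");
 *
 *	if (!IS_ERR(tsk)) {
 *		sched_setscheduler_nocheck(tsk, SCHED_FIFO, &param);
 *		wake_up_process(tsk);
 *	}
 */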
3958
3959static int
3960do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3961{
3962 struct sched_param lparam;
3963 struct task_struct *p;
3964 int retval;
3965
3966 if (!param || pid < 0)
3967 return -EINVAL;
3968 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3969 return -EFAULT;
3970
3971 rcu_read_lock();
3972 retval = -ESRCH;
3973 p = find_process_by_pid(pid);
3974 if (p != NULL)
3975 retval = sched_setscheduler(p, policy, &lparam);
3976 rcu_read_unlock();
3977
3978 return retval;
3979}
3980
3981/**
3982 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3983 * @pid: the pid in question.
3984 * @policy: new policy.
3985 * @param: structure containing the new RT priority.
3986 */
3987SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3988 struct sched_param __user *, param)
3989{
3990 /* negative values for policy are not valid */
3991 if (policy < 0)
3992 return -EINVAL;
3993
3994 return do_sched_setscheduler(pid, policy, param);
3995}
3996
3997/**
3998 * sys_sched_setparam - set/change the RT priority of a thread
3999 * @pid: the pid in question.
4000 * @param: structure containing the new RT priority.
4001 */
4002SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4003{
4004 return do_sched_setscheduler(pid, -1, param);
4005}
4006
4007/**
4008 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4009 * @pid: the pid in question.
4010 */
4011SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4012{
4013 struct task_struct *p;
4014 int retval;
4015
4016 if (pid < 0)
4017 return -EINVAL;
4018
4019 retval = -ESRCH;
4020 rcu_read_lock();
4021 p = find_process_by_pid(pid);
4022 if (p) {
4023 retval = security_task_getscheduler(p);
4024 if (!retval)
4025 retval = p->policy
4026 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4027 }
4028 rcu_read_unlock();
4029 return retval;
4030}
4031
4032/**
4033 * sys_sched_getparam - get the RT priority of a thread
4034 * @pid: the pid in question.
4035 * @param: structure containing the RT priority.
4036 */
4037SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4038{
4039 struct sched_param lp;
4040 struct task_struct *p;
4041 int retval;
4042
4043 if (!param || pid < 0)
4044 return -EINVAL;
4045
4046 rcu_read_lock();
4047 p = find_process_by_pid(pid);
4048 retval = -ESRCH;
4049 if (!p)
4050 goto out_unlock;
4051
4052 retval = security_task_getscheduler(p);
4053 if (retval)
4054 goto out_unlock;
4055
4056 lp.sched_priority = p->rt_priority;
4057 rcu_read_unlock();
4058
4059 /*
4060 * This one might sleep, we cannot do it with a spinlock held ...
4061 */
4062 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4063
4064 return retval;
4065
4066out_unlock:
4067 rcu_read_unlock();
4068 return retval;
4069}
4070
4071long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4072{
4073 cpumask_var_t cpus_allowed, new_mask;
4074 struct task_struct *p;
4075 int retval;
4076
4077 get_online_cpus();
4078 rcu_read_lock();
4079
4080 p = find_process_by_pid(pid);
4081 if (!p) {
4082 rcu_read_unlock();
4083 put_online_cpus();
4084 return -ESRCH;
4085 }
4086
4087 /* Prevent p going away */
4088 get_task_struct(p);
4089 rcu_read_unlock();
4090
4091 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4092 retval = -ENOMEM;
4093 goto out_put_task;
4094 }
4095 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4096 retval = -ENOMEM;
4097 goto out_free_cpus_allowed;
4098 }
4099 retval = -EPERM;
4100 if (!check_same_owner(p)) {
4101 rcu_read_lock();
4102 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4103 rcu_read_unlock();
4104 goto out_unlock;
4105 }
4106 rcu_read_unlock();
4107 }
4108
4109 retval = security_task_setscheduler(p);
4110 if (retval)
4111 goto out_unlock;
4112
4113 cpuset_cpus_allowed(p, cpus_allowed);
4114 cpumask_and(new_mask, in_mask, cpus_allowed);
4115again:
4116 retval = set_cpus_allowed_ptr(p, new_mask);
4117
4118 if (!retval) {
4119 cpuset_cpus_allowed(p, cpus_allowed);
4120 if (!cpumask_subset(new_mask, cpus_allowed)) {
4121 /*
4122 * We must have raced with a concurrent cpuset
4123 * update. Just reset the cpus_allowed to the
4124 * cpuset's cpus_allowed
4125 */
4126 cpumask_copy(new_mask, cpus_allowed);
4127 goto again;
4128 }
4129 }
4130out_unlock:
4131 free_cpumask_var(new_mask);
4132out_free_cpus_allowed:
4133 free_cpumask_var(cpus_allowed);
4134out_put_task:
4135 put_task_struct(p);
4136 put_online_cpus();
4137 return retval;
4138}
4139
4140static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4141 struct cpumask *new_mask)
4142{
4143 if (len < cpumask_size())
4144 cpumask_clear(new_mask);
4145 else if (len > cpumask_size())
4146 len = cpumask_size();
4147
4148 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4149}
4150
4151/**
4152 * sys_sched_setaffinity - set the cpu affinity of a process
4153 * @pid: pid of the process
4154 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4155 * @user_mask_ptr: user-space pointer to the new cpu mask
4156 */
4157SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4158 unsigned long __user *, user_mask_ptr)
4159{
4160 cpumask_var_t new_mask;
4161 int retval;
4162
4163 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4164 return -ENOMEM;
4165
4166 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4167 if (retval == 0)
4168 retval = sched_setaffinity(pid, new_mask);
4169 free_cpumask_var(new_mask);
4170 return retval;
4171}
4172
4173long sched_getaffinity(pid_t pid, struct cpumask *mask)
4174{
4175 struct task_struct *p;
4176 unsigned long flags;
4177 int retval;
4178
4179 get_online_cpus();
4180 rcu_read_lock();
4181
4182 retval = -ESRCH;
4183 p = find_process_by_pid(pid);
4184 if (!p)
4185 goto out_unlock;
4186
4187 retval = security_task_getscheduler(p);
4188 if (retval)
4189 goto out_unlock;
4190
4191 raw_spin_lock_irqsave(&p->pi_lock, flags);
4192 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4193 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4194
4195out_unlock:
4196 rcu_read_unlock();
4197 put_online_cpus();
4198
4199 return retval;
4200}
4201
4202/**
4203 * sys_sched_getaffinity - get the cpu affinity of a process
4204 * @pid: pid of the process
4205 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4206 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4207 */
4208SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4209 unsigned long __user *, user_mask_ptr)
4210{
4211 int ret;
4212 cpumask_var_t mask;
4213
4214 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4215 return -EINVAL;
4216 if (len & (sizeof(unsigned long)-1))
4217 return -EINVAL;
4218
4219 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4220 return -ENOMEM;
4221
4222 ret = sched_getaffinity(pid, mask);
4223 if (ret == 0) {
4224 size_t retlen = min_t(size_t, len, cpumask_size());
4225
4226 if (copy_to_user(user_mask_ptr, mask, retlen))
4227 ret = -EFAULT;
4228 else
4229 ret = retlen;
4230 }
4231 free_cpumask_var(mask);
4232
4233 return ret;
4234}
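/*
 * Note (editorial addition): unlike the glibc wrapper, which returns 0 on
 * success, the raw syscall above returns the number of bytes copied into
 * @user_mask_ptr (at most cpumask_size()). A minimal raw user-space sketch,
 * assuming <sched.h>, <sys/syscall.h> and a libc providing CPU_ISSET():
 *
 *	cpu_set_t set;
 *	long n = syscall(SYS_sched_getaffinity, 0, sizeof(set), &set);
 *	// n > 0: the first 'n' bytes of 'set' are valid, test bits with
 *	// CPU_ISSET(); n < 0: errno holds the error
 */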
4235
4236/**
4237 * sys_sched_yield - yield the current processor to other threads.
4238 *
4239 * This function yields the current CPU to other tasks. If there are no
4240 * other threads running on this CPU then this function will return.
4241 */
4242SYSCALL_DEFINE0(sched_yield)
4243{
4244 struct rq *rq = this_rq_lock();
4245
4246 schedstat_inc(rq, yld_count);
4247 current->sched_class->yield_task(rq);
4248
4249 /*
4250 * Since we are going to call schedule() anyway, there's
4251 * no need to preempt or enable interrupts:
4252 */
4253 __release(rq->lock);
4254 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4255 do_raw_spin_unlock(&rq->lock);
4256 sched_preempt_enable_no_resched();
4257
4258 schedule();
4259
4260 return 0;
4261}
4262
4263static inline int should_resched(void)
4264{
4265 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4266}
4267
4268static void __cond_resched(void)
4269{
4270 add_preempt_count(PREEMPT_ACTIVE);
4271 __schedule();
4272 sub_preempt_count(PREEMPT_ACTIVE);
4273}
4274
4275int __sched _cond_resched(void)
4276{
4277 if (should_resched()) {
4278 __cond_resched();
4279 return 1;
4280 }
4281 return 0;
4282}
4283EXPORT_SYMBOL(_cond_resched);
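/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * long-running kernel loops call cond_resched(), which expands to a call to
 * _cond_resched() above, to offer a voluntary preemption point:
 *
 *	for (i = 0; i < nr_items; i++) {
 *		process_item(&items[i]);	// assumed per-item work
 *		cond_resched();			// let higher-priority tasks run
 *	}
 */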
4284
4285/*
4286 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4287 * call schedule, and on return reacquire the lock.
4288 *
4289 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4290 * operations here to prevent schedule() from being called twice (once via
4291 * spin_unlock(), once by hand).
4292 */
4293int __cond_resched_lock(spinlock_t *lock)
4294{
4295 int resched = should_resched();
4296 int ret = 0;
4297
4298 lockdep_assert_held(lock);
4299
4300 if (spin_needbreak(lock) || resched) {
4301 spin_unlock(lock);
4302 if (resched)
4303 __cond_resched();
4304 else
4305 cpu_relax();
4306 ret = 1;
4307 spin_lock(lock);
4308 }
4309 return ret;
4310}
4311EXPORT_SYMBOL(__cond_resched_lock);
4312
4313int __sched __cond_resched_softirq(void)
4314{
4315 BUG_ON(!in_softirq());
4316
4317 if (should_resched()) {
4318 local_bh_enable();
4319 __cond_resched();
4320 local_bh_disable();
4321 return 1;
4322 }
4323 return 0;
4324}
4325EXPORT_SYMBOL(__cond_resched_softirq);
4326
4327/**
4328 * yield - yield the current processor to other threads.
4329 *
4330 * Do not ever use this function, there's a 99% chance you're doing it wrong.
4331 *
4332 * The scheduler is at all times free to pick the calling task as the most
4333 * eligible task to run; if removing the yield() call from your code breaks
4334 * it, it's already broken.
4335 *
4336 * Typical broken usage is:
4337 *
4338 * while (!event)
4339 * yield();
4340 *
4341 * where one assumes that yield() will let 'the other' process run that will
4342 * make event true. If the current task is a SCHED_FIFO task that will never
4343 * happen. Never use yield() as a progress guarantee!!
4344 *
4345 * If you want to use yield() to wait for something, use wait_event().
4346 * If you want to use yield() to be 'nice' for others, use cond_resched().
4347 * If you still want to use yield(), do not!
4348 */
4349void __sched yield(void)
4350{
4351 set_current_state(TASK_RUNNING);
4352 sys_sched_yield();
4353}
4354EXPORT_SYMBOL(yield);
4355
4356/**
4357 * yield_to - yield the current processor to another thread in
4358 * your thread group, or accelerate that thread toward the
4359 * processor it's on.
4360 * @p: target task
4361 * @preempt: whether task preemption is allowed or not
4362 *
4363 * It's the caller's job to ensure that the target task struct
4364 * can't go away on us before we can do any checks.
4365 *
4366 * Returns true if we indeed boosted the target task.
4367 */
4368bool __sched yield_to(struct task_struct *p, bool preempt)
4369{
4370 struct task_struct *curr = current;
4371 struct rq *rq, *p_rq;
4372 unsigned long flags;
4373 bool yielded = false;
4374
4375 local_irq_save(flags);
4376 rq = this_rq();
4377
4378again:
4379 p_rq = task_rq(p);
4380 double_rq_lock(rq, p_rq);
4381 while (task_rq(p) != p_rq) {
4382 double_rq_unlock(rq, p_rq);
4383 goto again;
4384 }
4385
4386 if (!curr->sched_class->yield_to_task)
4387 goto out;
4388
4389 if (curr->sched_class != p->sched_class)
4390 goto out;
4391
4392 if (task_running(p_rq, p) || p->state)
4393 goto out;
4394
4395 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4396 if (yielded) {
4397 schedstat_inc(rq, yld_count);
4398 /*
4399 * Make p's CPU reschedule; pick_next_entity takes care of
4400 * fairness.
4401 */
4402 if (preempt && rq != p_rq)
4403 resched_task(p_rq->curr);
4404 }
4405
4406out:
4407 double_rq_unlock(rq, p_rq);
4408 local_irq_restore(flags);
4409
4410 if (yielded)
4411 schedule();
4412
4413 return yielded;
4414}
4415EXPORT_SYMBOL_GPL(yield_to);
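/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * a hypervisor-style caller that has identified the task presumed to hold a
 * contended lock can boost it with yield_to(). 'target_pid' is an assumed
 * struct pid pointer; the task reference keeps it from going away:
 *
 *	struct task_struct *task = get_pid_task(target_pid, PIDTYPE_PID);
 *
 *	if (task) {
 *		yield_to(task, true);	// boost the presumed lock holder
 *		put_task_struct(task);
 *	}
 */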
4416
4417/*
4418 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4419 * that process accounting knows that this is a task in IO wait state.
4420 */
4421void __sched io_schedule(void)
4422{
4423 struct rq *rq = raw_rq();
4424
4425 delayacct_blkio_start();
4426 atomic_inc(&rq->nr_iowait);
4427 blk_flush_plug(current);
4428 current->in_iowait = 1;
4429 schedule();
4430 current->in_iowait = 0;
4431 atomic_dec(&rq->nr_iowait);
4432 delayacct_blkio_end();
4433}
4434EXPORT_SYMBOL(io_schedule);
4435
4436long __sched io_schedule_timeout(long timeout)
4437{
4438 struct rq *rq = raw_rq();
4439 long ret;
4440
4441 delayacct_blkio_start();
4442 atomic_inc(&rq->nr_iowait);
4443 blk_flush_plug(current);
4444 current->in_iowait = 1;
4445 ret = schedule_timeout(timeout);
4446 current->in_iowait = 0;
4447 atomic_dec(&rq->nr_iowait);
4448 delayacct_blkio_end();
4449 return ret;
4450}
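/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * a bounded, I/O-accounted sleep in the style of congestion_wait().
 * 'my_waitqueue' is an assumed wait_queue_head_t the sleeper is woken on:
 *
 *	long remaining;
 *	DEFINE_WAIT(wait);
 *
 *	prepare_to_wait(&my_waitqueue, &wait, TASK_UNINTERRUPTIBLE);
 *	remaining = io_schedule_timeout(HZ / 10);
 *	finish_wait(&my_waitqueue, &wait);
 *	// remaining == 0 means the 100ms budget expired without a wakeup
 */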
4451
4452/**
4453 * sys_sched_get_priority_max - return maximum RT priority.
4454 * @policy: scheduling class.
4455 *
4456 * this syscall returns the maximum rt_priority that can be used
4457 * by a given scheduling class.
4458 */
4459SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4460{
4461 int ret = -EINVAL;
4462
4463 switch (policy) {
4464 case SCHED_FIFO:
4465 case SCHED_RR:
4466 ret = MAX_USER_RT_PRIO-1;
4467 break;
4468 case SCHED_NORMAL:
4469 case SCHED_BATCH:
4470 case SCHED_IDLE:
4471 ret = 0;
4472 break;
4473 }
4474 return ret;
4475}
4476
4477/**
4478 * sys_sched_get_priority_min - return minimum RT priority.
4479 * @policy: scheduling class.
4480 *
4481 * this syscall returns the minimum rt_priority that can be used
4482 * by a given scheduling class.
4483 */
4484SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4485{
4486 int ret = -EINVAL;
4487
4488 switch (policy) {
4489 case SCHED_FIFO:
4490 case SCHED_RR:
4491 ret = 1;
4492 break;
4493 case SCHED_NORMAL:
4494 case SCHED_BATCH:
4495 case SCHED_IDLE:
4496 ret = 0;
4497 }
4498 return ret;
4499}
4500
4501/**
4502 * sys_sched_rr_get_interval - return the default timeslice of a process.
4503 * @pid: pid of the process.
4504 * @interval: userspace pointer to the timeslice value.
4505 *
4506 * this syscall writes the default timeslice value of a given process
4507 * into the user-space timespec buffer. A value of '0' means infinity.
4508 */
4509SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4510 struct timespec __user *, interval)
4511{
4512 struct task_struct *p;
4513 unsigned int time_slice;
4514 unsigned long flags;
4515 struct rq *rq;
4516 int retval;
4517 struct timespec t;
4518
4519 if (pid < 0)
4520 return -EINVAL;
4521
4522 retval = -ESRCH;
4523 rcu_read_lock();
4524 p = find_process_by_pid(pid);
4525 if (!p)
4526 goto out_unlock;
4527
4528 retval = security_task_getscheduler(p);
4529 if (retval)
4530 goto out_unlock;
4531
4532 rq = task_rq_lock(p, &flags);
4533 time_slice = p->sched_class->get_rr_interval(rq, p);
4534 task_rq_unlock(rq, p, &flags);
4535
4536 rcu_read_unlock();
4537 jiffies_to_timespec(time_slice, &t);
4538 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4539 return retval;
4540
4541out_unlock:
4542 rcu_read_unlock();
4543 return retval;
4544}
4545
4546static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4547
4548void sched_show_task(struct task_struct *p)
4549{
4550 unsigned long free = 0;
4551 int ppid;
4552 unsigned state;
4553
4554 state = p->state ? __ffs(p->state) + 1 : 0;
4555 printk(KERN_INFO "%-15.15s %c", p->comm,
4556 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4557#if BITS_PER_LONG == 32
4558 if (state == TASK_RUNNING)
4559 printk(KERN_CONT " running ");
4560 else
4561 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4562#else
4563 if (state == TASK_RUNNING)
4564 printk(KERN_CONT " running task ");
4565 else
4566 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4567#endif
4568#ifdef CONFIG_DEBUG_STACK_USAGE
4569 free = stack_not_used(p);
4570#endif
4571 rcu_read_lock();
4572 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4573 rcu_read_unlock();
4574 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4575 task_pid_nr(p), ppid,
4576 (unsigned long)task_thread_info(p)->flags);
4577
4578 show_stack(p, NULL);
4579}
4580
4581void show_state_filter(unsigned long state_filter)
4582{
4583 struct task_struct *g, *p;
4584
4585#if BITS_PER_LONG == 32
4586 printk(KERN_INFO
4587 " task PC stack pid father\n");
4588#else
4589 printk(KERN_INFO
4590 " task PC stack pid father\n");
4591#endif
4592 rcu_read_lock();
4593 do_each_thread(g, p) {
4594 /*
4595 * reset the NMI-timeout, listing all files on a slow
4596 * console might take a lot of time:
4597 */
4598 touch_nmi_watchdog();
4599 if (!state_filter || (p->state & state_filter))
4600 sched_show_task(p);
4601 } while_each_thread(g, p);
4602
4603 touch_all_softlockup_watchdogs();
4604
4605#ifdef CONFIG_SCHED_DEBUG
4606 sysrq_sched_debug_show();
4607#endif
4608 rcu_read_unlock();
4609 /*
4610 * Only show locks if all tasks are dumped:
4611 */
4612 if (!state_filter)
4613 debug_show_all_locks();
4614}
4615
4616void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4617{
4618 idle->sched_class = &idle_sched_class;
4619}
4620
4621/**
4622 * init_idle - set up an idle thread for a given CPU
4623 * @idle: task in question
4624 * @cpu: cpu the idle task belongs to
4625 *
4626 * NOTE: this function does not set the idle thread's NEED_RESCHED
4627 * flag, to make booting more robust.
4628 */
4629void __cpuinit init_idle(struct task_struct *idle, int cpu)
4630{
4631 struct rq *rq = cpu_rq(cpu);
4632 unsigned long flags;
4633
4634 raw_spin_lock_irqsave(&rq->lock, flags);
4635
4636 __sched_fork(idle);
4637 idle->state = TASK_RUNNING;
4638 idle->se.exec_start = sched_clock();
4639
4640 do_set_cpus_allowed(idle, cpumask_of(cpu));
4641 /*
4642 * We're having a chicken and egg problem, even though we are
4643 * holding rq->lock, the cpu isn't yet set to this cpu so the
4644 * lockdep check in task_group() will fail.
4645 *
4646 * Similar case to sched_fork(); alternatively we could
4647 * use task_rq_lock() here and obtain the other rq->lock.
4648 *
4649 * Silence PROVE_RCU
4650 */
4651 rcu_read_lock();
4652 __set_task_cpu(idle, cpu);
4653 rcu_read_unlock();
4654
4655 rq->curr = rq->idle = idle;
4656#if defined(CONFIG_SMP)
4657 idle->on_cpu = 1;
4658#endif
4659 raw_spin_unlock_irqrestore(&rq->lock, flags);
4660
4661 /* Set the preempt count _outside_ the spinlocks! */
4662 task_thread_info(idle)->preempt_count = 0;
4663
4664 /*
4665 * The idle tasks have their own, simple scheduling class:
4666 */
4667 idle->sched_class = &idle_sched_class;
4668 ftrace_graph_init_idle_task(idle, cpu);
4669#if defined(CONFIG_SMP)
4670 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4671#endif
4672}
4673
4674#ifdef CONFIG_SMP
4675void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4676{
4677 if (p->sched_class && p->sched_class->set_cpus_allowed)
4678 p->sched_class->set_cpus_allowed(p, new_mask);
4679
4680 cpumask_copy(&p->cpus_allowed, new_mask);
4681 p->nr_cpus_allowed = cpumask_weight(new_mask);
4682}
4683
4684/*
4685 * This is how migration works:
4686 *
4687 * 1) we invoke migration_cpu_stop() on the target CPU using
4688 * stop_one_cpu().
4689 * 2) stopper starts to run (implicitly forcing the migrated thread
4690 * off the CPU)
4691 * 3) it checks whether the migrated task is still in the wrong runqueue.
4692 * 4) if it's in the wrong runqueue then the migration thread removes
4693 * it and puts it into the right queue.
4694 * 5) stopper completes and stop_one_cpu() returns and the migration
4695 * is done.
4696 */
4697
4698/*
4699 * Change a given task's CPU affinity. Migrate the thread to a
4700 * proper CPU and schedule it away if the CPU it's executing on
4701 * is removed from the allowed bitmask.
4702 *
4703 * NOTE: the caller must have a valid reference to the task, the
4704 * task must not exit() & deallocate itself prematurely. The
4705 * call is not atomic; no spinlocks may be held.
4706 */
4707int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4708{
4709 unsigned long flags;
4710 struct rq *rq;
4711 unsigned int dest_cpu;
4712 int ret = 0;
4713
4714 rq = task_rq_lock(p, &flags);
4715
4716 if (cpumask_equal(&p->cpus_allowed, new_mask))
4717 goto out;
4718
4719 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4720 ret = -EINVAL;
4721 goto out;
4722 }
4723
4724 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
4725 ret = -EINVAL;
4726 goto out;
4727 }
4728
4729 do_set_cpus_allowed(p, new_mask);
4730
4731 /* Can the task run on the task's current CPU? If so, we're done */
4732 if (cpumask_test_cpu(task_cpu(p), new_mask))
4733 goto out;
4734
4735 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4736 if (p->on_rq) {
4737 struct migration_arg arg = { p, dest_cpu };
4738 /* Need help from migration thread: drop lock and wait. */
4739 task_rq_unlock(rq, p, &flags);
4740 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4741 tlb_migrate_finish(p->mm);
4742 return 0;
4743 }
4744out:
4745 task_rq_unlock(rq, p, &flags);
4746
4747 return ret;
4748}
4749EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
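/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * restricting an already-running helper thread ('worker' is an assumed
 * task_struct pointer the caller holds a reference to) to one CPU:
 *
 *	if (set_cpus_allowed_ptr(worker, cpumask_of(target_cpu)))
 *		pr_warn("could not move worker to CPU %d\n", target_cpu);
 *
 * For a kthread that has not started running yet, kthread_bind() is the
 * usual interface; this is the general-purpose path that also migrates a
 * running task.
 */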
4750
4751/*
4752 * Move a (non-current) task off this cpu, onto the dest cpu. We're doing
4753 * this because either it can't run here any more (set_cpus_allowed()
4754 * away from this CPU, or CPU going down), or because we're
4755 * attempting to rebalance this task on exec (sched_exec).
4756 *
4757 * So we race with normal scheduler movements, but that's OK, as long
4758 * as the task is no longer on this CPU.
4759 *
4760 * Returns non-zero if task was successfully migrated.
4761 */
4762static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4763{
4764 struct rq *rq_dest, *rq_src;
4765 int ret = 0;
4766
4767 if (unlikely(!cpu_active(dest_cpu)))
4768 return ret;
4769
4770 rq_src = cpu_rq(src_cpu);
4771 rq_dest = cpu_rq(dest_cpu);
4772
4773 raw_spin_lock(&p->pi_lock);
4774 double_rq_lock(rq_src, rq_dest);
4775 /* Already moved. */
4776 if (task_cpu(p) != src_cpu)
4777 goto done;
4778 /* Affinity changed (again). */
4779 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4780 goto fail;
4781
4782 /*
4783 * If we're not on a rq, the next wake-up will ensure we're
4784 * placed properly.
4785 */
4786 if (p->on_rq) {
4787 dequeue_task(rq_src, p, 0);
4788 set_task_cpu(p, dest_cpu);
4789 enqueue_task(rq_dest, p, 0);
4790 check_preempt_curr(rq_dest, p, 0);
4791 }
4792done:
4793 ret = 1;
4794fail:
4795 double_rq_unlock(rq_src, rq_dest);
4796 raw_spin_unlock(&p->pi_lock);
4797 return ret;
4798}
4799
4800/*
4801 * migration_cpu_stop - this will be executed by a highprio stopper thread
4802 * and performs thread migration by bumping thread off CPU then
4803 * 'pushing' onto another runqueue.
4804 */
4805static int migration_cpu_stop(void *data)
4806{
4807 struct migration_arg *arg = data;
4808
4809 /*
4810 * The original target cpu might have gone down and we might
4811 * be on another cpu but it doesn't matter.
4812 */
4813 local_irq_disable();
4814 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4815 local_irq_enable();
4816 return 0;
4817}
4818
4819#ifdef CONFIG_HOTPLUG_CPU
4820
4821/*
4822 * Ensures that the idle task is using init_mm right before its cpu goes
4823 * offline.
4824 */
4825void idle_task_exit(void)
4826{
4827 struct mm_struct *mm = current->active_mm;
4828
4829 BUG_ON(cpu_online(smp_processor_id()));
4830
4831 if (mm != &init_mm)
4832 switch_mm(mm, &init_mm, current);
4833 mmdrop(mm);
4834}
4835
4836/*
4837 * Since this CPU is going 'away' for a while, fold any nr_active delta
4838 * we might have. Assumes we're called after migrate_tasks() so that the
4839 * nr_active count is stable.
4840 *
4841 * Also see the comment "Global load-average calculations".
4842 */
4843static void calc_load_migrate(struct rq *rq)
4844{
4845 long delta = calc_load_fold_active(rq);
4846 if (delta)
4847 atomic_long_add(delta, &calc_load_tasks);
4848}
4849
4850/*
4851 * Migrate all tasks from the rq, sleeping tasks will be migrated by
4852 * try_to_wake_up()->select_task_rq().
4853 *
4854 * Called with rq->lock held even though we're in stop_machine() and
4855 * there's no concurrency possible, we hold the required locks anyway
4856 * because of lock validation efforts.
4857 */
4858static void migrate_tasks(unsigned int dead_cpu)
4859{
4860 struct rq *rq = cpu_rq(dead_cpu);
4861 struct task_struct *next, *stop = rq->stop;
4862 int dest_cpu;
4863
4864 /*
4865 * Fudge the rq selection such that the task selection loop below
4866 * doesn't get stuck on the currently eligible stop task.
4867 *
4868 * We're currently inside stop_machine() and the rq is either stuck
4869 * in the stop_machine_cpu_stop() loop, or we're executing this code,
4870 * either way we should never end up calling schedule() until we're
4871 * done here.
4872 */
4873 rq->stop = NULL;
4874
4875 for ( ; ; ) {
4876 /*
4877 * There's this thread running, bail when that's the only
4878 * remaining thread.
4879 */
4880 if (rq->nr_running == 1)
4881 break;
4882
4883 next = pick_next_task(rq);
4884 BUG_ON(!next);
4885 next->sched_class->put_prev_task(rq, next);
4886
4887 /* Find suitable destination for @next, with force if needed. */
4888 dest_cpu = select_fallback_rq(dead_cpu, next);
4889 raw_spin_unlock(&rq->lock);
4890
4891 __migrate_task(next, dead_cpu, dest_cpu);
4892
4893 raw_spin_lock(&rq->lock);
4894 }
4895
4896 rq->stop = stop;
4897}
4898
4899#endif /* CONFIG_HOTPLUG_CPU */
4900
4901#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4902
4903static struct ctl_table sd_ctl_dir[] = {
4904 {
4905 .procname = "sched_domain",
4906 .mode = 0555,
4907 },
4908 {}
4909};
4910
4911static struct ctl_table sd_ctl_root[] = {
4912 {
4913 .procname = "kernel",
4914 .mode = 0555,
4915 .child = sd_ctl_dir,
4916 },
4917 {}
4918};
4919
4920static struct ctl_table *sd_alloc_ctl_entry(int n)
4921{
4922 struct ctl_table *entry =
4923 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4924
4925 return entry;
4926}
4927
4928static void sd_free_ctl_entry(struct ctl_table **tablep)
4929{
4930 struct ctl_table *entry;
4931
4932 /*
4933 * In the intermediate directories, both the child directory and
4934 * procname are dynamically allocated and could fail but the mode
4935 * will always be set. In the lowest directory the names are
4936 * static strings and all have proc handlers.
4937 */
4938 for (entry = *tablep; entry->mode; entry++) {
4939 if (entry->child)
4940 sd_free_ctl_entry(&entry->child);
4941 if (entry->proc_handler == NULL)
4942 kfree(entry->procname);
4943 }
4944
4945 kfree(*tablep);
4946 *tablep = NULL;
4947}
4948
4949static int min_load_idx = 0;
4950static int max_load_idx = CPU_LOAD_IDX_MAX;
4951
4952static void
4953set_table_entry(struct ctl_table *entry,
4954 const char *procname, void *data, int maxlen,
4955 umode_t mode, proc_handler *proc_handler,
4956 bool load_idx)
4957{
4958 entry->procname = procname;
4959 entry->data = data;
4960 entry->maxlen = maxlen;
4961 entry->mode = mode;
4962 entry->proc_handler = proc_handler;
4963
4964 if (load_idx) {
4965 entry->extra1 = &min_load_idx;
4966 entry->extra2 = &max_load_idx;
4967 }
4968}
4969
4970static struct ctl_table *
4971sd_alloc_ctl_domain_table(struct sched_domain *sd)
4972{
4973 struct ctl_table *table = sd_alloc_ctl_entry(13);
4974
4975 if (table == NULL)
4976 return NULL;
4977
4978 set_table_entry(&table[0], "min_interval", &sd->min_interval,
4979 sizeof(long), 0644, proc_doulongvec_minmax, false);
4980 set_table_entry(&table[1], "max_interval", &sd->max_interval,
4981 sizeof(long), 0644, proc_doulongvec_minmax, false);
4982 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
4983 sizeof(int), 0644, proc_dointvec_minmax, true);
4984 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
4985 sizeof(int), 0644, proc_dointvec_minmax, true);
4986 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
4987 sizeof(int), 0644, proc_dointvec_minmax, true);
4988 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
4989 sizeof(int), 0644, proc_dointvec_minmax, true);
4990 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
4991 sizeof(int), 0644, proc_dointvec_minmax, true);
4992 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
4993 sizeof(int), 0644, proc_dointvec_minmax, false);
4994 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
4995 sizeof(int), 0644, proc_dointvec_minmax, false);
4996 set_table_entry(&table[9], "cache_nice_tries",
4997 &sd->cache_nice_tries,
4998 sizeof(int), 0644, proc_dointvec_minmax, false);
4999 set_table_entry(&table[10], "flags", &sd->flags,
5000 sizeof(int), 0644, proc_dointvec_minmax, false);
5001 set_table_entry(&table[11], "name", sd->name,
5002 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
5003 /* &table[12] is terminator */
5004
5005 return table;
5006}
5007
5008static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5009{
5010 struct ctl_table *entry, *table;
5011 struct sched_domain *sd;
5012 int domain_num = 0, i;
5013 char buf[32];
5014
5015 for_each_domain(cpu, sd)
5016 domain_num++;
5017 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5018 if (table == NULL)
5019 return NULL;
5020
5021 i = 0;
5022 for_each_domain(cpu, sd) {
5023 snprintf(buf, 32, "domain%d", i);
5024 entry->procname = kstrdup(buf, GFP_KERNEL);
5025 entry->mode = 0555;
5026 entry->child = sd_alloc_ctl_domain_table(sd);
5027 entry++;
5028 i++;
5029 }
5030 return table;
5031}
5032
5033static struct ctl_table_header *sd_sysctl_header;
5034static void register_sched_domain_sysctl(void)
5035{
5036 int i, cpu_num = num_possible_cpus();
5037 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5038 char buf[32];
5039
5040 WARN_ON(sd_ctl_dir[0].child);
5041 sd_ctl_dir[0].child = entry;
5042
5043 if (entry == NULL)
5044 return;
5045
5046 for_each_possible_cpu(i) {
5047 snprintf(buf, 32, "cpu%d", i);
5048 entry->procname = kstrdup(buf, GFP_KERNEL);
5049 entry->mode = 0555;
5050 entry->child = sd_alloc_ctl_cpu_table(i);
5051 entry++;
5052 }
5053
5054 WARN_ON(sd_sysctl_header);
5055 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5056}
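/*
 * Editorial note (not part of the original file): with CONFIG_SCHED_DEBUG
 * and CONFIG_SYSCTL the code above exposes a tree of the form
 *
 *	/proc/sys/kernel/sched_domain/cpu<N>/domain<M>/{min_interval,
 *		max_interval, busy_idx, ..., flags, name}
 *
 * with one cpu<N> directory per possible CPU and one domain<M> level per
 * sched_domain attached to that CPU.
 */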
5057
5058/* may be called multiple times per register */
5059static void unregister_sched_domain_sysctl(void)
5060{
5061 if (sd_sysctl_header)
5062 unregister_sysctl_table(sd_sysctl_header);
5063 sd_sysctl_header = NULL;
5064 if (sd_ctl_dir[0].child)
5065 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5066}
5067#else
5068static void register_sched_domain_sysctl(void)
5069{
5070}
5071static void unregister_sched_domain_sysctl(void)
5072{
5073}
5074#endif
5075
5076static void set_rq_online(struct rq *rq)
5077{
5078 if (!rq->online) {
5079 const struct sched_class *class;
5080
5081 cpumask_set_cpu(rq->cpu, rq->rd->online);
5082 rq->online = 1;
5083
5084 for_each_class(class) {
5085 if (class->rq_online)
5086 class->rq_online(rq);
5087 }
5088 }
5089}
5090
5091static void set_rq_offline(struct rq *rq)
5092{
5093 if (rq->online) {
5094 const struct sched_class *class;
5095
5096 for_each_class(class) {
5097 if (class->rq_offline)
5098 class->rq_offline(rq);
5099 }
5100
5101 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5102 rq->online = 0;
5103 }
5104}
5105
5106/*
5107 * migration_call - callback that gets triggered when a CPU is added.
5108 * Here we can start up the necessary migration thread for the new CPU.
5109 */
5110static int __cpuinit
5111migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5112{
5113 int cpu = (long)hcpu;
5114 unsigned long flags;
5115 struct rq *rq = cpu_rq(cpu);
5116
5117 switch (action & ~CPU_TASKS_FROZEN) {
5118
5119 case CPU_UP_PREPARE:
5120 rq->calc_load_update = calc_load_update;
5121 break;
5122
5123 case CPU_ONLINE:
5124 /* Update our root-domain */
5125 raw_spin_lock_irqsave(&rq->lock, flags);
5126 if (rq->rd) {
5127 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5128
5129 set_rq_online(rq);
5130 }
5131 raw_spin_unlock_irqrestore(&rq->lock, flags);
5132 break;
5133
5134#ifdef CONFIG_HOTPLUG_CPU
5135 case CPU_DYING:
5136 sched_ttwu_pending();
5137 /* Update our root-domain */
5138 raw_spin_lock_irqsave(&rq->lock, flags);
5139 if (rq->rd) {
5140 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5141 set_rq_offline(rq);
5142 }
5143 migrate_tasks(cpu);
5144 BUG_ON(rq->nr_running != 1); /* the migration thread */
5145 raw_spin_unlock_irqrestore(&rq->lock, flags);
5146 break;
5147
5148 case CPU_DEAD:
5149 calc_load_migrate(rq);
5150 break;
5151#endif
5152 }
5153
5154 update_max_interval();
5155
5156 return NOTIFY_OK;
5157}
5158
5159/*
5160 * Register at high priority so that task migration (migrate_all_tasks)
5161 * happens before everything else. This has to be lower priority than
5162 * the notifier in the perf_event subsystem, though.
5163 */
5164static struct notifier_block __cpuinitdata migration_notifier = {
5165 .notifier_call = migration_call,
5166 .priority = CPU_PRI_MIGRATION,
5167};
5168
5169static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5170 unsigned long action, void *hcpu)
5171{
5172 switch (action & ~CPU_TASKS_FROZEN) {
5173 case CPU_STARTING:
5174 case CPU_DOWN_FAILED:
5175 set_cpu_active((long)hcpu, true);
5176 return NOTIFY_OK;
5177 default:
5178 return NOTIFY_DONE;
5179 }
5180}
5181
5182static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5183 unsigned long action, void *hcpu)
5184{
5185 switch (action & ~CPU_TASKS_FROZEN) {
5186 case CPU_DOWN_PREPARE:
5187 set_cpu_active((long)hcpu, false);
5188 return NOTIFY_OK;
5189 default:
5190 return NOTIFY_DONE;
5191 }
5192}
5193
5194static int __init migration_init(void)
5195{
5196 void *cpu = (void *)(long)smp_processor_id();
5197 int err;
5198
5199 /* Initialize migration for the boot CPU */
5200 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5201 BUG_ON(err == NOTIFY_BAD);
5202 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5203 register_cpu_notifier(&migration_notifier);
5204
5205 /* Register cpu active notifiers */
5206 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5207 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5208
5209 return 0;
5210}
5211early_initcall(migration_init);
5212#endif
5213
5214#ifdef CONFIG_SMP
5215
5216static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5217
5218#ifdef CONFIG_SCHED_DEBUG
5219
5220static __read_mostly int sched_debug_enabled;
5221
5222static int __init sched_debug_setup(char *str)
5223{
5224 sched_debug_enabled = 1;
5225
5226 return 0;
5227}
5228early_param("sched_debug", sched_debug_setup);
5229
5230static inline bool sched_debug(void)
5231{
5232 return sched_debug_enabled;
5233}
5234
5235static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5236 struct cpumask *groupmask)
5237{
5238 struct sched_group *group = sd->groups;
5239 char str[256];
5240
5241 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5242 cpumask_clear(groupmask);
5243
5244 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5245
5246 if (!(sd->flags & SD_LOAD_BALANCE)) {
5247 printk("does not load-balance\n");
5248 if (sd->parent)
5249 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5250 " has parent");
5251 return -1;
5252 }
5253
5254 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5255
5256 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5257 printk(KERN_ERR "ERROR: domain->span does not contain "
5258 "CPU%d\n", cpu);
5259 }
5260 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5261 printk(KERN_ERR "ERROR: domain->groups does not contain"
5262 " CPU%d\n", cpu);
5263 }
5264
5265 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5266 do {
5267 if (!group) {
5268 printk("\n");
5269 printk(KERN_ERR "ERROR: group is NULL\n");
5270 break;
5271 }
5272
5273 /*
5274 * Even though we initialize ->power to something semi-sane,
5275 * we leave power_orig unset. This allows us to detect if
5276 * domain iteration is still funny without causing /0 traps.
5277 */
5278 if (!group->sgp->power_orig) {
5279 printk(KERN_CONT "\n");
5280 printk(KERN_ERR "ERROR: domain->cpu_power not "
5281 "set\n");
5282 break;
5283 }
5284
5285 if (!cpumask_weight(sched_group_cpus(group))) {
5286 printk(KERN_CONT "\n");
5287 printk(KERN_ERR "ERROR: empty group\n");
5288 break;
5289 }
5290
5291 if (!(sd->flags & SD_OVERLAP) &&
5292 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5293 printk(KERN_CONT "\n");
5294 printk(KERN_ERR "ERROR: repeated CPUs\n");
5295 break;
5296 }
5297
5298 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5299
5300 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5301
5302 printk(KERN_CONT " %s", str);
5303 if (group->sgp->power != SCHED_POWER_SCALE) {
5304 printk(KERN_CONT " (cpu_power = %d)",
5305 group->sgp->power);
5306 }
5307
5308 group = group->next;
5309 } while (group != sd->groups);
5310 printk(KERN_CONT "\n");
5311
5312 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5313 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5314
5315 if (sd->parent &&
5316 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5317 printk(KERN_ERR "ERROR: parent span is not a superset "
5318 "of domain->span\n");
5319 return 0;
5320}
5321
5322static void sched_domain_debug(struct sched_domain *sd, int cpu)
5323{
5324 int level = 0;
5325
5326 if (!sched_debug_enabled)
5327 return;
5328
5329 if (!sd) {
5330 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5331 return;
5332 }
5333
5334 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5335
5336 for (;;) {
5337 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5338 break;
5339 level++;
5340 sd = sd->parent;
5341 if (!sd)
5342 break;
5343 }
5344}
5345#else /* !CONFIG_SCHED_DEBUG */
5346# define sched_domain_debug(sd, cpu) do { } while (0)
5347static inline bool sched_debug(void)
5348{
5349 return false;
5350}
5351#endif /* CONFIG_SCHED_DEBUG */
5352
5353static int sd_degenerate(struct sched_domain *sd)
5354{
5355 if (cpumask_weight(sched_domain_span(sd)) == 1)
5356 return 1;
5357
5358 /* Following flags need at least 2 groups */
5359 if (sd->flags & (SD_LOAD_BALANCE |
5360 SD_BALANCE_NEWIDLE |
5361 SD_BALANCE_FORK |
5362 SD_BALANCE_EXEC |
5363 SD_SHARE_CPUPOWER |
5364 SD_SHARE_PKG_RESOURCES)) {
5365 if (sd->groups != sd->groups->next)
5366 return 0;
5367 }
5368
5369 /* Following flags don't use groups */
5370 if (sd->flags & (SD_WAKE_AFFINE))
5371 return 0;
5372
5373 return 1;
5374}
5375
5376static int
5377sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5378{
5379 unsigned long cflags = sd->flags, pflags = parent->flags;
5380
5381 if (sd_degenerate(parent))
5382 return 1;
5383
5384 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5385 return 0;
5386
5387 /* Flags needing groups don't count if only 1 group in parent */
5388 if (parent->groups == parent->groups->next) {
5389 pflags &= ~(SD_LOAD_BALANCE |
5390 SD_BALANCE_NEWIDLE |
5391 SD_BALANCE_FORK |
5392 SD_BALANCE_EXEC |
5393 SD_SHARE_CPUPOWER |
5394 SD_SHARE_PKG_RESOURCES);
5395 if (nr_node_ids == 1)
5396 pflags &= ~SD_SERIALIZE;
5397 }
5398 if (~cflags & pflags)
5399 return 0;
5400
5401 return 1;
5402}
5403
5404static void free_rootdomain(struct rcu_head *rcu)
5405{
5406 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5407
5408 cpupri_cleanup(&rd->cpupri);
5409 free_cpumask_var(rd->rto_mask);
5410 free_cpumask_var(rd->online);
5411 free_cpumask_var(rd->span);
5412 kfree(rd);
5413}
5414
5415static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5416{
5417 struct root_domain *old_rd = NULL;
5418 unsigned long flags;
5419
5420 raw_spin_lock_irqsave(&rq->lock, flags);
5421
5422 if (rq->rd) {
5423 old_rd = rq->rd;
5424
5425 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5426 set_rq_offline(rq);
5427
5428 cpumask_clear_cpu(rq->cpu, old_rd->span);
5429
5430 /*
5431 * If we dont want to free the old_rt yet then
5432 * set old_rd to NULL to skip the freeing later
5433 * in this function:
5434 */
5435 if (!atomic_dec_and_test(&old_rd->refcount))
5436 old_rd = NULL;
5437 }
5438
5439 atomic_inc(&rd->refcount);
5440 rq->rd = rd;
5441
5442 cpumask_set_cpu(rq->cpu, rd->span);
5443 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5444 set_rq_online(rq);
5445
5446 raw_spin_unlock_irqrestore(&rq->lock, flags);
5447
5448 if (old_rd)
5449 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5450}
5451
5452static int init_rootdomain(struct root_domain *rd)
5453{
5454 memset(rd, 0, sizeof(*rd));
5455
5456 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5457 goto out;
5458 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5459 goto free_span;
5460 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5461 goto free_online;
5462
5463 if (cpupri_init(&rd->cpupri) != 0)
5464 goto free_rto_mask;
5465 return 0;
5466
5467free_rto_mask:
5468 free_cpumask_var(rd->rto_mask);
5469free_online:
5470 free_cpumask_var(rd->online);
5471free_span:
5472 free_cpumask_var(rd->span);
5473out:
5474 return -ENOMEM;
5475}
5476
5477/*
5478 * By default the system creates a single root-domain with all cpus as
5479 * members (mimicking the global state we have today).
5480 */
5481struct root_domain def_root_domain;
5482
5483static void init_defrootdomain(void)
5484{
5485 init_rootdomain(&def_root_domain);
5486
5487 atomic_set(&def_root_domain.refcount, 1);
5488}
5489
5490static struct root_domain *alloc_rootdomain(void)
5491{
5492 struct root_domain *rd;
5493
5494 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5495 if (!rd)
5496 return NULL;
5497
5498 if (init_rootdomain(rd) != 0) {
5499 kfree(rd);
5500 return NULL;
5501 }
5502
5503 return rd;
5504}
5505
5506static void free_sched_groups(struct sched_group *sg, int free_sgp)
5507{
5508 struct sched_group *tmp, *first;
5509
5510 if (!sg)
5511 return;
5512
5513 first = sg;
5514 do {
5515 tmp = sg->next;
5516
5517 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5518 kfree(sg->sgp);
5519
5520 kfree(sg);
5521 sg = tmp;
5522 } while (sg != first);
5523}
5524
5525static void free_sched_domain(struct rcu_head *rcu)
5526{
5527 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5528
5529 /*
5530 * If its an overlapping domain it has private groups, iterate and
5531 * nuke them all.
5532 */
5533 if (sd->flags & SD_OVERLAP) {
5534 free_sched_groups(sd->groups, 1);
5535 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5536 kfree(sd->groups->sgp);
5537 kfree(sd->groups);
5538 }
5539 kfree(sd);
5540}
5541
5542static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5543{
5544 call_rcu(&sd->rcu, free_sched_domain);
5545}
5546
5547static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5548{
5549 for (; sd; sd = sd->parent)
5550 destroy_sched_domain(sd, cpu);
5551}
5552
5553/*
5554 * Keep a special pointer to the highest sched_domain that has
5555 * SD_SHARE_PKG_RESOURCES set (the Last Level Cache Domain); this
5556 * allows us to avoid some pointer chasing in select_idle_sibling().
5557 *
5558 * Also keep a unique ID per domain (we use the first cpu number in
5559 * the cpumask of the domain), this allows us to quickly tell if
5560 * two cpus are in the same cache domain, see cpus_share_cache().
5561 */
5562DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5563DEFINE_PER_CPU(int, sd_llc_id);
5564
5565static void update_top_cache_domain(int cpu)
5566{
5567 struct sched_domain *sd;
5568 int id = cpu;
5569
5570 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5571 if (sd)
5572 id = cpumask_first(sched_domain_span(sd));
5573
5574 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5575 per_cpu(sd_llc_id, cpu) = id;
5576}
5577
5578/*
5579 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5580 * hold the hotplug lock.
5581 */
5582static void
5583cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5584{
5585 struct rq *rq = cpu_rq(cpu);
5586 struct sched_domain *tmp;
5587
5588 /* Remove the sched domains which do not contribute to scheduling. */
5589 for (tmp = sd; tmp; ) {
5590 struct sched_domain *parent = tmp->parent;
5591 if (!parent)
5592 break;
5593
5594 if (sd_parent_degenerate(tmp, parent)) {
5595 tmp->parent = parent->parent;
5596 if (parent->parent)
5597 parent->parent->child = tmp;
5598 destroy_sched_domain(parent, cpu);
5599 } else
5600 tmp = tmp->parent;
5601 }
5602
5603 if (sd && sd_degenerate(sd)) {
5604 tmp = sd;
5605 sd = sd->parent;
5606 destroy_sched_domain(tmp, cpu);
5607 if (sd)
5608 sd->child = NULL;
5609 }
5610
5611 sched_domain_debug(sd, cpu);
5612
5613 rq_attach_root(rq, rd);
5614 tmp = rq->sd;
5615 rcu_assign_pointer(rq->sd, sd);
5616 destroy_sched_domains(tmp, cpu);
5617
5618 update_top_cache_domain(cpu);
5619}
5620
5621/* cpus with isolated domains */
5622static cpumask_var_t cpu_isolated_map;
5623
5624/* Setup the mask of cpus configured for isolated domains */
5625static int __init isolated_cpu_setup(char *str)
5626{
5627 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5628 cpulist_parse(str, cpu_isolated_map);
5629 return 1;
5630}
5631
5632__setup("isolcpus=", isolated_cpu_setup);
5633
5634static const struct cpumask *cpu_cpu_mask(int cpu)
5635{
5636 return cpumask_of_node(cpu_to_node(cpu));
5637}
5638
5639struct sd_data {
5640 struct sched_domain **__percpu sd;
5641 struct sched_group **__percpu sg;
5642 struct sched_group_power **__percpu sgp;
5643};
5644
5645struct s_data {
5646 struct sched_domain ** __percpu sd;
5647 struct root_domain *rd;
5648};
5649
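/*
 * Stages reached by the incremental allocation in
 * __visit_domain_allocation_hell(); __free_domain_allocs() unwinds from
 * whichever stage was reported.
 */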
5650enum s_alloc {
5651 sa_rootdomain,
5652 sa_sd,
5653 sa_sd_storage,
5654 sa_none,
5655};
5656
5657struct sched_domain_topology_level;
5658
5659typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5660typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5661
5662#define SDTL_OVERLAP 0x01
5663
5664struct sched_domain_topology_level {
5665 sched_domain_init_f init;
5666 sched_domain_mask_f mask;
5667 int flags;
5668 int numa_level;
5669 struct sd_data data;
5670};
5671
5672/*
5673 * Build an iteration mask that can exclude certain CPUs from the upwards
5674 * domain traversal.
5675 *
5676 * Asymmetric node setups can result in situations where the domain tree is of
5677 * unequal depth; make sure to skip domains that already cover the entire
5678 * range.
5679 *
5680 * In that case build_sched_domains() will have terminated the iteration early
5681 * and our sibling sd spans will be empty. Domains should always include the
5682 * cpu they're built on, so check that.
5683 *
5684 */
5685static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5686{
5687 const struct cpumask *span = sched_domain_span(sd);
5688 struct sd_data *sdd = sd->private;
5689 struct sched_domain *sibling;
5690 int i;
5691
5692 for_each_cpu(i, span) {
5693 sibling = *per_cpu_ptr(sdd->sd, i);
5694 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5695 continue;
5696
5697 cpumask_set_cpu(i, sched_group_mask(sg));
5698 }
5699}
5700
5701/*
5702 * Return the canonical balance cpu for this group; this is the first cpu
5703 * of this group that's also in the iteration mask.
5704 */
5705int group_balance_cpu(struct sched_group *sg)
5706{
5707 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5708}
5709
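/*
 * Build the sched groups for an SD_OVERLAP domain: one group per
 * not-yet-covered child span, linked into a circular list; sd->groups is
 * set to the group containing the canonical balance cpu so the domain
 * iteration starts there.
 */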
5710static int
5711build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5712{
5713 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5714 const struct cpumask *span = sched_domain_span(sd);
5715 struct cpumask *covered = sched_domains_tmpmask;
5716 struct sd_data *sdd = sd->private;
5717 struct sched_domain *child;
5718 int i;
5719
5720 cpumask_clear(covered);
5721
5722 for_each_cpu(i, span) {
5723 struct cpumask *sg_span;
5724
5725 if (cpumask_test_cpu(i, covered))
5726 continue;
5727
5728 child = *per_cpu_ptr(sdd->sd, i);
5729
5730 /* See the comment near build_group_mask(). */
5731 if (!cpumask_test_cpu(i, sched_domain_span(child)))
5732 continue;
5733
5734 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5735 GFP_KERNEL, cpu_to_node(cpu));
5736
5737 if (!sg)
5738 goto fail;
5739
5740 sg_span = sched_group_cpus(sg);
5741 if (child->child) {
5742 child = child->child;
5743 cpumask_copy(sg_span, sched_domain_span(child));
5744 } else
5745 cpumask_set_cpu(i, sg_span);
5746
5747 cpumask_or(covered, covered, sg_span);
5748
5749 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5750 if (atomic_inc_return(&sg->sgp->ref) == 1)
5751 build_group_mask(sd, sg);
5752
5753 /*
5754 * Initialize sgp->power such that even if we mess up the
5755 * domains and no possible iteration will get us here, we won't
5756 * die on a /0 trap.
5757 */
5758 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5759
5760 /*
5761 * Make sure the first group of this domain contains the
5762 * canonical balance cpu. Otherwise the sched_domain iteration
5763 * breaks. See update_sg_lb_stats().
5764 */
5765 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5766 group_balance_cpu(sg) == cpu)
5767 groups = sg;
5768
5769 if (!first)
5770 first = sg;
5771 if (last)
5772 last->next = sg;
5773 last = sg;
5774 last->next = first;
5775 }
5776 sd->groups = groups;
5777
5778 return 0;
5779
5780fail:
5781 free_sched_groups(first, 0);
5782
5783 return -ENOMEM;
5784}
5785
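/*
 * Return the representative cpu of the group covering @cpu at this level
 * and, when @sg is non-NULL, hand back that group with its power
 * structure attached.
 */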
5786static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5787{
5788 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5789 struct sched_domain *child = sd->child;
5790
5791 if (child)
5792 cpu = cpumask_first(sched_domain_span(child));
5793
5794 if (sg) {
5795 *sg = *per_cpu_ptr(sdd->sg, cpu);
5796 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5797 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
5798 }
5799
5800 return cpu;
5801}
5802
5803/*
5804 * build_sched_groups will build a circular linked list of the groups
5805 * covered by the given span, will set each group's ->cpumask correctly,
5806 * and will initialize each group's ->cpu_power to 0.
5807 *
5808 * Assumes the sched_domain tree is fully constructed.
5809 */
5810static int
5811build_sched_groups(struct sched_domain *sd, int cpu)
5812{
5813 struct sched_group *first = NULL, *last = NULL;
5814 struct sd_data *sdd = sd->private;
5815 const struct cpumask *span = sched_domain_span(sd);
5816 struct cpumask *covered;
5817 int i;
5818
5819 get_group(cpu, sdd, &sd->groups);
5820 atomic_inc(&sd->groups->ref);
5821
5822 if (cpu != cpumask_first(sched_domain_span(sd)))
5823 return 0;
5824
5825 lockdep_assert_held(&sched_domains_mutex);
5826 covered = sched_domains_tmpmask;
5827
5828 cpumask_clear(covered);
5829
5830 for_each_cpu(i, span) {
5831 struct sched_group *sg;
5832 int group = get_group(i, sdd, &sg);
5833 int j;
5834
5835 if (cpumask_test_cpu(i, covered))
5836 continue;
5837
5838 cpumask_clear(sched_group_cpus(sg));
5839 sg->sgp->power = 0;
5840 cpumask_setall(sched_group_mask(sg));
5841
5842 for_each_cpu(j, span) {
5843 if (get_group(j, sdd, NULL) != group)
5844 continue;
5845
5846 cpumask_set_cpu(j, covered);
5847 cpumask_set_cpu(j, sched_group_cpus(sg));
5848 }
5849
5850 if (!first)
5851 first = sg;
5852 if (last)
5853 last->next = sg;
5854 last = sg;
5855 }
5856 last->next = first;
5857
5858 return 0;
5859}
5860
5861/*
5862 * Initialize sched groups cpu_power.
5863 *
5864 * cpu_power indicates the capacity of a sched group, which is used while
5865 * distributing the load between different sched groups in a sched domain.
5866 * Typically cpu_power for all the groups in a sched domain will be the same
5867 * unless there are asymmetries in the topology. If there are asymmetries, the
5868 * group with more cpu_power will pick up more load than the group with
5869 * less cpu_power.
5870 */
5871static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5872{
5873 struct sched_group *sg = sd->groups;
5874
5875 WARN_ON(!sd || !sg);
5876
5877 do {
5878 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5879 sg = sg->next;
5880 } while (sg != sd->groups);
5881
5882 if (cpu != group_balance_cpu(sg))
5883 return;
5884
5885 update_group_power(sd, cpu);
5886 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5887}
5888
5889int __weak arch_sd_sibling_asym_packing(void)
5890{
5891 return 0*SD_ASYM_PACKING;
5892}
5893
5894/*
5895 * Initializers for sched domains
5896 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5897 */
5898
5899#ifdef CONFIG_SCHED_DEBUG
5900# define SD_INIT_NAME(sd, type) sd->name = #type
5901#else
5902# define SD_INIT_NAME(sd, type) do { } while (0)
5903#endif
5904
5905#define SD_INIT_FUNC(type) \
5906static noinline struct sched_domain * \
5907sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5908{ \
5909 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5910 *sd = SD_##type##_INIT; \
5911 SD_INIT_NAME(sd, type); \
5912 sd->private = &tl->data; \
5913 return sd; \
5914}
5915
5916SD_INIT_FUNC(CPU)
5917#ifdef CONFIG_SCHED_SMT
5918 SD_INIT_FUNC(SIBLING)
5919#endif
5920#ifdef CONFIG_SCHED_MC
5921 SD_INIT_FUNC(MC)
5922#endif
5923#ifdef CONFIG_SCHED_BOOK
5924 SD_INIT_FUNC(BOOK)
5925#endif
5926
5927static int default_relax_domain_level = -1;
5928int sched_domain_level_max;
5929
5930static int __init setup_relax_domain_level(char *str)
5931{
5932 if (kstrtoint(str, 0, &default_relax_domain_level))
5933 pr_warn("Unable to set relax_domain_level\n");
5934
5935 return 1;
5936}
5937__setup("relax_domain_level=", setup_relax_domain_level);
5938
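/*
 * Apply the requested relax_domain_level: domains above the requested
 * level get wake/newidle balancing turned off, the remaining domains get
 * it turned on.
 */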
5939static void set_domain_attribute(struct sched_domain *sd,
5940 struct sched_domain_attr *attr)
5941{
5942 int request;
5943
5944 if (!attr || attr->relax_domain_level < 0) {
5945 if (default_relax_domain_level < 0)
5946 return;
5947 else
5948 request = default_relax_domain_level;
5949 } else
5950 request = attr->relax_domain_level;
5951 if (request < sd->level) {
5952 /* turn off idle balance on this domain */
5953 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5954 } else {
5955 /* turn on idle balance on this domain */
5956 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5957 }
5958}
5959
5960static void __sdt_free(const struct cpumask *cpu_map);
5961static int __sdt_alloc(const struct cpumask *cpu_map);
5962
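/*
 * Undo the allocations done by __visit_domain_allocation_hell(), starting
 * at the stage indicated by @what and falling through to the earlier ones.
 */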
5963static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
5964 const struct cpumask *cpu_map)
5965{
5966 switch (what) {
5967 case sa_rootdomain:
5968 if (!atomic_read(&d->rd->refcount))
5969 free_rootdomain(&d->rd->rcu); /* fall through */
5970 case sa_sd:
5971 free_percpu(d->sd); /* fall through */
5972 case sa_sd_storage:
5973 __sdt_free(cpu_map); /* fall through */
5974 case sa_none:
5975 break;
5976 }
5977}
5978
5979static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
5980 const struct cpumask *cpu_map)
5981{
5982 memset(d, 0, sizeof(*d));
5983
5984 if (__sdt_alloc(cpu_map))
5985 return sa_sd_storage;
5986 d->sd = alloc_percpu(struct sched_domain *);
5987 if (!d->sd)
5988 return sa_sd_storage;
5989 d->rd = alloc_rootdomain();
5990 if (!d->rd)
5991 return sa_sd;
5992 return sa_rootdomain;
5993}
5994
5995/*
5996 * NULL the sd_data elements we've used to build the sched_domain and
5997 * sched_group structure so that the subsequent __free_domain_allocs()
5998 * will not free the data we're using.
5999 */
6000static void claim_allocations(int cpu, struct sched_domain *sd)
6001{
6002 struct sd_data *sdd = sd->private;
6003
6004 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6005 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6006
6007 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6008 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6009
6010 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
6011 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
6012}
6013
6014#ifdef CONFIG_SCHED_SMT
6015static const struct cpumask *cpu_smt_mask(int cpu)
6016{
6017 return topology_thread_cpumask(cpu);
6018}
6019#endif
6020
6021/*
6022 * Topology list, bottom-up.
6023 */
6024static struct sched_domain_topology_level default_topology[] = {
6025#ifdef CONFIG_SCHED_SMT
6026 { sd_init_SIBLING, cpu_smt_mask, },
6027#endif
6028#ifdef CONFIG_SCHED_MC
6029 { sd_init_MC, cpu_coregroup_mask, },
6030#endif
6031#ifdef CONFIG_SCHED_BOOK
6032 { sd_init_BOOK, cpu_book_mask, },
6033#endif
6034 { sd_init_CPU, cpu_cpu_mask, },
6035 { NULL, },
6036};
6037
6038static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6039
6040#ifdef CONFIG_NUMA
6041
6042static int sched_domains_numa_levels;
6043static int *sched_domains_numa_distance;
6044static struct cpumask ***sched_domains_numa_masks;
6045static int sched_domains_curr_level;
6046
6047static inline int sd_local_flags(int level)
6048{
6049 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6050 return 0;
6051
6052 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6053}
6054
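/*
 * Initializer for a NUMA topology level: the balancing intervals are
 * scaled by the weight of this level's NUMA mask, and the wake/exec
 * balancing flags come from sd_local_flags().
 */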
6055static struct sched_domain *
6056sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6057{
6058 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6059 int level = tl->numa_level;
6060 int sd_weight = cpumask_weight(
6061 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6062
6063 *sd = (struct sched_domain){
6064 .min_interval = sd_weight,
6065 .max_interval = 2*sd_weight,
6066 .busy_factor = 32,
6067 .imbalance_pct = 125,
6068 .cache_nice_tries = 2,
6069 .busy_idx = 3,
6070 .idle_idx = 2,
6071 .newidle_idx = 0,
6072 .wake_idx = 0,
6073 .forkexec_idx = 0,
6074
6075 .flags = 1*SD_LOAD_BALANCE
6076 | 1*SD_BALANCE_NEWIDLE
6077 | 0*SD_BALANCE_EXEC
6078 | 0*SD_BALANCE_FORK
6079 | 0*SD_BALANCE_WAKE
6080 | 0*SD_WAKE_AFFINE
6081 | 0*SD_SHARE_CPUPOWER
6082 | 0*SD_SHARE_PKG_RESOURCES
6083 | 1*SD_SERIALIZE
6084 | 0*SD_PREFER_SIBLING
6085 | sd_local_flags(level)
6086 ,
6087 .last_balance = jiffies,
6088 .balance_interval = sd_weight,
6089 };
6090 SD_INIT_NAME(sd, NUMA);
6091 sd->private = &tl->data;
6092
6093 /*
6094 * Ugly hack to pass state to sd_numa_mask()...
6095 */
6096 sched_domains_curr_level = tl->numa_level;
6097
6098 return sd;
6099}
6100
6101static const struct cpumask *sd_numa_mask(int cpu)
6102{
6103 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6104}
6105
6106static void sched_numa_warn(const char *str)
6107{
6108 static int done = false;
6109 int i,j;
6110
6111 if (done)
6112 return;
6113
6114 done = true;
6115
6116 printk(KERN_WARNING "ERROR: %s\n\n", str);
6117
6118 for (i = 0; i < nr_node_ids; i++) {
6119 printk(KERN_WARNING " ");
6120 for (j = 0; j < nr_node_ids; j++)
6121 printk(KERN_CONT "%02d ", node_distance(i,j));
6122 printk(KERN_CONT "\n");
6123 }
6124 printk(KERN_WARNING "\n");
6125}
6126
6127static bool find_numa_distance(int distance)
6128{
6129 int i;
6130
6131 if (distance == node_distance(0, 0))
6132 return true;
6133
6134 for (i = 0; i < sched_domains_numa_levels; i++) {
6135 if (sched_domains_numa_distance[i] == distance)
6136 return true;
6137 }
6138
6139 return false;
6140}
6141
6142static void sched_init_numa(void)
6143{
6144 int next_distance, curr_distance = node_distance(0, 0);
6145 struct sched_domain_topology_level *tl;
6146 int level = 0;
6147 int i, j, k;
6148
6149 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6150 if (!sched_domains_numa_distance)
6151 return;
6152
6153 /*
6154 * O(nr_nodes^2) deduplicating selection sort, used to find the
6155 * unique distances in the node_distance() table.
6156 *
6157 * Assumes node_distance(0,j) includes all distances found in
6158 * node_distance(i,j), which avoids cubic time.
6159 */
6160 next_distance = curr_distance;
6161 for (i = 0; i < nr_node_ids; i++) {
6162 for (j = 0; j < nr_node_ids; j++) {
6163 for (k = 0; k < nr_node_ids; k++) {
6164 int distance = node_distance(i, k);
6165
6166 if (distance > curr_distance &&
6167 (distance < next_distance ||
6168 next_distance == curr_distance))
6169 next_distance = distance;
6170
6171 /*
6172 * While not a strong assumption, it would be nice to know
6173 * about cases where node A is connected to B but B is not
6174 * equally connected to A.
6175 */
6176 if (sched_debug() && node_distance(k, i) != distance)
6177 sched_numa_warn("Node-distance not symmetric");
6178
6179 if (sched_debug() && i && !find_numa_distance(distance))
6180 sched_numa_warn("Node-0 not representative");
6181 }
6182 if (next_distance != curr_distance) {
6183 sched_domains_numa_distance[level++] = next_distance;
6184 sched_domains_numa_levels = level;
6185 curr_distance = next_distance;
6186 } else break;
6187 }
6188
6189 /*
6190 * In case of sched_debug() we verify the above assumption.
6191 */
6192 if (!sched_debug())
6193 break;
6194 }
6195 /*
6196 * 'level' contains the number of unique distances, excluding the
6197 * identity distance node_distance(i,i).
6198 *
6199 * The sched_domains_numa_distance[] array includes the actual distance
6200 * numbers.
6201 */
6202
6203 /*
6204 * Here we temporarily reset sched_domains_numa_levels to 0.
6205 * If the allocation of the sched_domains_numa_masks[][] array fails,
6206 * the array will contain fewer than 'level' members. This could be
6207 * dangerous when we use it to iterate over sched_domains_numa_masks[][]
6208 * in other functions.
6209 *
6210 * We reset it to 'level' at the end of this function.
6211 */
6212 sched_domains_numa_levels = 0;
6213
6214 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6215 if (!sched_domains_numa_masks)
6216 return;
6217
6218 /*
6219 * Now for each level, construct a mask per node which contains all
6220 * cpus of nodes that are that many hops away from us.
6221 */
6222 for (i = 0; i < level; i++) {
6223 sched_domains_numa_masks[i] =
6224 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6225 if (!sched_domains_numa_masks[i])
6226 return;
6227
6228 for (j = 0; j < nr_node_ids; j++) {
6229 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6230 if (!mask)
6231 return;
6232
6233 sched_domains_numa_masks[i][j] = mask;
6234
6235 for (k = 0; k < nr_node_ids; k++) {
6236 if (node_distance(j, k) > sched_domains_numa_distance[i])
6237 continue;
6238
6239 cpumask_or(mask, mask, cpumask_of_node(k));
6240 }
6241 }
6242 }
6243
6244 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6245 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6246 if (!tl)
6247 return;
6248
6249 /*
6250 * Copy the default topology bits..
6251 */
6252 for (i = 0; default_topology[i].init; i++)
6253 tl[i] = default_topology[i];
6254
6255 /*
6256 * .. and append 'j' levels of NUMA goodness.
6257 */
6258 for (j = 0; j < level; i++, j++) {
6259 tl[i] = (struct sched_domain_topology_level){
6260 .init = sd_numa_init,
6261 .mask = sd_numa_mask,
6262 .flags = SDTL_OVERLAP,
6263 .numa_level = j,
6264 };
6265 }
6266
6267 sched_domain_topology = tl;
6268
6269 sched_domains_numa_levels = level;
6270}
6271
6272static void sched_domains_numa_masks_set(int cpu)
6273{
6274 int i, j;
6275 int node = cpu_to_node(cpu);
6276
6277 for (i = 0; i < sched_domains_numa_levels; i++) {
6278 for (j = 0; j < nr_node_ids; j++) {
6279 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6280 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6281 }
6282 }
6283}
6284
6285static void sched_domains_numa_masks_clear(int cpu)
6286{
6287 int i, j;
6288 for (i = 0; i < sched_domains_numa_levels; i++) {
6289 for (j = 0; j < nr_node_ids; j++)
6290 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6291 }
6292}
6293
6294/*
6295 * Update sched_domains_numa_masks[level][node] array when new cpus
6296 * are onlined.
6297 */
6298static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6299 unsigned long action,
6300 void *hcpu)
6301{
6302 int cpu = (long)hcpu;
6303
6304 switch (action & ~CPU_TASKS_FROZEN) {
6305 case CPU_ONLINE:
6306 sched_domains_numa_masks_set(cpu);
6307 break;
6308
6309 case CPU_DEAD:
6310 sched_domains_numa_masks_clear(cpu);
6311 break;
6312
6313 default:
6314 return NOTIFY_DONE;
6315 }
6316
6317 return NOTIFY_OK;
6318}
6319#else
6320static inline void sched_init_numa(void)
6321{
6322}
6323
6324static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6325 unsigned long action,
6326 void *hcpu)
6327{
6328 return 0;
6329}
6330#endif /* CONFIG_NUMA */
6331
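/*
 * Allocate the per-cpu sched_domain, sched_group and sched_group_power
 * storage for every topology level; __sdt_free() releases it again.
 */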
6332static int __sdt_alloc(const struct cpumask *cpu_map)
6333{
6334 struct sched_domain_topology_level *tl;
6335 int j;
6336
6337 for (tl = sched_domain_topology; tl->init; tl++) {
6338 struct sd_data *sdd = &tl->data;
6339
6340 sdd->sd = alloc_percpu(struct sched_domain *);
6341 if (!sdd->sd)
6342 return -ENOMEM;
6343
6344 sdd->sg = alloc_percpu(struct sched_group *);
6345 if (!sdd->sg)
6346 return -ENOMEM;
6347
6348 sdd->sgp = alloc_percpu(struct sched_group_power *);
6349 if (!sdd->sgp)
6350 return -ENOMEM;
6351
6352 for_each_cpu(j, cpu_map) {
6353 struct sched_domain *sd;
6354 struct sched_group *sg;
6355 struct sched_group_power *sgp;
6356
6357 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6358 GFP_KERNEL, cpu_to_node(j));
6359 if (!sd)
6360 return -ENOMEM;
6361
6362 *per_cpu_ptr(sdd->sd, j) = sd;
6363
6364 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6365 GFP_KERNEL, cpu_to_node(j));
6366 if (!sg)
6367 return -ENOMEM;
6368
6369 sg->next = sg;
6370
6371 *per_cpu_ptr(sdd->sg, j) = sg;
6372
6373 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6374 GFP_KERNEL, cpu_to_node(j));
6375 if (!sgp)
6376 return -ENOMEM;
6377
6378 *per_cpu_ptr(sdd->sgp, j) = sgp;
6379 }
6380 }
6381
6382 return 0;
6383}
6384
6385static void __sdt_free(const struct cpumask *cpu_map)
6386{
6387 struct sched_domain_topology_level *tl;
6388 int j;
6389
6390 for (tl = sched_domain_topology; tl->init; tl++) {
6391 struct sd_data *sdd = &tl->data;
6392
6393 for_each_cpu(j, cpu_map) {
6394 struct sched_domain *sd;
6395
6396 if (sdd->sd) {
6397 sd = *per_cpu_ptr(sdd->sd, j);
6398 if (sd && (sd->flags & SD_OVERLAP))
6399 free_sched_groups(sd->groups, 0);
6400 kfree(*per_cpu_ptr(sdd->sd, j));
6401 }
6402
6403 if (sdd->sg)
6404 kfree(*per_cpu_ptr(sdd->sg, j));
6405 if (sdd->sgp)
6406 kfree(*per_cpu_ptr(sdd->sgp, j));
6407 }
6408 free_percpu(sdd->sd);
6409 sdd->sd = NULL;
6410 free_percpu(sdd->sg);
6411 sdd->sg = NULL;
6412 free_percpu(sdd->sgp);
6413 sdd->sgp = NULL;
6414 }
6415}
6416
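/*
 * Create the sched_domain for topology level @tl on @cpu, span it with
 * the cpus of @cpu_map that fall within tl->mask(cpu), and link it above
 * @child in the domain hierarchy.
 */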
6417struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6418 struct s_data *d, const struct cpumask *cpu_map,
6419 struct sched_domain_attr *attr, struct sched_domain *child,
6420 int cpu)
6421{
6422 struct sched_domain *sd = tl->init(tl, cpu);
6423 if (!sd)
6424 return child;
6425
6426 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6427 if (child) {
6428 sd->level = child->level + 1;
6429 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6430 child->parent = sd;
6431 }
6432 sd->child = child;
6433 set_domain_attribute(sd, attr);
6434
6435 return sd;
6436}
6437
6438/*
6439 * Build sched domains for a given set of cpus and attach the sched domains
6440 * to the individual cpus
6441 */
6442static int build_sched_domains(const struct cpumask *cpu_map,
6443 struct sched_domain_attr *attr)
6444{
6445 enum s_alloc alloc_state = sa_none;
6446 struct sched_domain *sd;
6447 struct s_data d;
6448 int i, ret = -ENOMEM;
6449
6450 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6451 if (alloc_state != sa_rootdomain)
6452 goto error;
6453
6454 /* Set up domains for cpus specified by the cpu_map. */
6455 for_each_cpu(i, cpu_map) {
6456 struct sched_domain_topology_level *tl;
6457
6458 sd = NULL;
6459 for (tl = sched_domain_topology; tl->init; tl++) {
6460 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
6461 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6462 sd->flags |= SD_OVERLAP;
6463 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6464 break;
6465 }
6466
6467 while (sd->child)
6468 sd = sd->child;
6469
6470 *per_cpu_ptr(d.sd, i) = sd;
6471 }
6472
6473 /* Build the groups for the domains */
6474 for_each_cpu(i, cpu_map) {
6475 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6476 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6477 if (sd->flags & SD_OVERLAP) {
6478 if (build_overlap_sched_groups(sd, i))
6479 goto error;
6480 } else {
6481 if (build_sched_groups(sd, i))
6482 goto error;
6483 }
6484 }
6485 }
6486
6487 /* Calculate CPU power for physical packages and nodes */
6488 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6489 if (!cpumask_test_cpu(i, cpu_map))
6490 continue;
6491
6492 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6493 claim_allocations(i, sd);
6494 init_sched_groups_power(i, sd);
6495 }
6496 }
6497
6498 /* Attach the domains */
6499 rcu_read_lock();
6500 for_each_cpu(i, cpu_map) {
6501 sd = *per_cpu_ptr(d.sd, i);
6502 cpu_attach_domain(sd, d.rd, i);
6503 }
6504 rcu_read_unlock();
6505
6506 ret = 0;
6507error:
6508 __free_domain_allocs(&d, alloc_state, cpu_map);
6509 return ret;
6510}
6511
6512static cpumask_var_t *doms_cur; /* current sched domains */
6513static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6514static struct sched_domain_attr *dattr_cur;
6515 /* attributes of custom domains in 'doms_cur' */
6516
6517/*
6518 * Special case: If a kmalloc of a doms_cur partition (array of
6519 * cpumask) fails, then fall back to a single sched domain,
6520 * as determined by the single cpumask fallback_doms.
6521 */
6522static cpumask_var_t fallback_doms;
6523
6524/*
6525 * arch_update_cpu_topology lets virtualized architectures update the
6526 * cpu core maps. It is supposed to return 1 if the topology changed
6527 * or 0 if it stayed the same.
6528 */
6529int __attribute__((weak)) arch_update_cpu_topology(void)
6530{
6531 return 0;
6532}
6533
6534cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6535{
6536 int i;
6537 cpumask_var_t *doms;
6538
6539 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6540 if (!doms)
6541 return NULL;
6542 for (i = 0; i < ndoms; i++) {
6543 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6544 free_sched_domains(doms, i);
6545 return NULL;
6546 }
6547 }
6548 return doms;
6549}
6550
6551void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6552{
6553 unsigned int i;
6554 for (i = 0; i < ndoms; i++)
6555 free_cpumask_var(doms[i]);
6556 kfree(doms);
6557}
6558
6559/*
6560 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6561 * For now this just excludes isolated cpus, but could be used to
6562 * exclude other special cases in the future.
6563 */
6564static int init_sched_domains(const struct cpumask *cpu_map)
6565{
6566 int err;
6567
6568 arch_update_cpu_topology();
6569 ndoms_cur = 1;
6570 doms_cur = alloc_sched_domains(ndoms_cur);
6571 if (!doms_cur)
6572 doms_cur = &fallback_doms;
6573 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6574 err = build_sched_domains(doms_cur[0], NULL);
6575 register_sched_domain_sysctl();
6576
6577 return err;
6578}
6579
6580/*
6581 * Detach sched domains from a group of cpus specified in cpu_map.
6582 * These cpus will now be attached to the NULL domain.
6583 */
6584static void detach_destroy_domains(const struct cpumask *cpu_map)
6585{
6586 int i;
6587
6588 rcu_read_lock();
6589 for_each_cpu(i, cpu_map)
6590 cpu_attach_domain(NULL, &def_root_domain, i);
6591 rcu_read_unlock();
6592}
6593
6594/* handle null as "default" */
6595static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6596 struct sched_domain_attr *new, int idx_new)
6597{
6598 struct sched_domain_attr tmp;
6599
6600 /* fast path */
6601 if (!new && !cur)
6602 return 1;
6603
6604 tmp = SD_ATTR_INIT;
6605 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6606 new ? (new + idx_new) : &tmp,
6607 sizeof(struct sched_domain_attr));
6608}
6609
6610/*
6611 * Partition sched domains as specified by the 'ndoms_new'
6612 * cpumasks in the array doms_new[] of cpumasks. This compares
6613 * doms_new[] to the current sched domain partitioning, doms_cur[].
6614 * It destroys each deleted domain and builds each new domain.
6615 *
6616 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
6617 * The masks don't intersect (don't overlap); we set up one
6618 * sched domain for each mask. CPUs not in any of the cpumasks will
6619 * not be load balanced. If the same cpumask appears both in the
6620 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6621 * it as it is.
6622 *
6623 * The passed-in 'doms_new' should be allocated using
6624 * alloc_sched_domains(). This routine takes ownership of it and will
6625 * free_sched_domains() it when done with it. If the caller failed the
6626 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
6627 * and partition_sched_domains() will fall back to the single partition
6628 * 'fallback_doms'; this also forces the domains to be rebuilt.
6629 *
6630 * If doms_new == NULL it will be replaced with cpu_online_mask.
6631 * ndoms_new == 0 is a special case for destroying existing domains,
6632 * and it will not create the default domain.
6633 *
6634 * Call with hotplug lock held
6635 */
6636void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6637 struct sched_domain_attr *dattr_new)
6638{
6639 int i, j, n;
6640 int new_topology;
6641
6642 mutex_lock(&sched_domains_mutex);
6643
6644 /* always unregister in case we don't destroy any domains */
6645 unregister_sched_domain_sysctl();
6646
6647 /* Let architecture update cpu core mappings. */
6648 new_topology = arch_update_cpu_topology();
6649
6650 n = doms_new ? ndoms_new : 0;
6651
6652 /* Destroy deleted domains */
6653 for (i = 0; i < ndoms_cur; i++) {
6654 for (j = 0; j < n && !new_topology; j++) {
6655 if (cpumask_equal(doms_cur[i], doms_new[j])
6656 && dattrs_equal(dattr_cur, i, dattr_new, j))
6657 goto match1;
6658 }
6659 /* no match - a current sched domain not in new doms_new[] */
6660 detach_destroy_domains(doms_cur[i]);
6661match1:
6662 ;
6663 }
6664
6665 if (doms_new == NULL) {
6666 ndoms_cur = 0;
6667 doms_new = &fallback_doms;
6668 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6669 WARN_ON_ONCE(dattr_new);
6670 }
6671
6672 /* Build new domains */
6673 for (i = 0; i < ndoms_new; i++) {
6674 for (j = 0; j < ndoms_cur && !new_topology; j++) {
6675 if (cpumask_equal(doms_new[i], doms_cur[j])
6676 && dattrs_equal(dattr_new, i, dattr_cur, j))
6677 goto match2;
6678 }
6679 /* no match - add a new doms_new */
6680 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6681match2:
6682 ;
6683 }
6684
6685 /* Remember the new sched domains */
6686 if (doms_cur != &fallback_doms)
6687 free_sched_domains(doms_cur, ndoms_cur);
6688 kfree(dattr_cur); /* kfree(NULL) is safe */
6689 doms_cur = doms_new;
6690 dattr_cur = dattr_new;
6691 ndoms_cur = ndoms_new;
6692
6693 register_sched_domain_sysctl();
6694
6695 mutex_unlock(&sched_domains_mutex);
6696}
6697
6698static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
6699
6700/*
6701 * Update cpusets according to cpu_active mask. If cpusets are
6702 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
6703 * around partition_sched_domains().
6704 *
6705 * If we come here as part of a suspend/resume, don't touch cpusets because we
6706 * want to restore them to their original state upon resume anyway.
6707 */
6708static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6709 void *hcpu)
6710{
6711 switch (action) {
6712 case CPU_ONLINE_FROZEN:
6713 case CPU_DOWN_FAILED_FROZEN:
6714
6715 /*
6716 * num_cpus_frozen tracks how many CPUs are involved in the
6717 * suspend/resume sequence. As long as this is not the last online
6718 * operation in the resume sequence, just build a single sched
6719 * domain, ignoring cpusets.
6720 */
6721 num_cpus_frozen--;
6722 if (likely(num_cpus_frozen)) {
6723 partition_sched_domains(1, NULL, NULL);
6724 break;
6725 }
6726
6727 /*
6728 * This is the last CPU online operation. So fall through and
6729 * restore the original sched domains by considering the
6730 * cpuset configurations.
6731 */
6732
6733 case CPU_ONLINE:
6734 case CPU_DOWN_FAILED:
6735 cpuset_update_active_cpus(true);
6736 break;
6737 default:
6738 return NOTIFY_DONE;
6739 }
6740 return NOTIFY_OK;
6741}
6742
6743static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6744 void *hcpu)
6745{
6746 switch (action) {
6747 case CPU_DOWN_PREPARE:
6748 cpuset_update_active_cpus(false);
6749 break;
6750 case CPU_DOWN_PREPARE_FROZEN:
6751 num_cpus_frozen++;
6752 partition_sched_domains(1, NULL, NULL);
6753 break;
6754 default:
6755 return NOTIFY_DONE;
6756 }
6757 return NOTIFY_OK;
6758}
6759
6760void __init sched_init_smp(void)
6761{
6762 cpumask_var_t non_isolated_cpus;
6763
6764 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6765 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6766
6767 sched_init_numa();
6768
6769 get_online_cpus();
6770 mutex_lock(&sched_domains_mutex);
6771 init_sched_domains(cpu_active_mask);
6772 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6773 if (cpumask_empty(non_isolated_cpus))
6774 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6775 mutex_unlock(&sched_domains_mutex);
6776 put_online_cpus();
6777
6778 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6779 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6780 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6781
6782 /* RT runtime code needs to handle some hotplug events */
6783 hotcpu_notifier(update_runtime, 0);
6784
6785 init_hrtick();
6786
6787 /* Move init over to a non-isolated CPU */
6788 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6789 BUG();
6790 sched_init_granularity();
6791 free_cpumask_var(non_isolated_cpus);
6792
6793 init_sched_rt_class();
6794}
6795#else
6796void __init sched_init_smp(void)
6797{
6798 sched_init_granularity();
6799}
6800#endif /* CONFIG_SMP */
6801
6802const_debug unsigned int sysctl_timer_migration = 1;
6803
6804int in_sched_functions(unsigned long addr)
6805{
6806 return in_lock_functions(addr) ||
6807 (addr >= (unsigned long)__sched_text_start
6808 && addr < (unsigned long)__sched_text_end);
6809}
6810
6811#ifdef CONFIG_CGROUP_SCHED
6812struct task_group root_task_group;
6813LIST_HEAD(task_groups);
6814#endif
6815
6816DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
6817
6818void __init sched_init(void)
6819{
6820 int i, j;
6821 unsigned long alloc_size = 0, ptr;
6822
6823#ifdef CONFIG_FAIR_GROUP_SCHED
6824 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6825#endif
6826#ifdef CONFIG_RT_GROUP_SCHED
6827 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6828#endif
6829#ifdef CONFIG_CPUMASK_OFFSTACK
6830 alloc_size += num_possible_cpus() * cpumask_size();
6831#endif
6832 if (alloc_size) {
6833 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6834
6835#ifdef CONFIG_FAIR_GROUP_SCHED
6836 root_task_group.se = (struct sched_entity **)ptr;
6837 ptr += nr_cpu_ids * sizeof(void **);
6838
6839 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6840 ptr += nr_cpu_ids * sizeof(void **);
6841
6842#endif /* CONFIG_FAIR_GROUP_SCHED */
6843#ifdef CONFIG_RT_GROUP_SCHED
6844 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6845 ptr += nr_cpu_ids * sizeof(void **);
6846
6847 root_task_group.rt_rq = (struct rt_rq **)ptr;
6848 ptr += nr_cpu_ids * sizeof(void **);
6849
6850#endif /* CONFIG_RT_GROUP_SCHED */
6851#ifdef CONFIG_CPUMASK_OFFSTACK
6852 for_each_possible_cpu(i) {
6853 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
6854 ptr += cpumask_size();
6855 }
6856#endif /* CONFIG_CPUMASK_OFFSTACK */
6857 }
6858
6859#ifdef CONFIG_SMP
6860 init_defrootdomain();
6861#endif
6862
6863 init_rt_bandwidth(&def_rt_bandwidth,
6864 global_rt_period(), global_rt_runtime());
6865
6866#ifdef CONFIG_RT_GROUP_SCHED
6867 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6868 global_rt_period(), global_rt_runtime());
6869#endif /* CONFIG_RT_GROUP_SCHED */
6870
6871#ifdef CONFIG_CGROUP_SCHED
6872 list_add(&root_task_group.list, &task_groups);
6873 INIT_LIST_HEAD(&root_task_group.children);
6874 INIT_LIST_HEAD(&root_task_group.siblings);
6875 autogroup_init(&init_task);
6876
6877#endif /* CONFIG_CGROUP_SCHED */
6878
6879#ifdef CONFIG_CGROUP_CPUACCT
6880 root_cpuacct.cpustat = &kernel_cpustat;
6881 root_cpuacct.cpuusage = alloc_percpu(u64);
6882 /* Too early, not expected to fail */
6883 BUG_ON(!root_cpuacct.cpuusage);
6884#endif
6885 for_each_possible_cpu(i) {
6886 struct rq *rq;
6887
6888 rq = cpu_rq(i);
6889 raw_spin_lock_init(&rq->lock);
6890 rq->nr_running = 0;
6891 rq->calc_load_active = 0;
6892 rq->calc_load_update = jiffies + LOAD_FREQ;
6893 init_cfs_rq(&rq->cfs);
6894 init_rt_rq(&rq->rt, rq);
6895#ifdef CONFIG_FAIR_GROUP_SCHED
6896 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6897 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6898 /*
6899 * How much cpu bandwidth does root_task_group get?
6900 *
6901 * In case of task-groups formed through the cgroup filesystem, it
6902 * gets 100% of the cpu resources in the system. This overall
6903 * system cpu resource is divided among the tasks of
6904 * root_task_group and its child task-groups in a fair manner,
6905 * based on each entity's (task or task-group's) weight
6906 * (se->load.weight).
6907 *
6908 * In other words, if root_task_group has 10 tasks of weight
6909 * 1024 and two child groups A0 and A1 (of weight 1024 each),
6910 * then A0's share of the cpu resource is:
6911 *
6912 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
6913 *
6914 * We achieve this by letting root_task_group's tasks sit
6915 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
6916 */
6917 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6918 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6919#endif /* CONFIG_FAIR_GROUP_SCHED */
6920
6921 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6922#ifdef CONFIG_RT_GROUP_SCHED
6923 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6924 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6925#endif
6926
6927 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6928 rq->cpu_load[j] = 0;
6929
6930 rq->last_load_update_tick = jiffies;
6931
6932#ifdef CONFIG_SMP
6933 rq->sd = NULL;
6934 rq->rd = NULL;
6935 rq->cpu_power = SCHED_POWER_SCALE;
6936 rq->post_schedule = 0;
6937 rq->active_balance = 0;
6938 rq->next_balance = jiffies;
6939 rq->push_cpu = 0;
6940 rq->cpu = i;
6941 rq->online = 0;
6942 rq->idle_stamp = 0;
6943 rq->avg_idle = 2*sysctl_sched_migration_cost;
6944
6945 INIT_LIST_HEAD(&rq->cfs_tasks);
6946
6947 rq_attach_root(rq, &def_root_domain);
6948#ifdef CONFIG_NO_HZ
6949 rq->nohz_flags = 0;
6950#endif
6951#endif
6952 init_rq_hrtick(rq);
6953 atomic_set(&rq->nr_iowait, 0);
6954 }
6955
6956 set_load_weight(&init_task);
6957
6958#ifdef CONFIG_PREEMPT_NOTIFIERS
6959 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6960#endif
6961
6962#ifdef CONFIG_RT_MUTEXES
6963 plist_head_init(&init_task.pi_waiters);
6964#endif
6965
6966 /*
6967 * The boot idle thread does lazy MMU switching as well:
6968 */
6969 atomic_inc(&init_mm.mm_count);
6970 enter_lazy_tlb(&init_mm, current);
6971
6972 /*
6973 * Make us the idle thread. Technically, schedule() should not be
6974 * called from this thread; however, somewhere below it might be.
6975 * Because we are the idle thread, we just pick up running again
6976 * when this runqueue becomes "idle".
6977 */
6978 init_idle(current, smp_processor_id());
6979
6980 calc_load_update = jiffies + LOAD_FREQ;
6981
6982 /*
6983 * During early bootup we pretend to be a normal task:
6984 */
6985 current->sched_class = &fair_sched_class;
6986
6987#ifdef CONFIG_SMP
6988 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6989 /* May be allocated at isolcpus cmdline parse time */
6990 if (cpu_isolated_map == NULL)
6991 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6992 idle_thread_set_boot_cpu();
6993#endif
6994 init_sched_fair_class();
6995
6996 scheduler_running = 1;
6997}
6998
6999#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7000static inline int preempt_count_equals(int preempt_offset)
7001{
7002 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7003
7004 return (nested == preempt_offset);
7005}
7006
7007void __might_sleep(const char *file, int line, int preempt_offset)
7008{
7009 static unsigned long prev_jiffy; /* ratelimiting */
7010
7011 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
7012 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7013 system_state != SYSTEM_RUNNING || oops_in_progress)
7014 return;
7015 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7016 return;
7017 prev_jiffy = jiffies;
7018
7019 printk(KERN_ERR
7020 "BUG: sleeping function called from invalid context at %s:%d\n",
7021 file, line);
7022 printk(KERN_ERR
7023 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7024 in_atomic(), irqs_disabled(),
7025 current->pid, current->comm);
7026
7027 debug_show_held_locks(current);
7028 if (irqs_disabled())
7029 print_irqtrace_events(current);
7030 dump_stack();
7031}
7032EXPORT_SYMBOL(__might_sleep);
7033#endif
7034
7035#ifdef CONFIG_MAGIC_SYSRQ
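/*
 * Reset @p to SCHED_NORMAL, dequeueing and re-enqueueing it if it was on
 * a runqueue so the change takes effect immediately.
 */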
7036static void normalize_task(struct rq *rq, struct task_struct *p)
7037{
7038 const struct sched_class *prev_class = p->sched_class;
7039 int old_prio = p->prio;
7040 int on_rq;
7041
7042 on_rq = p->on_rq;
7043 if (on_rq)
7044 dequeue_task(rq, p, 0);
7045 __setscheduler(rq, p, SCHED_NORMAL, 0);
7046 if (on_rq) {
7047 enqueue_task(rq, p, 0);
7048 resched_task(rq->curr);
7049 }
7050
7051 check_class_changed(rq, p, prev_class, old_prio);
7052}
7053
7054void normalize_rt_tasks(void)
7055{
7056 struct task_struct *g, *p;
7057 unsigned long flags;
7058 struct rq *rq;
7059
7060 read_lock_irqsave(&tasklist_lock, flags);
7061 do_each_thread(g, p) {
7062 /*
7063 * Only normalize user tasks:
7064 */
7065 if (!p->mm)
7066 continue;
7067
7068 p->se.exec_start = 0;
7069#ifdef CONFIG_SCHEDSTATS
7070 p->se.statistics.wait_start = 0;
7071 p->se.statistics.sleep_start = 0;
7072 p->se.statistics.block_start = 0;
7073#endif
7074
7075 if (!rt_task(p)) {
7076 /*
7077 * Renice negative nice level userspace
7078 * tasks back to 0:
7079 */
7080 if (TASK_NICE(p) < 0 && p->mm)
7081 set_user_nice(p, 0);
7082 continue;
7083 }
7084
7085 raw_spin_lock(&p->pi_lock);
7086 rq = __task_rq_lock(p);
7087
7088 normalize_task(rq, p);
7089
7090 __task_rq_unlock(rq);
7091 raw_spin_unlock(&p->pi_lock);
7092 } while_each_thread(g, p);
7093
7094 read_unlock_irqrestore(&tasklist_lock, flags);
7095}
7096
7097#endif /* CONFIG_MAGIC_SYSRQ */
7098
7099#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7100/*
7101 * These functions are only useful for the IA64 MCA handling, or kdb.
7102 *
7103 * They can only be called when the whole system has been
7104 * stopped - every CPU needs to be quiescent, and no scheduling
7105 * activity can take place. Using them for anything else would
7106 * be a serious bug, and as a result, they aren't even visible
7107 * under any other configuration.
7108 */
7109
7110/**
7111 * curr_task - return the current task for a given cpu.
7112 * @cpu: the processor in question.
7113 *
7114 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7115 */
7116struct task_struct *curr_task(int cpu)
7117{
7118 return cpu_curr(cpu);
7119}
7120
7121#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7122
7123#ifdef CONFIG_IA64
7124/**
7125 * set_curr_task - set the current task for a given cpu.
7126 * @cpu: the processor in question.
7127 * @p: the task pointer to set.
7128 *
7129 * Description: This function must only be used when non-maskable interrupts
7130 * are serviced on a separate stack. It allows the architecture to switch the
7131 * notion of the current task on a cpu in a non-blocking manner. This function
7132 * must be called with all CPUs synchronized and interrupts disabled, and
7133 * the caller must save the original value of the current task (see
7134 * curr_task() above) and restore that value before reenabling interrupts and
7135 * re-starting the system.
7136 *
7137 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7138 */
7139void set_curr_task(int cpu, struct task_struct *p)
7140{
7141 cpu_curr(cpu) = p;
7142}
7143
7144#endif
7145
7146#ifdef CONFIG_CGROUP_SCHED
7147/* task_group_lock serializes the addition/removal of task groups */
7148static DEFINE_SPINLOCK(task_group_lock);
7149
7150static void free_sched_group(struct task_group *tg)
7151{
7152 free_fair_sched_group(tg);
7153 free_rt_sched_group(tg);
7154 autogroup_free(tg);
7155 kfree(tg);
7156}
7157
7158/* allocate runqueue etc for a new task group */
7159struct task_group *sched_create_group(struct task_group *parent)
7160{
7161 struct task_group *tg;
7162 unsigned long flags;
7163
7164 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7165 if (!tg)
7166 return ERR_PTR(-ENOMEM);
7167
7168 if (!alloc_fair_sched_group(tg, parent))
7169 goto err;
7170
7171 if (!alloc_rt_sched_group(tg, parent))
7172 goto err;
7173
7174 spin_lock_irqsave(&task_group_lock, flags);
7175 list_add_rcu(&tg->list, &task_groups);
7176
7177 WARN_ON(!parent); /* root should already exist */
7178
7179 tg->parent = parent;
7180 INIT_LIST_HEAD(&tg->children);
7181 list_add_rcu(&tg->siblings, &parent->children);
7182 spin_unlock_irqrestore(&task_group_lock, flags);
7183
7184 return tg;
7185
7186err:
7187 free_sched_group(tg);
7188 return ERR_PTR(-ENOMEM);
7189}
7190
7191/* rcu callback to free various structures associated with a task group */
7192static void free_sched_group_rcu(struct rcu_head *rhp)
7193{
7194 /* now it should be safe to free those cfs_rqs */
7195 free_sched_group(container_of(rhp, struct task_group, rcu));
7196}
7197
7198/* Destroy runqueue etc associated with a task group */
7199void sched_destroy_group(struct task_group *tg)
7200{
7201 unsigned long flags;
7202 int i;
7203
7204 /* end participation in shares distribution */
7205 for_each_possible_cpu(i)
7206 unregister_fair_sched_group(tg, i);
7207
7208 spin_lock_irqsave(&task_group_lock, flags);
7209 list_del_rcu(&tg->list);
7210 list_del_rcu(&tg->siblings);
7211 spin_unlock_irqrestore(&task_group_lock, flags);
7212
7213 /* wait for possible concurrent references to cfs_rqs to complete */
7214 call_rcu(&tg->rcu, free_sched_group_rcu);
7215}
7216
7217/* Change a task's runqueue when it moves between groups.
7218 * The caller of this function should have put the task in its new group
7219 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7220 * reflect its new group.
7221 */
7222void sched_move_task(struct task_struct *tsk)
7223{
7224 struct task_group *tg;
7225 int on_rq, running;
7226 unsigned long flags;
7227 struct rq *rq;
7228
7229 rq = task_rq_lock(tsk, &flags);
7230
7231 running = task_current(rq, tsk);
7232 on_rq = tsk->on_rq;
7233
7234 if (on_rq)
7235 dequeue_task(rq, tsk, 0);
7236 if (unlikely(running))
7237 tsk->sched_class->put_prev_task(rq, tsk);
7238
7239 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
7240 lockdep_is_held(&tsk->sighand->siglock)),
7241 struct task_group, css);
7242 tg = autogroup_task_group(tsk, tg);
7243 tsk->sched_task_group = tg;
7244
7245#ifdef CONFIG_FAIR_GROUP_SCHED
7246 if (tsk->sched_class->task_move_group)
7247 tsk->sched_class->task_move_group(tsk, on_rq);
7248 else
7249#endif
7250 set_task_rq(tsk, task_cpu(tsk));
7251
7252 if (unlikely(running))
7253 tsk->sched_class->set_curr_task(rq);
7254 if (on_rq)
7255 enqueue_task(rq, tsk, 0);
7256
7257 task_rq_unlock(rq, tsk, &flags);
7258}
7259#endif /* CONFIG_CGROUP_SCHED */
7260
7261#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
7262static unsigned long to_ratio(u64 period, u64 runtime)
7263{
7264 if (runtime == RUNTIME_INF)
7265 return 1ULL << 20;
7266
7267 return div64_u64(runtime << 20, period);
7268}
7269#endif
7270
7271#ifdef CONFIG_RT_GROUP_SCHED
7272/*
7273 * Ensure that the real time constraints are schedulable.
7274 */
7275static DEFINE_MUTEX(rt_constraints_mutex);
7276
7277/* Must be called with tasklist_lock held */
7278static inline int tg_has_rt_tasks(struct task_group *tg)
7279{
7280 struct task_struct *g, *p;
7281
7282 do_each_thread(g, p) {
7283 if (rt_task(p) && task_rq(p)->rt.tg == tg)
7284 return 1;
7285 } while_each_thread(g, p);
7286
7287 return 0;
7288}
7289
7290struct rt_schedulable_data {
7291 struct task_group *tg;
7292 u64 rt_period;
7293 u64 rt_runtime;
7294};
7295
7296static int tg_rt_schedulable(struct task_group *tg, void *data)
7297{
7298 struct rt_schedulable_data *d = data;
7299 struct task_group *child;
7300 unsigned long total, sum = 0;
7301 u64 period, runtime;
7302
7303 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7304 runtime = tg->rt_bandwidth.rt_runtime;
7305
7306 if (tg == d->tg) {
7307 period = d->rt_period;
7308 runtime = d->rt_runtime;
7309 }
7310
7311 /*
7312 * Cannot have more runtime than the period.
7313 */
7314 if (runtime > period && runtime != RUNTIME_INF)
7315 return -EINVAL;
7316
7317 /*
7318 * Ensure we don't starve existing RT tasks.
7319 */
7320 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7321 return -EBUSY;
7322
7323 total = to_ratio(period, runtime);
7324
7325 /*
7326 * Nobody can have more than the global setting allows.
7327 */
7328 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7329 return -EINVAL;
7330
7331 /*
7332 * The sum of our children's runtime should not exceed our own.
7333 */
7334 list_for_each_entry_rcu(child, &tg->children, siblings) {
7335 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7336 runtime = child->rt_bandwidth.rt_runtime;
7337
7338 if (child == d->tg) {
7339 period = d->rt_period;
7340 runtime = d->rt_runtime;
7341 }
7342
7343 sum += to_ratio(period, runtime);
7344 }
7345
7346 if (sum > total)
7347 return -EINVAL;
7348
7349 return 0;
7350}
7351
7352static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7353{
7354 int ret;
7355
7356 struct rt_schedulable_data data = {
7357 .tg = tg,
7358 .rt_period = period,
7359 .rt_runtime = runtime,
7360 };
7361
7362 rcu_read_lock();
7363 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7364 rcu_read_unlock();
7365
7366 return ret;
7367}
7368
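/*
 * Validate the requested RT period/runtime against the group hierarchy
 * and, if schedulable, apply it to @tg and to every per-cpu rt_rq.
 */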
7369static int tg_set_rt_bandwidth(struct task_group *tg,
7370 u64 rt_period, u64 rt_runtime)
7371{
7372 int i, err = 0;
7373
7374 mutex_lock(&rt_constraints_mutex);
7375 read_lock(&tasklist_lock);
7376 err = __rt_schedulable(tg, rt_period, rt_runtime);
7377 if (err)
7378 goto unlock;
7379
7380 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7381 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7382 tg->rt_bandwidth.rt_runtime = rt_runtime;
7383
7384 for_each_possible_cpu(i) {
7385 struct rt_rq *rt_rq = tg->rt_rq[i];
7386
7387 raw_spin_lock(&rt_rq->rt_runtime_lock);
7388 rt_rq->rt_runtime = rt_runtime;
7389 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7390 }
7391 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7392unlock:
7393 read_unlock(&tasklist_lock);
7394 mutex_unlock(&rt_constraints_mutex);
7395
7396 return err;
7397}
7398
7399int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7400{
7401 u64 rt_runtime, rt_period;
7402
7403 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7404 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7405 if (rt_runtime_us < 0)
7406 rt_runtime = RUNTIME_INF;
7407
7408 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7409}
7410
7411long sched_group_rt_runtime(struct task_group *tg)
7412{
7413 u64 rt_runtime_us;
7414
7415 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7416 return -1;
7417
7418 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7419 do_div(rt_runtime_us, NSEC_PER_USEC);
7420 return rt_runtime_us;
7421}
7422
7423int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7424{
7425 u64 rt_runtime, rt_period;
7426
7427 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7428 rt_runtime = tg->rt_bandwidth.rt_runtime;
7429
7430 if (rt_period == 0)
7431 return -EINVAL;
7432
7433 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7434}
7435
7436long sched_group_rt_period(struct task_group *tg)
7437{
7438 u64 rt_period_us;
7439
7440 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7441 do_div(rt_period_us, NSEC_PER_USEC);
7442 return rt_period_us;
7443}
7444
7445static int sched_rt_global_constraints(void)
7446{
7447 u64 runtime, period;
7448 int ret = 0;
7449
7450 if (sysctl_sched_rt_period <= 0)
7451 return -EINVAL;
7452
7453 runtime = global_rt_runtime();
7454 period = global_rt_period();
7455
7456 /*
7457 * Sanity check on the sysctl variables.
7458 */
7459 if (runtime > period && runtime != RUNTIME_INF)
7460 return -EINVAL;
7461
7462 mutex_lock(&rt_constraints_mutex);
7463 read_lock(&tasklist_lock);
7464 ret = __rt_schedulable(NULL, 0, 0);
7465 read_unlock(&tasklist_lock);
7466 mutex_unlock(&rt_constraints_mutex);
7467
7468 return ret;
7469}
7470
7471int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7472{
7473 /* Don't accept realtime tasks when there is no way for them to run */
7474 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7475 return 0;
7476
7477 return 1;
7478}
7479
7480#else /* !CONFIG_RT_GROUP_SCHED */
7481static int sched_rt_global_constraints(void)
7482{
7483 unsigned long flags;
7484 int i;
7485
7486 if (sysctl_sched_rt_period <= 0)
7487 return -EINVAL;
7488
7489 /*
7490 * There are always some RT tasks in the root group
7491 * -- migration, kstopmachine, etc.
7492 */
7493 if (sysctl_sched_rt_runtime == 0)
7494 return -EBUSY;
7495
7496 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7497 for_each_possible_cpu(i) {
7498 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7499
7500 raw_spin_lock(&rt_rq->rt_runtime_lock);
7501 rt_rq->rt_runtime = global_rt_runtime();
7502 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7503 }
7504 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7505
7506 return 0;
7507}
7508#endif /* CONFIG_RT_GROUP_SCHED */
7509
7510int sched_rt_handler(struct ctl_table *table, int write,
7511 void __user *buffer, size_t *lenp,
7512 loff_t *ppos)
7513{
7514 int ret;
7515 int old_period, old_runtime;
7516 static DEFINE_MUTEX(mutex);
7517
7518 mutex_lock(&mutex);
7519 old_period = sysctl_sched_rt_period;
7520 old_runtime = sysctl_sched_rt_runtime;
7521
7522 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7523
7524 if (!ret && write) {
7525 ret = sched_rt_global_constraints();
7526 if (ret) {
7527 sysctl_sched_rt_period = old_period;
7528 sysctl_sched_rt_runtime = old_runtime;
7529 } else {
7530 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7531 def_rt_bandwidth.rt_period =
7532 ns_to_ktime(global_rt_period());
7533 }
7534 }
7535 mutex_unlock(&mutex);
7536
7537 return ret;
7538}
7539
7540#ifdef CONFIG_CGROUP_SCHED
7541
7542/* return corresponding task_group object of a cgroup */
7543static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7544{
7545 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7546 struct task_group, css);
7547}
7548
7549static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7550{
7551 struct task_group *tg, *parent;
7552
7553 if (!cgrp->parent) {
7554 /* This is early initialization for the top cgroup */
7555 return &root_task_group.css;
7556 }
7557
7558 parent = cgroup_tg(cgrp->parent);
7559 tg = sched_create_group(parent);
7560 if (IS_ERR(tg))
7561 return ERR_PTR(-ENOMEM);
7562
7563 return &tg->css;
7564}
7565
7566static void cpu_cgroup_css_free(struct cgroup *cgrp)
7567{
7568 struct task_group *tg = cgroup_tg(cgrp);
7569
7570 sched_destroy_group(tg);
7571}
7572
7573static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7574 struct cgroup_taskset *tset)
7575{
7576 struct task_struct *task;
7577
7578 cgroup_taskset_for_each(task, cgrp, tset) {
7579#ifdef CONFIG_RT_GROUP_SCHED
7580 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7581 return -EINVAL;
7582#else
7583 /* We don't support RT-tasks being in separate groups */
7584 if (task->sched_class != &fair_sched_class)
7585 return -EINVAL;
7586#endif
7587 }
7588 return 0;
7589}
7590
7591static void cpu_cgroup_attach(struct cgroup *cgrp,
7592 struct cgroup_taskset *tset)
7593{
7594 struct task_struct *task;
7595
7596 cgroup_taskset_for_each(task, cgrp, tset)
7597 sched_move_task(task);
7598}
7599
7600static void
7601cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7602 struct task_struct *task)
7603{
7604 /*
7605 * cgroup_exit() is called in the copy_process() failure path.
7606 * Ignore this case since the task hasn't run yet; this avoids
7607 * trying to poke a half-freed task state from generic code.
7608 */
7609 if (!(task->flags & PF_EXITING))
7610 return;
7611
7612 sched_move_task(task);
7613}
7614
7615#ifdef CONFIG_FAIR_GROUP_SCHED
7616static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7617 u64 shareval)
7618{
7619 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
7620}
7621
7622static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
7623{
7624 struct task_group *tg = cgroup_tg(cgrp);
7625
7626 return (u64) scale_load_down(tg->shares);
7627}
7628
7629#ifdef CONFIG_CFS_BANDWIDTH
7630static DEFINE_MUTEX(cfs_constraints_mutex);
7631
7632const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7633const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7634
7635static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7636
7637static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7638{
7639 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7640 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7641
7642 if (tg == &root_task_group)
7643 return -EINVAL;
7644
7645 /*
7646 * Ensure we have at least some amount of bandwidth every period. This is
7647 * to prevent reaching a state of large arrears when throttled via
7648 * entity_tick() resulting in prolonged exit starvation.
7649 */
7650 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7651 return -EINVAL;
7652
7653 /*
7654 * Likewise, bound things on the other side by preventing insane quota
7655 * periods. This also allows us to normalize in computing quota
7656 * feasibility.
7657 */
7658 if (period > max_cfs_quota_period)
7659 return -EINVAL;
7660
7661 mutex_lock(&cfs_constraints_mutex);
7662 ret = __cfs_schedulable(tg, period, quota);
7663 if (ret)
7664 goto out_unlock;
7665
7666 runtime_enabled = quota != RUNTIME_INF;
7667 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7668 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7669 raw_spin_lock_irq(&cfs_b->lock);
7670 cfs_b->period = ns_to_ktime(period);
7671 cfs_b->quota = quota;
7672
7673 __refill_cfs_bandwidth_runtime(cfs_b);
7674 /* restart the period timer (if active) to handle new period expiry */
7675 if (runtime_enabled && cfs_b->timer_active) {
7676 /* force a reprogram */
7677 cfs_b->timer_active = 0;
7678 __start_cfs_bandwidth(cfs_b);
7679 }
7680 raw_spin_unlock_irq(&cfs_b->lock);
7681
7682 for_each_possible_cpu(i) {
7683 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7684 struct rq *rq = cfs_rq->rq;
7685
7686 raw_spin_lock_irq(&rq->lock);
7687 cfs_rq->runtime_enabled = runtime_enabled;
7688 cfs_rq->runtime_remaining = 0;
7689
7690 if (cfs_rq->throttled)
7691 unthrottle_cfs_rq(cfs_rq);
7692 raw_spin_unlock_irq(&rq->lock);
7693 }
7694out_unlock:
7695 mutex_unlock(&cfs_constraints_mutex);
7696
7697 return ret;
7698}
7699
7700int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7701{
7702 u64 quota, period;
7703
7704 period = ktime_to_ns(tg->cfs_bandwidth.period);
7705 if (cfs_quota_us < 0)
7706 quota = RUNTIME_INF;
7707 else
7708 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7709
7710 return tg_set_cfs_bandwidth(tg, period, quota);
7711}
7712
7713long tg_get_cfs_quota(struct task_group *tg)
7714{
7715 u64 quota_us;
7716
7717 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7718 return -1;
7719
7720 quota_us = tg->cfs_bandwidth.quota;
7721 do_div(quota_us, NSEC_PER_USEC);
7722
7723 return quota_us;
7724}
7725
7726int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7727{
7728 u64 quota, period;
7729
7730 period = (u64)cfs_period_us * NSEC_PER_USEC;
7731 quota = tg->cfs_bandwidth.quota;
7732
7733 return tg_set_cfs_bandwidth(tg, period, quota);
7734}
7735
7736long tg_get_cfs_period(struct task_group *tg)
7737{
7738 u64 cfs_period_us;
7739
7740 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7741 do_div(cfs_period_us, NSEC_PER_USEC);
7742
7743 return cfs_period_us;
7744}
7745
7746static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
7747{
7748 return tg_get_cfs_quota(cgroup_tg(cgrp));
7749}
7750
7751static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
7752 s64 cfs_quota_us)
7753{
7754 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
7755}
7756
7757static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
7758{
7759 return tg_get_cfs_period(cgroup_tg(cgrp));
7760}
7761
7762static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7763 u64 cfs_period_us)
7764{
7765 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
7766}
7767
7768struct cfs_schedulable_data {
7769 struct task_group *tg;
7770 u64 period, quota;
7771};
7772
7773/*
7774 * normalize group quota/period to be quota/max_period
7775 * note: units are usecs
7776 */
7777static u64 normalize_cfs_quota(struct task_group *tg,
7778 struct cfs_schedulable_data *d)
7779{
7780 u64 quota, period;
7781
7782 if (tg == d->tg) {
7783 period = d->period;
7784 quota = d->quota;
7785 } else {
7786 period = tg_get_cfs_period(tg);
7787 quota = tg_get_cfs_quota(tg);
7788 }
7789
7790 /* note: these should typically be equivalent */
7791 if (quota == RUNTIME_INF || quota == -1)
7792 return RUNTIME_INF;
7793
7794 return to_ratio(period, quota);
7795}
7796
7797static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7798{
7799 struct cfs_schedulable_data *d = data;
7800 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7801 s64 quota = 0, parent_quota = -1;
7802
7803 if (!tg->parent) {
7804 quota = RUNTIME_INF;
7805 } else {
7806 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7807
7808 quota = normalize_cfs_quota(tg, d);
7809 parent_quota = parent_b->hierarchal_quota;
7810
7811 /*
7812 * ensure max(child_quota) <= parent_quota, inherit when no
7813 * limit is set
7814 */
7815 if (quota == RUNTIME_INF)
7816 quota = parent_quota;
7817 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7818 return -EINVAL;
7819 }
7820 cfs_b->hierarchal_quota = quota;
7821
7822 return 0;
7823}
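/*
 * Worked example of the constraint enforced above (illustrative numbers):
 * a parent group with quota/period = 100000us/100000us normalizes to a
 * ratio of one full CPU; a child requesting 150000us/100000us (1.5 CPUs)
 * exceeds the parent's hierarchal_quota and the walk returns -EINVAL, while
 * a child left at the default quota of RUNTIME_INF simply inherits the
 * parent's ratio.
 */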
7824
7825static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7826{
7827 int ret;
7828 struct cfs_schedulable_data data = {
7829 .tg = tg,
7830 .period = period,
7831 .quota = quota,
7832 };
7833
7834 if (quota != RUNTIME_INF) {
7835 do_div(data.period, NSEC_PER_USEC);
7836 do_div(data.quota, NSEC_PER_USEC);
7837 }
7838
7839 rcu_read_lock();
7840 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7841 rcu_read_unlock();
7842
7843 return ret;
7844}
7845
7846static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7847 struct cgroup_map_cb *cb)
7848{
7849 struct task_group *tg = cgroup_tg(cgrp);
7850 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7851
7852 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7853 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7854 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7855
7856 return 0;
7857}
7858#endif /* CONFIG_CFS_BANDWIDTH */
7859#endif /* CONFIG_FAIR_GROUP_SCHED */
7860
7861#ifdef CONFIG_RT_GROUP_SCHED
7862static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
7863 s64 val)
7864{
7865 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
7866}
7867
7868static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
7869{
7870 return sched_group_rt_runtime(cgroup_tg(cgrp));
7871}
7872
7873static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7874 u64 rt_period_us)
7875{
7876 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
7877}
7878
7879static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
7880{
7881 return sched_group_rt_period(cgroup_tg(cgrp));
7882}
7883#endif /* CONFIG_RT_GROUP_SCHED */
7884
7885static struct cftype cpu_files[] = {
7886#ifdef CONFIG_FAIR_GROUP_SCHED
7887 {
7888 .name = "shares",
7889 .read_u64 = cpu_shares_read_u64,
7890 .write_u64 = cpu_shares_write_u64,
7891 },
7892#endif
7893#ifdef CONFIG_CFS_BANDWIDTH
7894 {
7895 .name = "cfs_quota_us",
7896 .read_s64 = cpu_cfs_quota_read_s64,
7897 .write_s64 = cpu_cfs_quota_write_s64,
7898 },
7899 {
7900 .name = "cfs_period_us",
7901 .read_u64 = cpu_cfs_period_read_u64,
7902 .write_u64 = cpu_cfs_period_write_u64,
7903 },
7904 {
7905 .name = "stat",
7906 .read_map = cpu_stats_show,
7907 },
7908#endif
7909#ifdef CONFIG_RT_GROUP_SCHED
7910 {
7911 .name = "rt_runtime_us",
7912 .read_s64 = cpu_rt_runtime_read,
7913 .write_s64 = cpu_rt_runtime_write,
7914 },
7915 {
7916 .name = "rt_period_us",
7917 .read_u64 = cpu_rt_period_read_uint,
7918 .write_u64 = cpu_rt_period_write_uint,
7919 },
7920#endif
7921 { } /* terminate */
7922};
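/*
 * A minimal userspace sketch of how the cfs_quota_us/cfs_period_us files
 * above are typically driven; the cgroup v1 mount point and group name are
 * assumptions for illustration. Setting quota to half of the period caps the
 * group at 50% of one CPU, handled kernel-side by tg_set_cfs_quota() and
 * tg_set_cfs_period().
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int write_str(const char *path, const char *val)
 *	{
 *		int fd = open(path, O_WRONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		if (write(fd, val, strlen(val)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return close(fd);
 *	}
 *
 *	int main(void)
 *	{
 *		write_str("/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us", "100000");
 *		write_str("/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us", "50000");
 *		return 0;
 *	}
 */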
7923
7924struct cgroup_subsys cpu_cgroup_subsys = {
7925 .name = "cpu",
7926 .css_alloc = cpu_cgroup_css_alloc,
7927 .css_free = cpu_cgroup_css_free,
7928 .can_attach = cpu_cgroup_can_attach,
7929 .attach = cpu_cgroup_attach,
7930 .exit = cpu_cgroup_exit,
7931 .subsys_id = cpu_cgroup_subsys_id,
7932 .base_cftypes = cpu_files,
7933 .early_init = 1,
7934};
7935
7936#endif /* CONFIG_CGROUP_SCHED */
7937
7938#ifdef CONFIG_CGROUP_CPUACCT
7939
7940/*
7941 * CPU accounting code for task groups.
7942 *
7943 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
7944 * (balbir@in.ibm.com).
7945 */
7946
7947struct cpuacct root_cpuacct;
7948
7949/* create a new cpu accounting group */
7950static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
7951{
7952 struct cpuacct *ca;
7953
7954 if (!cgrp->parent)
7955 return &root_cpuacct.css;
7956
7957 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
7958 if (!ca)
7959 goto out;
7960
7961 ca->cpuusage = alloc_percpu(u64);
7962 if (!ca->cpuusage)
7963 goto out_free_ca;
7964
7965 ca->cpustat = alloc_percpu(struct kernel_cpustat);
7966 if (!ca->cpustat)
7967 goto out_free_cpuusage;
7968
7969 return &ca->css;
7970
7971out_free_cpuusage:
7972 free_percpu(ca->cpuusage);
7973out_free_ca:
7974 kfree(ca);
7975out:
7976 return ERR_PTR(-ENOMEM);
7977}
7978
7979/* destroy an existing cpu accounting group */
7980static void cpuacct_css_free(struct cgroup *cgrp)
7981{
7982 struct cpuacct *ca = cgroup_ca(cgrp);
7983
7984 free_percpu(ca->cpustat);
7985 free_percpu(ca->cpuusage);
7986 kfree(ca);
7987}
7988
7989static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
7990{
7991 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
7992 u64 data;
7993
7994#ifndef CONFIG_64BIT
7995 /*
7996 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
7997 */
7998 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
7999 data = *cpuusage;
8000 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8001#else
8002 data = *cpuusage;
8003#endif
8004
8005 return data;
8006}
8007
8008static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8009{
8010 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8011
8012#ifndef CONFIG_64BIT
8013 /*
8014 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8015 */
8016 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8017 *cpuusage = val;
8018 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8019#else
8020 *cpuusage = val;
8021#endif
8022}
8023
8024/* return total cpu usage (in nanoseconds) of a group */
8025static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8026{
8027 struct cpuacct *ca = cgroup_ca(cgrp);
8028 u64 totalcpuusage = 0;
8029 int i;
8030
8031 for_each_present_cpu(i)
8032 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8033
8034 return totalcpuusage;
8035}
8036
8037static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8038 u64 reset)
8039{
8040 struct cpuacct *ca = cgroup_ca(cgrp);
8041 int err = 0;
8042 int i;
8043
8044 if (reset) {
8045 err = -EINVAL;
8046 goto out;
8047 }
8048
8049 for_each_present_cpu(i)
8050 cpuacct_cpuusage_write(ca, i, 0);
8051
8052out:
8053 return err;
8054}
8055
8056static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8057 struct seq_file *m)
8058{
8059 struct cpuacct *ca = cgroup_ca(cgroup);
8060 u64 percpu;
8061 int i;
8062
8063 for_each_present_cpu(i) {
8064 percpu = cpuacct_cpuusage_read(ca, i);
8065 seq_printf(m, "%llu ", (unsigned long long) percpu);
8066 }
8067 seq_printf(m, "\n");
8068 return 0;
8069}
8070
8071static const char *cpuacct_stat_desc[] = {
8072 [CPUACCT_STAT_USER] = "user",
8073 [CPUACCT_STAT_SYSTEM] = "system",
8074};
8075
8076static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8077 struct cgroup_map_cb *cb)
8078{
8079 struct cpuacct *ca = cgroup_ca(cgrp);
8080 int cpu;
8081 s64 val = 0;
8082
8083 for_each_online_cpu(cpu) {
8084 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8085 val += kcpustat->cpustat[CPUTIME_USER];
8086 val += kcpustat->cpustat[CPUTIME_NICE];
8087 }
8088 val = cputime64_to_clock_t(val);
8089 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8090
8091 val = 0;
8092 for_each_online_cpu(cpu) {
8093 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8094 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8095 val += kcpustat->cpustat[CPUTIME_IRQ];
8096 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8097 }
8098
8099 val = cputime64_to_clock_t(val);
8100 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8101
8102 return 0;
8103}
8104
8105static struct cftype files[] = {
8106 {
8107 .name = "usage",
8108 .read_u64 = cpuusage_read,
8109 .write_u64 = cpuusage_write,
8110 },
8111 {
8112 .name = "usage_percpu",
8113 .read_seq_string = cpuacct_percpu_seq_read,
8114 },
8115 {
8116 .name = "stat",
8117 .read_map = cpuacct_stats_show,
8118 },
8119 { } /* terminate */
8120};
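/*
 * Seen from userspace these appear as cpuacct.usage, cpuacct.usage_percpu
 * and cpuacct.stat inside a cpuacct cgroup directory (mount point is
 * configuration-dependent, e.g. /sys/fs/cgroup/cpuacct). usage and
 * usage_percpu report nanoseconds accumulated by cpuacct_charge() below;
 * stat reports user/system time converted to clock ticks via
 * cputime64_to_clock_t() above.
 */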
8121
8122/*
8123 * charge this task's execution time to its accounting group.
8124 *
8125 * called with rq->lock held.
8126 */
8127void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8128{
8129 struct cpuacct *ca;
8130 int cpu;
8131
8132 if (unlikely(!cpuacct_subsys.active))
8133 return;
8134
8135 cpu = task_cpu(tsk);
8136
8137 rcu_read_lock();
8138
8139 ca = task_ca(tsk);
8140
8141 for (; ca; ca = parent_ca(ca)) {
8142 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8143 *cpuusage += cputime;
8144 }
8145
8146 rcu_read_unlock();
8147}
8148
8149struct cgroup_subsys cpuacct_subsys = {
8150 .name = "cpuacct",
8151 .css_alloc = cpuacct_css_alloc,
8152 .css_free = cpuacct_css_free,
8153 .subsys_id = cpuacct_subsys_id,
8154 .base_cftypes = files,
8155};
8156#endif /* CONFIG_CGROUP_CPUACCT */
8157
8158void dump_cpu_task(int cpu)
8159{
8160 pr_info("Task dump for CPU %d:\n", cpu);
8161 sched_show_task(cpu_curr(cpu));
8162}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
deleted file mode 100644
index 23aa789c53e..00000000000
--- a/kernel/sched/cpupri.c
+++ /dev/null
@@ -1,240 +0,0 @@
1/*
2 * kernel/sched/cpupri.c
3 *
4 * CPU priority management
5 *
6 * Copyright (C) 2007-2008 Novell
7 *
8 * Author: Gregory Haskins <ghaskins@novell.com>
9 *
10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */
29
30#include <linux/gfp.h>
31#include "cpupri.h"
32
33/* Convert between a 140-based task->prio and our 102-based cpupri */
34static int convert_prio(int prio)
35{
36 int cpupri;
37
38 if (prio == CPUPRI_INVALID)
39 cpupri = CPUPRI_INVALID;
40 else if (prio == MAX_PRIO)
41 cpupri = CPUPRI_IDLE;
42 else if (prio >= MAX_RT_PRIO)
43 cpupri = CPUPRI_NORMAL;
44 else
45 cpupri = MAX_RT_PRIO - prio + 1;
46
47 return cpupri;
48}
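/*
 * Worked examples of the mapping above (using MAX_PRIO == 140 and
 * MAX_RT_PRIO == 100 as in this tree):
 *
 *	convert_prio(140) == CPUPRI_IDLE	(the idle task)
 *	convert_prio(120) == CPUPRI_NORMAL	(a nice-0 SCHED_NORMAL task)
 *	convert_prio(99)  == 2			(RT priority 0, index RT0)
 *	convert_prio(0)   == 101		(RT priority 99, index RT99)
 */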
49
50/**
51 * cpupri_find - find the best (lowest-pri) CPU in the system
52 * @cp: The cpupri context
53 * @p: The task
54 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
55 *
56 * Note: This function returns the recommended CPUs as calculated during the
57 * current invocation. By the time the call returns, the CPUs may have in
58 * fact changed priorities any number of times. While not ideal, it is not
59 * an issue of correctness since the normal rebalancer logic will correct
60 * any discrepancies created by racing against the uncertainty of the current
61 * priority configuration.
62 *
63 * Returns: (int)bool - CPUs were found
64 */
65int cpupri_find(struct cpupri *cp, struct task_struct *p,
66 struct cpumask *lowest_mask)
67{
68 int idx = 0;
69 int task_pri = convert_prio(p->prio);
70
71 if (task_pri >= MAX_RT_PRIO)
72 return 0;
73
74 for (idx = 0; idx < task_pri; idx++) {
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
84 * Note: This is still all racy, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
103
104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
105 continue;
106
107 if (lowest_mask) {
108 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
109
110 /*
111 * We have to ensure that we have at least one bit
112 * still set in the array, since the map could have
113 * been concurrently emptied between the first and
114 * second reads of vec->mask. If we hit this
115 * condition, simply act as though we never hit this
116 * priority level and continue on.
117 */
118 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
119 continue;
120 }
121
122 return 1;
123 }
124
125 return 0;
126}
127
128/**
129 * cpupri_set - update the cpu priority setting
130 * @cp: The cpupri context
131 * @cpu: The target cpu
132 * @newpri: The priority (INVALID-RT99) to assign to this CPU
133 *
134 * Note: Assumes cpu_rq(cpu)->lock is locked
135 *
136 * Returns: (void)
137 */
138void cpupri_set(struct cpupri *cp, int cpu, int newpri)
139{
140 int *currpri = &cp->cpu_to_pri[cpu];
141 int oldpri = *currpri;
142 int do_mb = 0;
143
144 newpri = convert_prio(newpri);
145
146 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
147
148 if (newpri == oldpri)
149 return;
150
151 /*
152 * If the cpu was currently mapped to a different value, we
153 * need to map it to the new value then remove the old value.
154 * Note, we must add the new value first, otherwise we risk the
155 * cpu being missed by the priority loop in cpupri_find.
156 */
157 if (likely(newpri != CPUPRI_INVALID)) {
158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
159
160 cpumask_set_cpu(cpu, vec->mask);
161 /*
162 * When adding a new vector, we update the mask first,
163 * do a write memory barrier, and then update the count, to
164 * make sure the vector is visible when count is set.
165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
169 }
170 if (likely(oldpri != CPUPRI_INVALID)) {
171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
172
173 /*
174 * Because the order of modification of the vec->count
175 * is important, we must make sure that the update
176 * of the new prio is seen before we decrement the
177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189 * When removing from the vector, we decrement the counter first,
190 * do a memory barrier, and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
194 cpumask_clear_cpu(cpu, vec->mask);
195 }
196
197 *currpri = newpri;
198}
199
200/**
201 * cpupri_init - initialize the cpupri structure
202 * @cp: The cpupri context
203 *
204 * Returns: -ENOMEM if memory allocation fails, 0 on success.
205 */
206int cpupri_init(struct cpupri *cp)
207{
208 int i;
209
210 memset(cp, 0, sizeof(*cp));
211
212 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
213 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
214
215 atomic_set(&vec->count, 0);
216 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
217 goto cleanup;
218 }
219
220 for_each_possible_cpu(i)
221 cp->cpu_to_pri[i] = CPUPRI_INVALID;
222 return 0;
223
224cleanup:
225 for (i--; i >= 0; i--)
226 free_cpumask_var(cp->pri_to_cpu[i].mask);
227 return -ENOMEM;
228}
229
230/**
231 * cpupri_cleanup - clean up the cpupri structure
232 * @cp: The cpupri context
233 */
234void cpupri_cleanup(struct cpupri *cp)
235{
236 int i;
237
238 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
239 free_cpumask_var(cp->pri_to_cpu[i].mask);
240}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
deleted file mode 100644
index f6d75617349..00000000000
--- a/kernel/sched/cpupri.h
+++ /dev/null
@@ -1,34 +0,0 @@
1#ifndef _LINUX_CPUPRI_H
2#define _LINUX_CPUPRI_H
3
4#include <linux/sched.h>
5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7
8#define CPUPRI_INVALID -1
9#define CPUPRI_IDLE 0
10#define CPUPRI_NORMAL 1
11/* values 2-101 are RT priorities 0-99 */
12
13struct cpupri_vec {
14 atomic_t count;
15 cpumask_var_t mask;
16};
17
18struct cpupri {
19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
20 int cpu_to_pri[NR_CPUS];
21};
22
23#ifdef CONFIG_SMP
24int cpupri_find(struct cpupri *cp,
25 struct task_struct *p, struct cpumask *lowest_mask);
26void cpupri_set(struct cpupri *cp, int cpu, int pri);
27int cpupri_init(struct cpupri *cp);
28void cpupri_cleanup(struct cpupri *cp);
29#else
30#define cpupri_set(cp, cpu, pri) do { } while (0)
31#define cpupri_init() do { } while (0)
32#endif
33
34#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
deleted file mode 100644
index 293b202fcf7..00000000000
--- a/kernel/sched/cputime.c
+++ /dev/null
@@ -1,589 +0,0 @@
1#include <linux/export.h>
2#include <linux/sched.h>
3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include "sched.h"
7
8
9#ifdef CONFIG_IRQ_TIME_ACCOUNTING
10
11/*
12 * There are no locks covering percpu hardirq/softirq time.
13 * They are only modified in vtime_account, on corresponding CPU
14 * with interrupts disabled. So, writes are safe.
15 * They are read and saved off onto struct rq in update_rq_clock().
16 * This may result in another CPU reading this CPU's irq time and can
17 * race with irq/vtime_account on this CPU. We would either get the old
18 * or the new value, with a side effect of accounting a slice of irq time to the
19 * wrong task when an irq is in progress while we read rq->clock. That is a worthy
20 * compromise in place of having locks on each irq in account_system_time.
21 */
22DEFINE_PER_CPU(u64, cpu_hardirq_time);
23DEFINE_PER_CPU(u64, cpu_softirq_time);
24
25static DEFINE_PER_CPU(u64, irq_start_time);
26static int sched_clock_irqtime;
27
28void enable_sched_clock_irqtime(void)
29{
30 sched_clock_irqtime = 1;
31}
32
33void disable_sched_clock_irqtime(void)
34{
35 sched_clock_irqtime = 0;
36}
37
38#ifndef CONFIG_64BIT
39DEFINE_PER_CPU(seqcount_t, irq_time_seq);
40#endif /* CONFIG_64BIT */
41
42/*
43 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit.
45 */
46void irqtime_account_irq(struct task_struct *curr)
47{
48 unsigned long flags;
49 s64 delta;
50 int cpu;
51
52 if (!sched_clock_irqtime)
53 return;
54
55 local_irq_save(flags);
56
57 cpu = smp_processor_id();
58 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
59 __this_cpu_add(irq_start_time, delta);
60
61 irq_time_write_begin();
62 /*
63 * We do not account for softirq time from ksoftirqd here.
64 * We want to continue accounting softirq time to the ksoftirqd thread
65 * in that case, so as not to confuse the scheduler with a special task
66 * that does not consume any time but still wants to run.
67 */
68 if (hardirq_count())
69 __this_cpu_add(cpu_hardirq_time, delta);
70 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
71 __this_cpu_add(cpu_softirq_time, delta);
72
73 irq_time_write_end();
74 local_irq_restore(flags);
75}
76EXPORT_SYMBOL_GPL(irqtime_account_irq);
77
78static int irqtime_account_hi_update(void)
79{
80 u64 *cpustat = kcpustat_this_cpu->cpustat;
81 unsigned long flags;
82 u64 latest_ns;
83 int ret = 0;
84
85 local_irq_save(flags);
86 latest_ns = this_cpu_read(cpu_hardirq_time);
87 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
88 ret = 1;
89 local_irq_restore(flags);
90 return ret;
91}
92
93static int irqtime_account_si_update(void)
94{
95 u64 *cpustat = kcpustat_this_cpu->cpustat;
96 unsigned long flags;
97 u64 latest_ns;
98 int ret = 0;
99
100 local_irq_save(flags);
101 latest_ns = this_cpu_read(cpu_softirq_time);
102 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
103 ret = 1;
104 local_irq_restore(flags);
105 return ret;
106}
107
108#else /* CONFIG_IRQ_TIME_ACCOUNTING */
109
110#define sched_clock_irqtime (0)
111
112#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
113
114static inline void task_group_account_field(struct task_struct *p, int index,
115 u64 tmp)
116{
117#ifdef CONFIG_CGROUP_CPUACCT
118 struct kernel_cpustat *kcpustat;
119 struct cpuacct *ca;
120#endif
121 /*
122 * Since all updates are sure to touch the root cgroup, we
123 * get ourselves ahead and touch it first. If the root cgroup
124 * is the only cgroup, then nothing else should be necessary.
125 *
126 */
127 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
128
129#ifdef CONFIG_CGROUP_CPUACCT
130 if (unlikely(!cpuacct_subsys.active))
131 return;
132
133 rcu_read_lock();
134 ca = task_ca(p);
135 while (ca && (ca != &root_cpuacct)) {
136 kcpustat = this_cpu_ptr(ca->cpustat);
137 kcpustat->cpustat[index] += tmp;
138 ca = parent_ca(ca);
139 }
140 rcu_read_unlock();
141#endif
142}
143
144/*
145 * Account user cpu time to a process.
146 * @p: the process that the cpu time gets accounted to
147 * @cputime: the cpu time spent in user space since the last update
148 * @cputime_scaled: cputime scaled by cpu frequency
149 */
150void account_user_time(struct task_struct *p, cputime_t cputime,
151 cputime_t cputime_scaled)
152{
153 int index;
154
155 /* Add user time to process. */
156 p->utime += cputime;
157 p->utimescaled += cputime_scaled;
158 account_group_user_time(p, cputime);
159
160 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
161
162 /* Add user time to cpustat. */
163 task_group_account_field(p, index, (__force u64) cputime);
164
165 /* Account for user time used */
166 acct_update_integrals(p);
167}
168
169/*
170 * Account guest cpu time to a process.
171 * @p: the process that the cpu time gets accounted to
172 * @cputime: the cpu time spent in virtual machine since the last update
173 * @cputime_scaled: cputime scaled by cpu frequency
174 */
175static void account_guest_time(struct task_struct *p, cputime_t cputime,
176 cputime_t cputime_scaled)
177{
178 u64 *cpustat = kcpustat_this_cpu->cpustat;
179
180 /* Add guest time to process. */
181 p->utime += cputime;
182 p->utimescaled += cputime_scaled;
183 account_group_user_time(p, cputime);
184 p->gtime += cputime;
185
186 /* Add guest time to cpustat. */
187 if (TASK_NICE(p) > 0) {
188 cpustat[CPUTIME_NICE] += (__force u64) cputime;
189 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
190 } else {
191 cpustat[CPUTIME_USER] += (__force u64) cputime;
192 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
193 }
194}
195
196/*
197 * Account system cpu time to a process and desired cpustat field
198 * @p: the process that the cpu time gets accounted to
199 * @cputime: the cpu time spent in kernel space since the last update
200 * @cputime_scaled: cputime scaled by cpu frequency
201 * @index: index of the cpustat field that has to be updated
202 */
203static inline
204void __account_system_time(struct task_struct *p, cputime_t cputime,
205 cputime_t cputime_scaled, int index)
206{
207 /* Add system time to process. */
208 p->stime += cputime;
209 p->stimescaled += cputime_scaled;
210 account_group_system_time(p, cputime);
211
212 /* Add system time to cpustat. */
213 task_group_account_field(p, index, (__force u64) cputime);
214
215 /* Account for system time used */
216 acct_update_integrals(p);
217}
218
219/*
220 * Account system cpu time to a process.
221 * @p: the process that the cpu time gets accounted to
222 * @hardirq_offset: the offset to subtract from hardirq_count()
223 * @cputime: the cpu time spent in kernel space since the last update
224 * @cputime_scaled: cputime scaled by cpu frequency
225 */
226void account_system_time(struct task_struct *p, int hardirq_offset,
227 cputime_t cputime, cputime_t cputime_scaled)
228{
229 int index;
230
231 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
232 account_guest_time(p, cputime, cputime_scaled);
233 return;
234 }
235
236 if (hardirq_count() - hardirq_offset)
237 index = CPUTIME_IRQ;
238 else if (in_serving_softirq())
239 index = CPUTIME_SOFTIRQ;
240 else
241 index = CPUTIME_SYSTEM;
242
243 __account_system_time(p, cputime, cputime_scaled, index);
244}
245
246/*
247 * Account for involuntary wait time.
248 * @cputime: the cpu time spent in involuntary wait
249 */
250void account_steal_time(cputime_t cputime)
251{
252 u64 *cpustat = kcpustat_this_cpu->cpustat;
253
254 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
255}
256
257/*
258 * Account for idle time.
259 * @cputime: the cpu time spent in idle wait
260 */
261void account_idle_time(cputime_t cputime)
262{
263 u64 *cpustat = kcpustat_this_cpu->cpustat;
264 struct rq *rq = this_rq();
265
266 if (atomic_read(&rq->nr_iowait) > 0)
267 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
268 else
269 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
270}
271
272static __always_inline bool steal_account_process_tick(void)
273{
274#ifdef CONFIG_PARAVIRT
275 if (static_key_false(&paravirt_steal_enabled)) {
276 u64 steal, st = 0;
277
278 steal = paravirt_steal_clock(smp_processor_id());
279 steal -= this_rq()->prev_steal_time;
280
281 st = steal_ticks(steal);
282 this_rq()->prev_steal_time += st * TICK_NSEC;
283
284 account_steal_time(st);
285 return st;
286 }
287#endif
288 return false;
289}
290
291/*
292 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
293 * tasks (sum on group iteration) belonging to @tsk's group.
294 */
295void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
296{
297 struct signal_struct *sig = tsk->signal;
298 struct task_struct *t;
299
300 times->utime = sig->utime;
301 times->stime = sig->stime;
302 times->sum_exec_runtime = sig->sum_sched_runtime;
303
304 rcu_read_lock();
305 /* make sure we can trust tsk->thread_group list */
306 if (!likely(pid_alive(tsk)))
307 goto out;
308
309 t = tsk;
310 do {
311 times->utime += t->utime;
312 times->stime += t->stime;
313 times->sum_exec_runtime += task_sched_runtime(t);
314 } while_each_thread(tsk, t);
315out:
316 rcu_read_unlock();
317}
318
319#ifndef CONFIG_VIRT_CPU_ACCOUNTING
320
321#ifdef CONFIG_IRQ_TIME_ACCOUNTING
322/*
323 * Account a tick to a process and cpustat
324 * @p: the process that the cpu time gets accounted to
325 * @user_tick: is the tick from userspace
326 * @rq: the pointer to rq
327 *
328 * Tick demultiplexing follows the order
329 * - pending hardirq update
330 * - pending softirq update
331 * - user_time
332 * - idle_time
333 * - system time
334 * - check for guest_time
335 * - else account as system_time
336 *
337 * The check for hardirq is done both for system and user time as there is
338 * no timer going off while we are on a hardirq and hence we may never get an
339 * opportunity to update it solely in system time.
340 * p->stime and friends are only updated on system time and not on irq/
341 * softirq as those no longer count in task exec_runtime.
342 */
343static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
344 struct rq *rq)
345{
346 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
347 u64 *cpustat = kcpustat_this_cpu->cpustat;
348
349 if (steal_account_process_tick())
350 return;
351
352 if (irqtime_account_hi_update()) {
353 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
354 } else if (irqtime_account_si_update()) {
355 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
356 } else if (this_cpu_ksoftirqd() == p) {
357 /*
358 * ksoftirqd time does not get accounted in cpu_softirq_time.
359 * So, we have to handle it separately here.
360 * Also, p->stime needs to be updated for ksoftirqd.
361 */
362 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
363 CPUTIME_SOFTIRQ);
364 } else if (user_tick) {
365 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
366 } else if (p == rq->idle) {
367 account_idle_time(cputime_one_jiffy);
368 } else if (p->flags & PF_VCPU) { /* System time or guest time */
369 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
370 } else {
371 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
372 CPUTIME_SYSTEM);
373 }
374}
375
376static void irqtime_account_idle_ticks(int ticks)
377{
378 int i;
379 struct rq *rq = this_rq();
380
381 for (i = 0; i < ticks; i++)
382 irqtime_account_process_tick(current, 0, rq);
383}
384#else /* CONFIG_IRQ_TIME_ACCOUNTING */
385static void irqtime_account_idle_ticks(int ticks) {}
386static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
387 struct rq *rq) {}
388#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
389
390/*
391 * Account a single tick of cpu time.
392 * @p: the process that the cpu time gets accounted to
393 * @user_tick: indicates if the tick is a user or a system tick
394 */
395void account_process_tick(struct task_struct *p, int user_tick)
396{
397 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
398 struct rq *rq = this_rq();
399
400 if (sched_clock_irqtime) {
401 irqtime_account_process_tick(p, user_tick, rq);
402 return;
403 }
404
405 if (steal_account_process_tick())
406 return;
407
408 if (user_tick)
409 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
410 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
411 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
412 one_jiffy_scaled);
413 else
414 account_idle_time(cputime_one_jiffy);
415}
416
417/*
418 * Account multiple ticks of steal time.
419 * (steal time is charged to cpustat, not to any particular task)
420 * @ticks: number of stolen ticks
421 */
422void account_steal_ticks(unsigned long ticks)
423{
424 account_steal_time(jiffies_to_cputime(ticks));
425}
426
427/*
428 * Account multiple ticks of idle time.
429 * @ticks: number of idle ticks
430 */
431void account_idle_ticks(unsigned long ticks)
432{
433
434 if (sched_clock_irqtime) {
435 irqtime_account_idle_ticks(ticks);
436 return;
437 }
438
439 account_idle_time(jiffies_to_cputime(ticks));
440}
441
442#endif
443
444/*
445 * Use precise platform statistics if available:
446 */
447#ifdef CONFIG_VIRT_CPU_ACCOUNTING
448void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
449{
450 *ut = p->utime;
451 *st = p->stime;
452}
453
454void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
455{
456 struct task_cputime cputime;
457
458 thread_group_cputime(p, &cputime);
459
460 *ut = cputime.utime;
461 *st = cputime.stime;
462}
463
464void vtime_account_system_irqsafe(struct task_struct *tsk)
465{
466 unsigned long flags;
467
468 local_irq_save(flags);
469 vtime_account_system(tsk);
470 local_irq_restore(flags);
471}
472EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
473
474#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
475void vtime_task_switch(struct task_struct *prev)
476{
477 if (is_idle_task(prev))
478 vtime_account_idle(prev);
479 else
480 vtime_account_system(prev);
481
482 vtime_account_user(prev);
483 arch_vtime_task_switch(prev);
484}
485#endif
486
487/*
488 * Archs that account the whole time spent in the idle task
489 * (outside irq) as idle time can rely on this and just implement
490 * vtime_account_system() and vtime_account_idle(). Archs that
491 * define idle time differently (s390 only includes the
492 * time spent by the CPU when it's in low power mode) must override
493 * vtime_account().
494 */
495#ifndef __ARCH_HAS_VTIME_ACCOUNT
496void vtime_account(struct task_struct *tsk)
497{
498 if (in_interrupt() || !is_idle_task(tsk))
499 vtime_account_system(tsk);
500 else
501 vtime_account_idle(tsk);
502}
503EXPORT_SYMBOL_GPL(vtime_account);
504#endif /* __ARCH_HAS_VTIME_ACCOUNT */
505
506#else
507
508#ifndef nsecs_to_cputime
509# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
510#endif
511
512static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
513{
514 u64 temp = (__force u64) rtime;
515
516 temp *= (__force u64) utime;
517
518 if (sizeof(cputime_t) == 4)
519 temp = div_u64(temp, (__force u32) total);
520 else
521 temp = div64_u64(temp, (__force u64) total);
522
523 return (__force cputime_t) temp;
524}
525
526/*
527 * Adjust the imprecise tick-based cputime statistics against the
528 * scheduler's precise runtime accounting.
529 */
530static void cputime_adjust(struct task_cputime *curr,
531 struct cputime *prev,
532 cputime_t *ut, cputime_t *st)
533{
534 cputime_t rtime, utime, total;
535
536 utime = curr->utime;
537 total = utime + curr->stime;
538
539 /*
540 * Tick based cputime accounting depends on whether a task's random
541 * scheduling timeslices happen to be interrupted by the timer or not.
542 * Depending on these circumstances, the number of these interrupts
543 * may be over- or under-estimated, matching the real user and system
544 * cputime only with a variable precision.
545 *
546 * Fix this by scaling these tick based values against the total
547 * runtime accounted by the CFS scheduler.
548 */
549 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
550
551 if (total)
552 utime = scale_utime(utime, rtime, total);
553 else
554 utime = rtime;
555
556 /*
557 * If the tick based count grows faster than the scheduler one,
558 * the result of the scaling may go backward.
559 * Let's enforce monotonicity.
560 */
561 prev->utime = max(prev->utime, utime);
562 prev->stime = max(prev->stime, rtime - prev->utime);
563
564 *ut = prev->utime;
565 *st = prev->stime;
566}
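/*
 * Worked example of the adjustment above (illustrative numbers): with tick
 * based samples utime = 30 and stime = 10 (total = 40) but a CFS runtime of
 * rtime = 60, scale_utime() gives utime = 60 * 30 / 40 = 45 and stime becomes
 * rtime - utime = 15, preserving the sampled 3:1 user/system ratio while
 * matching the scheduler's precise total.
 */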
567
568void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
569{
570 struct task_cputime cputime = {
571 .utime = p->utime,
572 .stime = p->stime,
573 .sum_exec_runtime = p->se.sum_exec_runtime,
574 };
575
576 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
577}
578
579/*
580 * Must be called with siglock held.
581 */
582void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
583{
584 struct task_cputime cputime;
585
586 thread_group_cputime(p, &cputime);
587 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
588}
589#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
deleted file mode 100644
index 2cd3c1b4e58..00000000000
--- a/kernel/sched/debug.c
+++ /dev/null
@@ -1,531 +0,0 @@
1/*
2 * kernel/sched/debug.c
3 *
4 * Print the CFS rbtree
5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched.h>
15#include <linux/seq_file.h>
16#include <linux/kallsyms.h>
17#include <linux/utsname.h>
18
19#include "sched.h"
20
21static DEFINE_SPINLOCK(sched_debug_lock);
22
23/*
24 * This allows printing both to /proc/sched_debug and
25 * to the console
26 */
27#define SEQ_printf(m, x...) \
28 do { \
29 if (m) \
30 seq_printf(m, x); \
31 else \
32 printk(x); \
33 } while (0)
34
35/*
36 * Ease the printing of nsec fields:
37 */
38static long long nsec_high(unsigned long long nsec)
39{
40 if ((long long)nsec < 0) {
41 nsec = -nsec;
42 do_div(nsec, 1000000);
43 return -nsec;
44 }
45 do_div(nsec, 1000000);
46
47 return nsec;
48}
49
50static unsigned long nsec_low(unsigned long long nsec)
51{
52 if ((long long)nsec < 0)
53 nsec = -nsec;
54
55 return do_div(nsec, 1000000);
56}
57
58#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
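/*
 * Example: for nsec == 1234567890 (~1.23s), nsec_high() returns 1234 and
 * nsec_low() returns 567890, so a "%Ld.%06ld" format with SPLIT_NS() prints
 * "1234.567890", i.e. nanoseconds rendered as milliseconds with six decimal
 * places.
 */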
59
60#ifdef CONFIG_FAIR_GROUP_SCHED
61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
62{
63 struct sched_entity *se = tg->se[cpu];
64
65#define P(F) \
66 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
67#define PN(F) \
68 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
69
70 if (!se) {
71 struct sched_avg *avg = &cpu_rq(cpu)->avg;
72 P(avg->runnable_avg_sum);
73 P(avg->runnable_avg_period);
74 return;
75 }
76
77
78 PN(se->exec_start);
79 PN(se->vruntime);
80 PN(se->sum_exec_runtime);
81#ifdef CONFIG_SCHEDSTATS
82 PN(se->statistics.wait_start);
83 PN(se->statistics.sleep_start);
84 PN(se->statistics.block_start);
85 PN(se->statistics.sleep_max);
86 PN(se->statistics.block_max);
87 PN(se->statistics.exec_max);
88 PN(se->statistics.slice_max);
89 PN(se->statistics.wait_max);
90 PN(se->statistics.wait_sum);
91 P(se->statistics.wait_count);
92#endif
93 P(se->load.weight);
94#ifdef CONFIG_SMP
95 P(se->avg.runnable_avg_sum);
96 P(se->avg.runnable_avg_period);
97 P(se->avg.load_avg_contrib);
98 P(se->avg.decay_count);
99#endif
100#undef PN
101#undef P
102}
103#endif
104
105#ifdef CONFIG_CGROUP_SCHED
106static char group_path[PATH_MAX];
107
108static char *task_group_path(struct task_group *tg)
109{
110 if (autogroup_path(tg, group_path, PATH_MAX))
111 return group_path;
112
113 /*
114 * May be NULL if the underlying cgroup isn't fully-created yet
115 */
116 if (!tg->css.cgroup) {
117 group_path[0] = '\0';
118 return group_path;
119 }
120 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
121 return group_path;
122}
123#endif
124
125static void
126print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
127{
128 if (rq->curr == p)
129 SEQ_printf(m, "R");
130 else
131 SEQ_printf(m, " ");
132
133 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
134 p->comm, p->pid,
135 SPLIT_NS(p->se.vruntime),
136 (long long)(p->nvcsw + p->nivcsw),
137 p->prio);
138#ifdef CONFIG_SCHEDSTATS
139 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
140 SPLIT_NS(p->se.vruntime),
141 SPLIT_NS(p->se.sum_exec_runtime),
142 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
143#else
144 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
145 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
146#endif
147#ifdef CONFIG_CGROUP_SCHED
148 SEQ_printf(m, " %s", task_group_path(task_group(p)));
149#endif
150
151 SEQ_printf(m, "\n");
152}
153
154static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
155{
156 struct task_struct *g, *p;
157 unsigned long flags;
158
159 SEQ_printf(m,
160 "\nrunnable tasks:\n"
161 " task PID tree-key switches prio"
162 " exec-runtime sum-exec sum-sleep\n"
163 "------------------------------------------------------"
164 "----------------------------------------------------\n");
165
166 read_lock_irqsave(&tasklist_lock, flags);
167
168 do_each_thread(g, p) {
169 if (!p->on_rq || task_cpu(p) != rq_cpu)
170 continue;
171
172 print_task(m, rq, p);
173 } while_each_thread(g, p);
174
175 read_unlock_irqrestore(&tasklist_lock, flags);
176}
177
178void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
179{
180 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
181 spread, rq0_min_vruntime, spread0;
182 struct rq *rq = cpu_rq(cpu);
183 struct sched_entity *last;
184 unsigned long flags;
185
186#ifdef CONFIG_FAIR_GROUP_SCHED
187 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
188#else
189 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
190#endif
191 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
192 SPLIT_NS(cfs_rq->exec_clock));
193
194 raw_spin_lock_irqsave(&rq->lock, flags);
195 if (cfs_rq->rb_leftmost)
196 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
197 last = __pick_last_entity(cfs_rq);
198 if (last)
199 max_vruntime = last->vruntime;
200 min_vruntime = cfs_rq->min_vruntime;
201 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
202 raw_spin_unlock_irqrestore(&rq->lock, flags);
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
204 SPLIT_NS(MIN_vruntime));
205 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
206 SPLIT_NS(min_vruntime));
207 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
208 SPLIT_NS(max_vruntime));
209 spread = max_vruntime - MIN_vruntime;
210 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
211 SPLIT_NS(spread));
212 spread0 = min_vruntime - rq0_min_vruntime;
213 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
214 SPLIT_NS(spread0));
215 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
216 cfs_rq->nr_spread_over);
217 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
218 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
219#ifdef CONFIG_FAIR_GROUP_SCHED
220#ifdef CONFIG_SMP
221 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
222 cfs_rq->runnable_load_avg);
223 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
224 cfs_rq->blocked_load_avg);
225 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
226 atomic64_read(&cfs_rq->tg->load_avg));
227 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
228 cfs_rq->tg_load_contrib);
229 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
230 cfs_rq->tg_runnable_contrib);
231 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
232 atomic_read(&cfs_rq->tg->runnable_avg));
233#endif
234
235 print_cfs_group_stats(m, cpu, cfs_rq->tg);
236#endif
237}
238
239void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
240{
241#ifdef CONFIG_RT_GROUP_SCHED
242 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
243#else
244 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
245#endif
246
247#define P(x) \
248 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
249#define PN(x) \
250 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
251
252 P(rt_nr_running);
253 P(rt_throttled);
254 PN(rt_time);
255 PN(rt_runtime);
256
257#undef PN
258#undef P
259}
260
261extern __read_mostly int sched_clock_running;
262
263static void print_cpu(struct seq_file *m, int cpu)
264{
265 struct rq *rq = cpu_rq(cpu);
266 unsigned long flags;
267
268#ifdef CONFIG_X86
269 {
270 unsigned int freq = cpu_khz ? : 1;
271
272 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
273 cpu, freq / 1000, (freq % 1000));
274 }
275#else
276 SEQ_printf(m, "\ncpu#%d\n", cpu);
277#endif
278
279#define P(x) \
280do { \
281 if (sizeof(rq->x) == 4) \
282 SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
283 else \
284 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
285} while (0)
286
287#define PN(x) \
288 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
289
290 P(nr_running);
291 SEQ_printf(m, " .%-30s: %lu\n", "load",
292 rq->load.weight);
293 P(nr_switches);
294 P(nr_load_updates);
295 P(nr_uninterruptible);
296 PN(next_balance);
297 P(curr->pid);
298 PN(clock);
299 P(cpu_load[0]);
300 P(cpu_load[1]);
301 P(cpu_load[2]);
302 P(cpu_load[3]);
303 P(cpu_load[4]);
304#undef P
305#undef PN
306
307#ifdef CONFIG_SCHEDSTATS
308#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
309#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
310
311 P(yld_count);
312
313 P(sched_count);
314 P(sched_goidle);
315#ifdef CONFIG_SMP
316 P64(avg_idle);
317#endif
318
319 P(ttwu_count);
320 P(ttwu_local);
321
322#undef P
323#undef P64
324#endif
325 spin_lock_irqsave(&sched_debug_lock, flags);
326 print_cfs_stats(m, cpu);
327 print_rt_stats(m, cpu);
328
329 rcu_read_lock();
330 print_rq(m, rq, cpu);
331 rcu_read_unlock();
332 spin_unlock_irqrestore(&sched_debug_lock, flags);
333}
334
335static const char *sched_tunable_scaling_names[] = {
336 "none",
337 "logarithmic",
338 "linear"
339};
340
341static int sched_debug_show(struct seq_file *m, void *v)
342{
343 u64 ktime, sched_clk, cpu_clk;
344 unsigned long flags;
345 int cpu;
346
347 local_irq_save(flags);
348 ktime = ktime_to_ns(ktime_get());
349 sched_clk = sched_clock();
350 cpu_clk = local_clock();
351 local_irq_restore(flags);
352
353 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
354 init_utsname()->release,
355 (int)strcspn(init_utsname()->version, " "),
356 init_utsname()->version);
357
358#define P(x) \
359 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
360#define PN(x) \
361 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
362 PN(ktime);
363 PN(sched_clk);
364 PN(cpu_clk);
365 P(jiffies);
366#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
367 P(sched_clock_stable);
368#endif
369#undef PN
370#undef P
371
372 SEQ_printf(m, "\n");
373 SEQ_printf(m, "sysctl_sched\n");
374
375#define P(x) \
376 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
377#define PN(x) \
378 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
379 PN(sysctl_sched_latency);
380 PN(sysctl_sched_min_granularity);
381 PN(sysctl_sched_wakeup_granularity);
382 P(sysctl_sched_child_runs_first);
383 P(sysctl_sched_features);
384#undef PN
385#undef P
386
387 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
388 sysctl_sched_tunable_scaling,
389 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
390
391 for_each_online_cpu(cpu)
392 print_cpu(m, cpu);
393
394 SEQ_printf(m, "\n");
395
396 return 0;
397}
398
399void sysrq_sched_debug_show(void)
400{
401 sched_debug_show(NULL, NULL);
402}
403
404static int sched_debug_open(struct inode *inode, struct file *filp)
405{
406 return single_open(filp, sched_debug_show, NULL);
407}
408
409static const struct file_operations sched_debug_fops = {
410 .open = sched_debug_open,
411 .read = seq_read,
412 .llseek = seq_lseek,
413 .release = single_release,
414};
415
416static int __init init_sched_debug_procfs(void)
417{
418 struct proc_dir_entry *pe;
419
420 pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
421 if (!pe)
422 return -ENOMEM;
423 return 0;
424}
425
426__initcall(init_sched_debug_procfs);
427
428void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
429{
430 unsigned long nr_switches;
431
432 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
433 get_nr_threads(p));
434 SEQ_printf(m,
435 "---------------------------------------------------------\n");
436#define __P(F) \
437 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
438#define P(F) \
439 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
440#define __PN(F) \
441 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
442#define PN(F) \
443 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
444
445 PN(se.exec_start);
446 PN(se.vruntime);
447 PN(se.sum_exec_runtime);
448
449 nr_switches = p->nvcsw + p->nivcsw;
450
451#ifdef CONFIG_SCHEDSTATS
452 PN(se.statistics.wait_start);
453 PN(se.statistics.sleep_start);
454 PN(se.statistics.block_start);
455 PN(se.statistics.sleep_max);
456 PN(se.statistics.block_max);
457 PN(se.statistics.exec_max);
458 PN(se.statistics.slice_max);
459 PN(se.statistics.wait_max);
460 PN(se.statistics.wait_sum);
461 P(se.statistics.wait_count);
462 PN(se.statistics.iowait_sum);
463 P(se.statistics.iowait_count);
464 P(se.nr_migrations);
465 P(se.statistics.nr_migrations_cold);
466 P(se.statistics.nr_failed_migrations_affine);
467 P(se.statistics.nr_failed_migrations_running);
468 P(se.statistics.nr_failed_migrations_hot);
469 P(se.statistics.nr_forced_migrations);
470 P(se.statistics.nr_wakeups);
471 P(se.statistics.nr_wakeups_sync);
472 P(se.statistics.nr_wakeups_migrate);
473 P(se.statistics.nr_wakeups_local);
474 P(se.statistics.nr_wakeups_remote);
475 P(se.statistics.nr_wakeups_affine);
476 P(se.statistics.nr_wakeups_affine_attempts);
477 P(se.statistics.nr_wakeups_passive);
478 P(se.statistics.nr_wakeups_idle);
479
480 {
481 u64 avg_atom, avg_per_cpu;
482
483 avg_atom = p->se.sum_exec_runtime;
484 if (nr_switches)
485 do_div(avg_atom, nr_switches);
486 else
487 avg_atom = -1LL;
488
489 avg_per_cpu = p->se.sum_exec_runtime;
490 if (p->se.nr_migrations) {
491 avg_per_cpu = div64_u64(avg_per_cpu,
492 p->se.nr_migrations);
493 } else {
494 avg_per_cpu = -1LL;
495 }
496
497 __PN(avg_atom);
498 __PN(avg_per_cpu);
499 }
500#endif
501 __P(nr_switches);
502 SEQ_printf(m, "%-35s:%21Ld\n",
503 "nr_voluntary_switches", (long long)p->nvcsw);
504 SEQ_printf(m, "%-35s:%21Ld\n",
505 "nr_involuntary_switches", (long long)p->nivcsw);
506
507 P(se.load.weight);
508 P(policy);
509 P(prio);
510#undef PN
511#undef __PN
512#undef P
513#undef __P
514
515 {
516 unsigned int this_cpu = raw_smp_processor_id();
517 u64 t0, t1;
518
519 t0 = cpu_clock(this_cpu);
520 t1 = cpu_clock(this_cpu);
521 SEQ_printf(m, "%-35s:%21Ld\n",
522 "clock-delta", (long long)(t1-t0));
523 }
524}
525
526void proc_sched_set_task(struct task_struct *p)
527{
528#ifdef CONFIG_SCHEDSTATS
529 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
530#endif
531}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
deleted file mode 100644
index 5eea8707234..00000000000
--- a/kernel/sched/fair.c
+++ /dev/null
@@ -1,6174 +0,0 @@
1/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21 */
22
23#include <linux/latencytop.h>
24#include <linux/sched.h>
25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29#include <linux/mempolicy.h>
30#include <linux/migrate.h>
31#include <linux/task_work.h>
32
33#include <trace/events/sched.h>
34
35#include "sched.h"
36
37/*
38 * Targeted preemption latency for CPU-bound tasks:
39 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
40 *
41 * NOTE: this latency value is not the same as the concept of
42 * 'timeslice length' - timeslices in CFS are of variable length
43 * and have no persistent notion like in traditional, time-slice
44 * based scheduling concepts.
45 *
46 * (to see the precise effective timeslice length of your workload,
47 * run vmstat and monitor the context-switches (cs) field)
48 */
49unsigned int sysctl_sched_latency = 6000000ULL;
50unsigned int normalized_sysctl_sched_latency = 6000000ULL;
51
52/*
53 * The initial- and re-scaling of tunables is configurable
54 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
55 *
56 * Options are:
57 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
58 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
59 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
60 */
61enum sched_tunable_scaling sysctl_sched_tunable_scaling
62 = SCHED_TUNABLESCALING_LOG;
63
64/*
65 * Minimal preemption granularity for CPU-bound tasks:
66 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 */
68unsigned int sysctl_sched_min_granularity = 750000ULL;
69unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
70
71/*
72 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
73 */
74static unsigned int sched_nr_latency = 8;
75
76/*
77 * After fork, child runs first. If set to 0 (default) then
78 * parent will (try to) run first.
79 */
80unsigned int sysctl_sched_child_runs_first __read_mostly;
81
82/*
83 * SCHED_OTHER wake-up granularity.
84 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
85 *
86 * This option delays the preemption effects of decoupled workloads
87 * and reduces their over-scheduling. Synchronous workloads will still
88 * have immediate wakeup/sleep latencies.
89 */
90unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
91unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
92
93const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
94
95/*
96 * The exponential sliding window over which load is averaged for shares
97 * distribution.
98 * (default: 10msec)
99 */
100unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
101
102#ifdef CONFIG_CFS_BANDWIDTH
103/*
104 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
105 * each time a cfs_rq requests quota.
106 *
107 * Note: in the case that the slice exceeds the runtime remaining (either due
108 * to consumption or the quota being specified to be smaller than the slice)
109 * we will always only issue the remaining available time.
110 *
111 * default: 5 msec, units: microseconds
112 */
113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114#endif
115
116/*
117 * Increase the granularity value when there are more CPUs,
118 * because with more CPUs the 'effective latency' as visible
119 * to users decreases. But the relationship is not linear,
120 * so pick a second-best guess by going with the log2 of the
121 * number of CPUs.
122 *
123 * This idea comes from the SD scheduler of Con Kolivas:
124 */
125static int get_update_sysctl_factor(void)
126{
127 unsigned int cpus = min_t(int, num_online_cpus(), 8);
128 unsigned int factor;
129
130 switch (sysctl_sched_tunable_scaling) {
131 case SCHED_TUNABLESCALING_NONE:
132 factor = 1;
133 break;
134 case SCHED_TUNABLESCALING_LINEAR:
135 factor = cpus;
136 break;
137 case SCHED_TUNABLESCALING_LOG:
138 default:
139 factor = 1 + ilog2(cpus);
140 break;
141 }
142
143 return factor;
144}
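/*
 * Stand-alone sketch (assumed defaults, not part of this file): with the
 * default logarithmic scaling, an 8-CPU machine gets factor = 1 + ilog2(8)
 * = 4, so the 6ms base latency scales to 24ms and the 0.75ms minimum
 * granularity to 3ms.
 */
#include <stdio.h>

static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int cpus = 8;			/* assumed CPU count */
	unsigned int capped = cpus < 8 ? cpus : 8;
	unsigned int factor = 1 + ilog2_u(capped);

	printf("factor                : %u\n", factor);
	printf("sched_latency         : %u ns\n", factor * 6000000U);
	printf("sched_min_granularity : %u ns\n", factor * 750000U);
	return 0;
}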
145
146static void update_sysctl(void)
147{
148 unsigned int factor = get_update_sysctl_factor();
149
150#define SET_SYSCTL(name) \
151 (sysctl_##name = (factor) * normalized_sysctl_##name)
152 SET_SYSCTL(sched_min_granularity);
153 SET_SYSCTL(sched_latency);
154 SET_SYSCTL(sched_wakeup_granularity);
155#undef SET_SYSCTL
156}
157
158void sched_init_granularity(void)
159{
160 update_sysctl();
161}
162
163#if BITS_PER_LONG == 32
164# define WMULT_CONST (~0UL)
165#else
166# define WMULT_CONST (1UL << 32)
167#endif
168
169#define WMULT_SHIFT 32
170
171/*
172 * Shift right and round:
173 */
174#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
175
176/*
177 * delta *= weight / lw
178 */
179static unsigned long
180calc_delta_mine(unsigned long delta_exec, unsigned long weight,
181 struct load_weight *lw)
182{
183 u64 tmp;
184
185 /*
186 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
187 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
188 * 2^SCHED_LOAD_RESOLUTION.
189 */
190 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
191 tmp = (u64)delta_exec * scale_load_down(weight);
192 else
193 tmp = (u64)delta_exec;
194
195 if (!lw->inv_weight) {
196 unsigned long w = scale_load_down(lw->weight);
197
198 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
199 lw->inv_weight = 1;
200 else if (unlikely(!w))
201 lw->inv_weight = WMULT_CONST;
202 else
203 lw->inv_weight = WMULT_CONST / w;
204 }
205
206 /*
207 * Check whether we'd overflow the 64-bit multiplication:
208 */
209 if (unlikely(tmp > WMULT_CONST))
210 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
211 WMULT_SHIFT/2);
212 else
213 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
214
215 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
216}
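/*
 * Stand-alone sketch of the reciprocal trick above (assumed example
 * values, not part of this file): rather than dividing by lw->weight on
 * every call, 2^32/weight is cached as inv_weight and the division
 * becomes a multiply followed by a rounded 32-bit shift (SRR). The
 * kernel version additionally splits the shift when the intermediate
 * product risks overflowing 64 bits.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta_exec = 4000000;			/* 4ms of runtime, in ns */
	uint64_t weight = 1024;				/* nice-0 weight */
	uint64_t lw_weight = 3072;			/* assumed runqueue weight */
	uint64_t inv_weight = 0xffffffffULL / lw_weight;	/* ~2^32 / lw */
	uint64_t tmp = delta_exec * weight;
	uint64_t approx = (tmp * inv_weight + (1ULL << 31)) >> 32;

	printf("exact  : %llu\n", (unsigned long long)(tmp / lw_weight));
	printf("approx : %llu\n", (unsigned long long)approx);
	return 0;
}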
217
218
219const struct sched_class fair_sched_class;
220
221/**************************************************************
222 * CFS operations on generic schedulable entities:
223 */
224
225#ifdef CONFIG_FAIR_GROUP_SCHED
226
227/* cpu runqueue to which this cfs_rq is attached */
228static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
229{
230 return cfs_rq->rq;
231}
232
233/* An entity is a task if it doesn't "own" a runqueue */
234#define entity_is_task(se) (!se->my_q)
235
236static inline struct task_struct *task_of(struct sched_entity *se)
237{
238#ifdef CONFIG_SCHED_DEBUG
239 WARN_ON_ONCE(!entity_is_task(se));
240#endif
241 return container_of(se, struct task_struct, se);
242}
243
244/* Walk up scheduling entities hierarchy */
245#define for_each_sched_entity(se) \
246 for (; se; se = se->parent)
247
248static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
249{
250 return p->se.cfs_rq;
251}
252
253/* runqueue on which this entity is (to be) queued */
254static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
255{
256 return se->cfs_rq;
257}
258
259/* runqueue "owned" by this group */
260static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
261{
262 return grp->my_q;
263}
264
265static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
266 int force_update);
267
268static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
269{
270 if (!cfs_rq->on_list) {
271 /*
272 * Ensure we either appear before our parent (if already
273 * enqueued) or force our parent to appear after us when it is
274 * enqueued. The fact that we always enqueue bottom-up
275 * reduces this to two cases.
276 */
277 if (cfs_rq->tg->parent &&
278 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
279 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
280 &rq_of(cfs_rq)->leaf_cfs_rq_list);
281 } else {
282 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
283 &rq_of(cfs_rq)->leaf_cfs_rq_list);
284 }
285
286 cfs_rq->on_list = 1;
287 /* We should have no load, but we need to update last_decay. */
288 update_cfs_rq_blocked_load(cfs_rq, 0);
289 }
290}
291
292static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
293{
294 if (cfs_rq->on_list) {
295 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
296 cfs_rq->on_list = 0;
297 }
298}
299
300/* Iterate through all leaf cfs_rq's on a runqueue */
301#define for_each_leaf_cfs_rq(rq, cfs_rq) \
302 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
303
304/* Do the two (enqueued) entities belong to the same group? */
305static inline int
306is_same_group(struct sched_entity *se, struct sched_entity *pse)
307{
308 if (se->cfs_rq == pse->cfs_rq)
309 return 1;
310
311 return 0;
312}
313
314static inline struct sched_entity *parent_entity(struct sched_entity *se)
315{
316 return se->parent;
317}
318
319/* return depth at which a sched entity is present in the hierarchy */
320static inline int depth_se(struct sched_entity *se)
321{
322 int depth = 0;
323
324 for_each_sched_entity(se)
325 depth++;
326
327 return depth;
328}
329
330static void
331find_matching_se(struct sched_entity **se, struct sched_entity **pse)
332{
333 int se_depth, pse_depth;
334
335 /*
336	 * A preemption test can be made between sibling entities that are in the
337	 * same cfs_rq, i.e. that have a common parent. Walk up the hierarchy of
338	 * both tasks until we find their ancestors that are siblings, i.e.
339	 * children of a common parent.
340 */
341
342 /* First walk up until both entities are at same depth */
343 se_depth = depth_se(*se);
344 pse_depth = depth_se(*pse);
345
346 while (se_depth > pse_depth) {
347 se_depth--;
348 *se = parent_entity(*se);
349 }
350
351 while (pse_depth > se_depth) {
352 pse_depth--;
353 *pse = parent_entity(*pse);
354 }
355
356 while (!is_same_group(*se, *pse)) {
357 *se = parent_entity(*se);
358 *pse = parent_entity(*pse);
359 }
360}
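/*
 * Generic stand-alone sketch of the depth-matching walk above, using a
 * toy parent-pointer chain instead of sched entities: bring both nodes
 * to the same depth, then step both upwards until they are siblings.
 */
#include <stdio.h>
#include <stddef.h>

struct node { struct node *parent; int depth; };

static void match(struct node **a, struct node **b)
{
	while ((*a)->depth > (*b)->depth)
		*a = (*a)->parent;
	while ((*b)->depth > (*a)->depth)
		*b = (*b)->parent;
	while ((*a)->parent != (*b)->parent) {
		*a = (*a)->parent;
		*b = (*b)->parent;
	}
}

int main(void)
{
	struct node root = { NULL, 0 };
	struct node g1 = { &root, 1 }, g2 = { &root, 1 };
	struct node t1 = { &g1, 2 };
	struct node *a = &t1, *b = &g2;

	match(&a, &b);
	printf("matched depths: %d %d\n", a->depth, b->depth);	/* 1 1 */
	return 0;
}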
361
362#else /* !CONFIG_FAIR_GROUP_SCHED */
363
364static inline struct task_struct *task_of(struct sched_entity *se)
365{
366 return container_of(se, struct task_struct, se);
367}
368
369static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
370{
371 return container_of(cfs_rq, struct rq, cfs);
372}
373
374#define entity_is_task(se) 1
375
376#define for_each_sched_entity(se) \
377 for (; se; se = NULL)
378
379static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
380{
381 return &task_rq(p)->cfs;
382}
383
384static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
385{
386 struct task_struct *p = task_of(se);
387 struct rq *rq = task_rq(p);
388
389 return &rq->cfs;
390}
391
392/* runqueue "owned" by this group */
393static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
394{
395 return NULL;
396}
397
398static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
399{
400}
401
402static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
403{
404}
405
406#define for_each_leaf_cfs_rq(rq, cfs_rq) \
407 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
408
409static inline int
410is_same_group(struct sched_entity *se, struct sched_entity *pse)
411{
412 return 1;
413}
414
415static inline struct sched_entity *parent_entity(struct sched_entity *se)
416{
417 return NULL;
418}
419
420static inline void
421find_matching_se(struct sched_entity **se, struct sched_entity **pse)
422{
423}
424
425#endif /* CONFIG_FAIR_GROUP_SCHED */
426
427static __always_inline
428void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
429
430/**************************************************************
431 * Scheduling class tree data structure manipulation methods:
432 */
433
434static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
435{
436 s64 delta = (s64)(vruntime - min_vruntime);
437 if (delta > 0)
438 min_vruntime = vruntime;
439
440 return min_vruntime;
441}
442
443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
444{
445 s64 delta = (s64)(vruntime - min_vruntime);
446 if (delta < 0)
447 min_vruntime = vruntime;
448
449 return min_vruntime;
450}
451
452static inline int entity_before(struct sched_entity *a,
453 struct sched_entity *b)
454{
455 return (s64)(a->vruntime - b->vruntime) < 0;
456}
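/*
 * Small stand-alone sketch (assumed values) of why the comparison above
 * uses a signed difference: vruntime is an unsigned 64-bit counter that
 * may wrap, and (s64)(a - b) < 0 keeps ordering correct across the wrap
 * as long as the two values stay within 2^63 of each other.
 */
#include <stdio.h>
#include <stdint.h>

static int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t a = UINT64_MAX - 100;	/* just before the wrap */
	uint64_t b = 50;		/* just after the wrap */

	/* A naive a < b says 0 here; the signed difference says 1. */
	printf("naive a < b : %d\n", a < b);
	printf("before(a, b): %d\n", before(a, b));
	return 0;
}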
457
458static void update_min_vruntime(struct cfs_rq *cfs_rq)
459{
460 u64 vruntime = cfs_rq->min_vruntime;
461
462 if (cfs_rq->curr)
463 vruntime = cfs_rq->curr->vruntime;
464
465 if (cfs_rq->rb_leftmost) {
466 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
467 struct sched_entity,
468 run_node);
469
470 if (!cfs_rq->curr)
471 vruntime = se->vruntime;
472 else
473 vruntime = min_vruntime(vruntime, se->vruntime);
474 }
475
476 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477#ifndef CONFIG_64BIT
478 smp_wmb();
479 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
480#endif
481}
482
483/*
484 * Enqueue an entity into the rb-tree:
485 */
486static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
487{
488 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
489 struct rb_node *parent = NULL;
490 struct sched_entity *entry;
491 int leftmost = 1;
492
493 /*
494 * Find the right place in the rbtree:
495 */
496 while (*link) {
497 parent = *link;
498 entry = rb_entry(parent, struct sched_entity, run_node);
499 /*
500		 * We don't care about collisions. Nodes with
501 * the same key stay together.
502 */
503 if (entity_before(se, entry)) {
504 link = &parent->rb_left;
505 } else {
506 link = &parent->rb_right;
507 leftmost = 0;
508 }
509 }
510
511 /*
512 * Maintain a cache of leftmost tree entries (it is frequently
513 * used):
514 */
515 if (leftmost)
516 cfs_rq->rb_leftmost = &se->run_node;
517
518 rb_link_node(&se->run_node, parent, link);
519 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
520}
521
522static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
523{
524 if (cfs_rq->rb_leftmost == &se->run_node) {
525 struct rb_node *next_node;
526
527 next_node = rb_next(&se->run_node);
528 cfs_rq->rb_leftmost = next_node;
529 }
530
531 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
532}
533
534struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
535{
536 struct rb_node *left = cfs_rq->rb_leftmost;
537
538 if (!left)
539 return NULL;
540
541 return rb_entry(left, struct sched_entity, run_node);
542}
543
544static struct sched_entity *__pick_next_entity(struct sched_entity *se)
545{
546 struct rb_node *next = rb_next(&se->run_node);
547
548 if (!next)
549 return NULL;
550
551 return rb_entry(next, struct sched_entity, run_node);
552}
553
554#ifdef CONFIG_SCHED_DEBUG
555struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
556{
557 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
558
559 if (!last)
560 return NULL;
561
562 return rb_entry(last, struct sched_entity, run_node);
563}
564
565/**************************************************************
566 * Scheduling class statistics methods:
567 */
568
569int sched_proc_update_handler(struct ctl_table *table, int write,
570 void __user *buffer, size_t *lenp,
571 loff_t *ppos)
572{
573 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
574 int factor = get_update_sysctl_factor();
575
576 if (ret || !write)
577 return ret;
578
579 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
580 sysctl_sched_min_granularity);
581
582#define WRT_SYSCTL(name) \
583 (normalized_sysctl_##name = sysctl_##name / (factor))
584 WRT_SYSCTL(sched_min_granularity);
585 WRT_SYSCTL(sched_latency);
586 WRT_SYSCTL(sched_wakeup_granularity);
587#undef WRT_SYSCTL
588
589 return 0;
590}
591#endif
592
593/*
594 * delta /= w
595 */
596static inline unsigned long
597calc_delta_fair(unsigned long delta, struct sched_entity *se)
598{
599 if (unlikely(se->load.weight != NICE_0_LOAD))
600 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
601
602 return delta;
603}
604
605/*
606 * The idea is to set a period in which each task runs once.
607 *
608 * When there are too many tasks (sched_nr_latency) we have to stretch
609 * this period because otherwise the slices get too small.
610 *
611 * p = (nr <= nl) ? l : l*nr/nl
612 */
613static u64 __sched_period(unsigned long nr_running)
614{
615 u64 period = sysctl_sched_latency;
616 unsigned long nr_latency = sched_nr_latency;
617
618 if (unlikely(nr_running > nr_latency)) {
619 period = sysctl_sched_min_granularity;
620 period *= nr_running;
621 }
622
623 return period;
624}
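/*
 * Worked example of the period formula above, using the default
 * unscaled tunables as assumptions (latency 6ms, min granularity
 * 0.75ms, nr_latency 8): up to 8 runnable tasks share the 6ms period,
 * beyond that the period stretches to nr_running * 0.75ms.
 */
#include <stdio.h>

static unsigned long long period_ns(unsigned long nr_running)
{
	unsigned long long latency = 6000000ULL;	/* ns */
	unsigned long long min_gran = 750000ULL;	/* ns */
	unsigned long nr_latency = 8;

	return nr_running <= nr_latency ? latency : min_gran * nr_running;
}

int main(void)
{
	printf("4 tasks : %llu ns\n", period_ns(4));	/* 6000000 */
	printf("12 tasks: %llu ns\n", period_ns(12));	/* 9000000 */
	return 0;
}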
625
626/*
627 * We calculate the wall-time slice from the period by taking a part
628 * proportional to the weight.
629 *
630 * s = p*P[w/rw]
631 */
632static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
633{
634 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
635
636 for_each_sched_entity(se) {
637 struct load_weight *load;
638 struct load_weight lw;
639
640 cfs_rq = cfs_rq_of(se);
641 load = &cfs_rq->load;
642
643 if (unlikely(!se->on_rq)) {
644 lw = cfs_rq->load;
645
646 update_load_add(&lw, se->load.weight);
647 load = &lw;
648 }
649 slice = calc_delta_mine(slice, se->load.weight, load);
650 }
651 return slice;
652}
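/*
 * Numeric sketch of the s = p * w/rw split above with assumed weights:
 * a nice-0 task (weight 1024) sharing a 6ms period with another nice-0
 * task and a nice-5 task (weight 335) gets a slice proportional to its
 * share of the total runqueue weight.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long period = 6000000ULL;		/* ns, from the period above */
	unsigned long w = 1024;				/* this task */
	unsigned long rw = 1024 + 1024 + 335;		/* whole runqueue */

	printf("slice = %llu ns\n", period * w / rw);	/* ~2.58 ms */
	return 0;
}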
653
654/*
655 * We calculate the vruntime slice of a to-be-inserted task.
656 *
657 * vs = s/w
658 */
659static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
660{
661 return calc_delta_fair(sched_slice(cfs_rq, se), se);
662}
663
664/*
665 * Update the current task's runtime statistics. Skip current tasks that
666 * are not in our scheduling class.
667 */
668static inline void
669__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
670 unsigned long delta_exec)
671{
672 unsigned long delta_exec_weighted;
673
674 schedstat_set(curr->statistics.exec_max,
675 max((u64)delta_exec, curr->statistics.exec_max));
676
677 curr->sum_exec_runtime += delta_exec;
678 schedstat_add(cfs_rq, exec_clock, delta_exec);
679 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
680
681 curr->vruntime += delta_exec_weighted;
682 update_min_vruntime(cfs_rq);
683}
684
685static void update_curr(struct cfs_rq *cfs_rq)
686{
687 struct sched_entity *curr = cfs_rq->curr;
688 u64 now = rq_of(cfs_rq)->clock_task;
689 unsigned long delta_exec;
690
691 if (unlikely(!curr))
692 return;
693
694 /*
695 * Get the amount of time the current task was running
696 * since the last time we changed load (this cannot
697 * overflow on 32 bits):
698 */
699 delta_exec = (unsigned long)(now - curr->exec_start);
700 if (!delta_exec)
701 return;
702
703 __update_curr(cfs_rq, curr, delta_exec);
704 curr->exec_start = now;
705
706 if (entity_is_task(curr)) {
707 struct task_struct *curtask = task_of(curr);
708
709 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
710 cpuacct_charge(curtask, delta_exec);
711 account_group_exec_runtime(curtask, delta_exec);
712 }
713
714 account_cfs_rq_runtime(cfs_rq, delta_exec);
715}
716
717static inline void
718update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
719{
720 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
721}
722
723/*
724 * Task is being enqueued - update stats:
725 */
726static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
727{
728 /*
729 * Are we enqueueing a waiting task? (for current tasks
730 * a dequeue/enqueue event is a NOP)
731 */
732 if (se != cfs_rq->curr)
733 update_stats_wait_start(cfs_rq, se);
734}
735
736static void
737update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
738{
739 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
740 rq_of(cfs_rq)->clock - se->statistics.wait_start));
741 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
742 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
743 rq_of(cfs_rq)->clock - se->statistics.wait_start);
744#ifdef CONFIG_SCHEDSTATS
745 if (entity_is_task(se)) {
746 trace_sched_stat_wait(task_of(se),
747 rq_of(cfs_rq)->clock - se->statistics.wait_start);
748 }
749#endif
750 schedstat_set(se->statistics.wait_start, 0);
751}
752
753static inline void
754update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
755{
756 /*
757 * Mark the end of the wait period if dequeueing a
758 * waiting task:
759 */
760 if (se != cfs_rq->curr)
761 update_stats_wait_end(cfs_rq, se);
762}
763
764/*
765 * We are picking a new current task - update its stats:
766 */
767static inline void
768update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
769{
770 /*
771 * We are starting a new run period:
772 */
773 se->exec_start = rq_of(cfs_rq)->clock_task;
774}
775
776/**************************************************
777 * Scheduling class queueing methods:
778 */
779
780#ifdef CONFIG_NUMA_BALANCING
781/*
782 * numa task sample period in ms
783 */
784unsigned int sysctl_numa_balancing_scan_period_min = 100;
785unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
786unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
787
788/* Portion of address space to scan in MB */
789unsigned int sysctl_numa_balancing_scan_size = 256;
790
791/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792unsigned int sysctl_numa_balancing_scan_delay = 1000;
793
794static void task_numa_placement(struct task_struct *p)
795{
796 int seq;
797
798 if (!p->mm) /* for example, ksmd faulting in a user's mm */
799 return;
800 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
801 if (p->numa_scan_seq == seq)
802 return;
803 p->numa_scan_seq = seq;
804
805 /* FIXME: Scheduling placement policy hints go here */
806}
807
808/*
809 * Got a PROT_NONE fault for a page on @node.
810 */
811void task_numa_fault(int node, int pages, bool migrated)
812{
813 struct task_struct *p = current;
814
815 if (!sched_feat_numa(NUMA))
816 return;
817
818 /* FIXME: Allocate task-specific structure for placement policy here */
819
820 /*
821 * If pages are properly placed (did not migrate) then scan slower.
822 * This is reset periodically in case of phase changes
823 */
824 if (!migrated)
825 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
826 p->numa_scan_period + jiffies_to_msecs(10));
827
828 task_numa_placement(p);
829}
830
831static void reset_ptenuma_scan(struct task_struct *p)
832{
833 ACCESS_ONCE(p->mm->numa_scan_seq)++;
834 p->mm->numa_scan_offset = 0;
835}
836
837/*
838 * The expensive part of numa migration is done from task_work context.
839 * Triggered from task_tick_numa().
840 */
841void task_numa_work(struct callback_head *work)
842{
843 unsigned long migrate, next_scan, now = jiffies;
844 struct task_struct *p = current;
845 struct mm_struct *mm = p->mm;
846 struct vm_area_struct *vma;
847 unsigned long start, end;
848 long pages;
849
850 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
851
852 work->next = work; /* protect against double add */
853 /*
854 * Who cares about NUMA placement when they're dying.
855 *
856 * NOTE: make sure not to dereference p->mm before this check,
857 * exit_task_work() happens _after_ exit_mm() so we could be called
858 * without p->mm even though we still had it when we enqueued this
859 * work.
860 */
861 if (p->flags & PF_EXITING)
862 return;
863
864 /*
865 * We do not care about task placement until a task runs on a node
866 * other than the first one used by the address space. This is
867 * largely because migrations are driven by what CPU the task
868 * is running on. If it's never scheduled on another node, it'll
869 * not migrate so why bother trapping the fault.
870 */
871 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
872 mm->first_nid = numa_node_id();
873 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
874 /* Are we running on a new node yet? */
875 if (numa_node_id() == mm->first_nid &&
876 !sched_feat_numa(NUMA_FORCE))
877 return;
878
879 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
880 }
881
882 /*
883 * Reset the scan period if enough time has gone by. Objective is that
884 * scanning will be reduced if pages are properly placed. As tasks
885 * can enter different phases this needs to be re-examined. Lacking
886 * proper tracking of reference behaviour, this blunt hammer is used.
887 */
888 migrate = mm->numa_next_reset;
889 if (time_after(now, migrate)) {
890 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
891 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
892 xchg(&mm->numa_next_reset, next_scan);
893 }
894
895 /*
896 * Enforce maximal scan/migration frequency..
897 */
898 migrate = mm->numa_next_scan;
899 if (time_before(now, migrate))
900 return;
901
902 if (p->numa_scan_period == 0)
903 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
904
905 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
906 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
907 return;
908
909 /*
910 * Do not set pte_numa if the current running node is rate-limited.
911 * This loses statistics on the fault but if we are unwilling to
912	 * migrate to this node, it is less likely we can do useful work.
913 */
914 if (migrate_ratelimited(numa_node_id()))
915 return;
916
917 start = mm->numa_scan_offset;
918 pages = sysctl_numa_balancing_scan_size;
919 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
920 if (!pages)
921 return;
922
923 down_read(&mm->mmap_sem);
924 vma = find_vma(mm, start);
925 if (!vma) {
926 reset_ptenuma_scan(p);
927 start = 0;
928 vma = mm->mmap;
929 }
930 for (; vma; vma = vma->vm_next) {
931 if (!vma_migratable(vma))
932 continue;
933
934 /* Skip small VMAs. They are not likely to be of relevance */
935 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
936 continue;
937
938 do {
939 start = max(start, vma->vm_start);
940 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
941 end = min(end, vma->vm_end);
942 pages -= change_prot_numa(vma, start, end);
943
944 start = end;
945 if (pages <= 0)
946 goto out;
947 } while (end != vma->vm_end);
948 }
949
950out:
951 /*
952 * It is possible to reach the end of the VMA list but the last few VMAs are
953	 * not guaranteed to be vma_migratable. If they are not, we would find the
954 * !migratable VMA on the next scan but not reset the scanner to the start
955 * so check it now.
956 */
957 if (vma)
958 mm->numa_scan_offset = start;
959 else
960 reset_ptenuma_scan(p);
961 up_read(&mm->mmap_sem);
962}
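/*
 * Quick sketch of the scan-window arithmetic above, with assumed
 * defaults: the 256MB sysctl is converted to base pages via the
 * "<< (20 - PAGE_SHIFT)" step, and those pages are then walked in
 * HPAGE_SIZE-aligned chunks across the migratable VMAs.
 */
#include <stdio.h>

int main(void)
{
	unsigned long scan_size_mb = 256;	/* sysctl_numa_balancing_scan_size */
	unsigned long page_shift = 12;		/* 4K pages, assumed */
	unsigned long pages = scan_size_mb << (20 - page_shift);

	printf("pages per scan window: %lu\n", pages);		/* 65536 */
	printf("bytes per scan window: %lu MB\n",
	       (pages << page_shift) >> 20);			/* 256 */
	return 0;
}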
963
964/*
965 * Drive the periodic memory faults..
966 */
967void task_tick_numa(struct rq *rq, struct task_struct *curr)
968{
969 struct callback_head *work = &curr->numa_work;
970 u64 period, now;
971
972 /*
973 * We don't care about NUMA placement if we don't have memory.
974 */
975 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
976 return;
977
978 /*
979 * Using runtime rather than walltime has the dual advantage that
980 * we (mostly) drive the selection from busy threads and that the
981 * task needs to have done some actual work before we bother with
982 * NUMA placement.
983 */
984 now = curr->se.sum_exec_runtime;
985 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
986
987 if (now - curr->node_stamp > period) {
988 if (!curr->node_stamp)
989 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
990 curr->node_stamp = now;
991
992 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
993 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
994 task_work_add(curr, work, true);
995 }
996 }
997}
998#else
999static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1000{
1001}
1002#endif /* CONFIG_NUMA_BALANCING */
1003
1004static void
1005account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1006{
1007 update_load_add(&cfs_rq->load, se->load.weight);
1008 if (!parent_entity(se))
1009 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1010#ifdef CONFIG_SMP
1011 if (entity_is_task(se))
1012 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
1013#endif
1014 cfs_rq->nr_running++;
1015}
1016
1017static void
1018account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1019{
1020 update_load_sub(&cfs_rq->load, se->load.weight);
1021 if (!parent_entity(se))
1022 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1023 if (entity_is_task(se))
1024 list_del_init(&se->group_node);
1025 cfs_rq->nr_running--;
1026}
1027
1028#ifdef CONFIG_FAIR_GROUP_SCHED
1029# ifdef CONFIG_SMP
1030static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1031{
1032 long tg_weight;
1033
1034 /*
1035 * Use this CPU's actual weight instead of the last load_contribution
1036 * to gain a more accurate current total weight. See
1037 * update_cfs_rq_load_contribution().
1038 */
1039 tg_weight = atomic64_read(&tg->load_avg);
1040 tg_weight -= cfs_rq->tg_load_contrib;
1041 tg_weight += cfs_rq->load.weight;
1042
1043 return tg_weight;
1044}
1045
1046static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1047{
1048 long tg_weight, load, shares;
1049
1050 tg_weight = calc_tg_weight(tg, cfs_rq);
1051 load = cfs_rq->load.weight;
1052
1053 shares = (tg->shares * load);
1054 if (tg_weight)
1055 shares /= tg_weight;
1056
1057 if (shares < MIN_SHARES)
1058 shares = MIN_SHARES;
1059 if (shares > tg->shares)
1060 shares = tg->shares;
1061
1062 return shares;
1063}
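/*
 * Stand-alone sketch of the share calculation above with assumed
 * numbers: a group with tg->shares = 1024 whose load is split 3:1
 * between two CPUs hands roughly three quarters of its shares to the
 * busier cfs_rq, clamped to [MIN_SHARES, tg->shares].
 */
#include <stdio.h>

int main(void)
{
	long tg_shares = 1024;
	long this_load = 3072;			/* this cfs_rq's load.weight */
	long tg_weight = 3072 + 1024;		/* assumed total across CPUs */
	long min_shares = 2;
	long shares = tg_shares * this_load / tg_weight;

	if (shares < min_shares)
		shares = min_shares;
	if (shares > tg_shares)
		shares = tg_shares;

	printf("shares on this cpu: %ld\n", shares);	/* 768 */
	return 0;
}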
1064# else /* CONFIG_SMP */
1065static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1066{
1067 return tg->shares;
1068}
1069# endif /* CONFIG_SMP */
1070static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1071 unsigned long weight)
1072{
1073 if (se->on_rq) {
1074 /* commit outstanding execution time */
1075 if (cfs_rq->curr == se)
1076 update_curr(cfs_rq);
1077 account_entity_dequeue(cfs_rq, se);
1078 }
1079
1080 update_load_set(&se->load, weight);
1081
1082 if (se->on_rq)
1083 account_entity_enqueue(cfs_rq, se);
1084}
1085
1086static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1087
1088static void update_cfs_shares(struct cfs_rq *cfs_rq)
1089{
1090 struct task_group *tg;
1091 struct sched_entity *se;
1092 long shares;
1093
1094 tg = cfs_rq->tg;
1095 se = tg->se[cpu_of(rq_of(cfs_rq))];
1096 if (!se || throttled_hierarchy(cfs_rq))
1097 return;
1098#ifndef CONFIG_SMP
1099 if (likely(se->load.weight == tg->shares))
1100 return;
1101#endif
1102 shares = calc_cfs_shares(cfs_rq, tg);
1103
1104 reweight_entity(cfs_rq_of(se), se, shares);
1105}
1106#else /* CONFIG_FAIR_GROUP_SCHED */
1107static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1108{
1109}
1110#endif /* CONFIG_FAIR_GROUP_SCHED */
1111
1112/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1113#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1114/*
1115 * We choose a half-life close to 1 scheduling period.
1116 * Note: The tables below are dependent on this value.
1117 */
1118#define LOAD_AVG_PERIOD 32
1119#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1120#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1121
1122/* Precomputed fixed inverse multiplies for multiplication by y^n */
1123static const u32 runnable_avg_yN_inv[] = {
1124 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1125 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1126 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1127 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1128 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1129 0x85aac367, 0x82cd8698,
1130};
1131
1132/*
1133 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1134 * over-estimates when re-combining.
1135 */
1136static const u32 runnable_avg_yN_sum[] = {
1137 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1138 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1139 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1140};
1141
1142/*
1143 * Approximate:
1144 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1145 */
1146static __always_inline u64 decay_load(u64 val, u64 n)
1147{
1148 unsigned int local_n;
1149
1150 if (!n)
1151 return val;
1152 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1153 return 0;
1154
1155 /* after bounds checking we can collapse to 32-bit */
1156 local_n = n;
1157
1158 /*
1159 * As y^PERIOD = 1/2, we can combine
1160 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1161 * With a look-up table which covers k^n (n<PERIOD)
1162 *
1163 * To achieve constant time decay_load.
1164 */
1165 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1166 val >>= local_n / LOAD_AVG_PERIOD;
1167 local_n %= LOAD_AVG_PERIOD;
1168 }
1169
1170 val *= runnable_avg_yN_inv[local_n];
1171 /* We don't use SRR here since we always want to round down. */
1172 return val >> 32;
1173}
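/*
 * Sketch of the decay above with assumed inputs: y is chosen so that
 * y^32 = 1/2, so a contribution of 1024 decayed by 32 periods halves
 * to 512, while 16 periods multiply by runnable_avg_yN_inv[16]
 * (0xb504f333, i.e. ~2^-0.5) and shift down by 32.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t val = 1024;
	uint32_t y_pow_16_inv = 0xb504f333u;	/* from the table above */

	printf("decayed by 32 periods: %llu\n",
	       (unsigned long long)(val >> 1));				/* 512 */
	printf("decayed by 16 periods: %llu\n",
	       (unsigned long long)((val * y_pow_16_inv) >> 32));	/* 724 */
	return 0;
}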
1174
1175/*
1176 * For updates fully spanning n periods, the contribution to runnable
1177 * average will be: \Sum 1024*y^n
1178 *
1179 * We can compute this reasonably efficiently by combining:
1180 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1181 */
1182static u32 __compute_runnable_contrib(u64 n)
1183{
1184 u32 contrib = 0;
1185
1186 if (likely(n <= LOAD_AVG_PERIOD))
1187 return runnable_avg_yN_sum[n];
1188 else if (unlikely(n >= LOAD_AVG_MAX_N))
1189 return LOAD_AVG_MAX;
1190
1191 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1192 do {
1193 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1194 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1195
1196 n -= LOAD_AVG_PERIOD;
1197 } while (n > LOAD_AVG_PERIOD);
1198
1199 contrib = decay_load(contrib, n);
1200 return contrib + runnable_avg_yN_sum[n];
1201}
1202
1203/*
1204 * We can represent the historical contribution to runnable average as the
1205 * coefficients of a geometric series. To do this we sub-divide our runnable
1206 * history into segments of approximately 1ms (1024us); label the segment that
1207 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1208 *
1209 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1210 * p0 p1 p2
1211 * (now) (~1ms ago) (~2ms ago)
1212 *
1213 * Let u_i denote the fraction of p_i that the entity was runnable.
1214 *
1215 * We then designate the fractions u_i as our co-efficients, yielding the
1216 * following representation of historical load:
1217 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1218 *
1219 * We choose y based on the width of a reasonable scheduling period, fixing:
1220 * y^32 = 0.5
1221 *
1222 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1223 * approximately half as much as the contribution to load within the last ms
1224 * (u_0).
1225 *
1226 * When a period "rolls over" and we have new u_0`, multiplying the previous
1227 * sum again by y is sufficient to update:
1228 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1229 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1230 */
1231static __always_inline int __update_entity_runnable_avg(u64 now,
1232 struct sched_avg *sa,
1233 int runnable)
1234{
1235 u64 delta, periods;
1236 u32 runnable_contrib;
1237 int delta_w, decayed = 0;
1238
1239 delta = now - sa->last_runnable_update;
1240 /*
1241 * This should only happen when time goes backwards, which it
1242 * unfortunately does during sched clock init when we swap over to TSC.
1243 */
1244 if ((s64)delta < 0) {
1245 sa->last_runnable_update = now;
1246 return 0;
1247 }
1248
1249 /*
1250 * Use 1024ns as the unit of measurement since it's a reasonable
1251 * approximation of 1us and fast to compute.
1252 */
1253 delta >>= 10;
1254 if (!delta)
1255 return 0;
1256 sa->last_runnable_update = now;
1257
1258 /* delta_w is the amount already accumulated against our next period */
1259 delta_w = sa->runnable_avg_period % 1024;
1260 if (delta + delta_w >= 1024) {
1261 /* period roll-over */
1262 decayed = 1;
1263
1264 /*
1265 * Now that we know we're crossing a period boundary, figure
1266 * out how much from delta we need to complete the current
1267 * period and accrue it.
1268 */
1269 delta_w = 1024 - delta_w;
1270 if (runnable)
1271 sa->runnable_avg_sum += delta_w;
1272 sa->runnable_avg_period += delta_w;
1273
1274 delta -= delta_w;
1275
1276 /* Figure out how many additional periods this update spans */
1277 periods = delta / 1024;
1278 delta %= 1024;
1279
1280 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1281 periods + 1);
1282 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1283 periods + 1);
1284
1285 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1286 runnable_contrib = __compute_runnable_contrib(periods);
1287 if (runnable)
1288 sa->runnable_avg_sum += runnable_contrib;
1289 sa->runnable_avg_period += runnable_contrib;
1290 }
1291
1292 /* Remainder of delta accrued against u_0` */
1293 if (runnable)
1294 sa->runnable_avg_sum += delta;
1295 sa->runnable_avg_period += delta;
1296
1297 return decayed;
1298}
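/*
 * Floating-point sketch (not the fixed-point kernel code) of the
 * recurrence implemented above: each 1024us period multiplies the
 * previous sum by y (with y^32 = 0.5) and adds the new period's
 * runnable time, so an always-runnable entity converges towards
 * LOAD_AVG_MAX (47742 up to fixed-point truncation).
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* y^32 = 0.5 */
	double sum = 0.0;
	int period;

	for (period = 0; period < 345; period++)	/* LOAD_AVG_MAX_N periods */
		sum = sum * y + 1024.0;			/* fully runnable */

	printf("converged runnable sum: %.0f\n", sum);
	return 0;
}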
1299
1300/* Synchronize an entity's decay with its parenting cfs_rq.*/
1301static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1302{
1303 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1304 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1305
1306 decays -= se->avg.decay_count;
1307 if (!decays)
1308 return 0;
1309
1310 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1311 se->avg.decay_count = 0;
1312
1313 return decays;
1314}
1315
1316#ifdef CONFIG_FAIR_GROUP_SCHED
1317static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1318 int force_update)
1319{
1320 struct task_group *tg = cfs_rq->tg;
1321 s64 tg_contrib;
1322
1323 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1324 tg_contrib -= cfs_rq->tg_load_contrib;
1325
1326 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1327 atomic64_add(tg_contrib, &tg->load_avg);
1328 cfs_rq->tg_load_contrib += tg_contrib;
1329 }
1330}
1331
1332/*
1333 * Aggregate cfs_rq runnable averages into an equivalent task_group
1334 * representation for computing load contributions.
1335 */
1336static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1337 struct cfs_rq *cfs_rq)
1338{
1339 struct task_group *tg = cfs_rq->tg;
1340 long contrib;
1341
1342 /* The fraction of a cpu used by this cfs_rq */
1343 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1344 sa->runnable_avg_period + 1);
1345 contrib -= cfs_rq->tg_runnable_contrib;
1346
1347 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1348 atomic_add(contrib, &tg->runnable_avg);
1349 cfs_rq->tg_runnable_contrib += contrib;
1350 }
1351}
1352
1353static inline void __update_group_entity_contrib(struct sched_entity *se)
1354{
1355 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1356 struct task_group *tg = cfs_rq->tg;
1357 int runnable_avg;
1358
1359 u64 contrib;
1360
1361 contrib = cfs_rq->tg_load_contrib * tg->shares;
1362 se->avg.load_avg_contrib = div64_u64(contrib,
1363 atomic64_read(&tg->load_avg) + 1);
1364
1365 /*
1366 * For group entities we need to compute a correction term in the case
1367 * that they are consuming <1 cpu so that we would contribute the same
1368 * load as a task of equal weight.
1369 *
1370 * Explicitly co-ordinating this measurement would be expensive, but
1371	 * fortunately the sum of each cpu's contribution forms a usable
1372 * lower-bound on the true value.
1373 *
1374 * Consider the aggregate of 2 contributions. Either they are disjoint
1375	 * (and the sum represents the true value) or they overlap and we are
1376 * understating by the aggregate of their overlap.
1377 *
1378 * Extending this to N cpus, for a given overlap, the maximum amount we
1379	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
1380 * cpus that overlap for this interval and w_i is the interval width.
1381 *
1382	 * On a small machine, the first term is well-bounded, which bounds the
1383	 * total error since w_i is a subset of the period. Whereas on a
1384	 * larger machine, while this first term can be larger, if w_i is of
1385	 * consequential size it is guaranteed to see n_i*w_i quickly converge to
1386 * our upper bound of 1-cpu.
1387 */
1388 runnable_avg = atomic_read(&tg->runnable_avg);
1389 if (runnable_avg < NICE_0_LOAD) {
1390 se->avg.load_avg_contrib *= runnable_avg;
1391 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1392 }
1393}
1394#else
1395static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1396 int force_update) {}
1397static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1398 struct cfs_rq *cfs_rq) {}
1399static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1400#endif
1401
1402static inline void __update_task_entity_contrib(struct sched_entity *se)
1403{
1404 u32 contrib;
1405
1406 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1407 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1408 contrib /= (se->avg.runnable_avg_period + 1);
1409 se->avg.load_avg_contrib = scale_load(contrib);
1410}
1411
1412/* Compute the current contribution to load_avg by se, return any delta */
1413static long __update_entity_load_avg_contrib(struct sched_entity *se)
1414{
1415 long old_contrib = se->avg.load_avg_contrib;
1416
1417 if (entity_is_task(se)) {
1418 __update_task_entity_contrib(se);
1419 } else {
1420 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1421 __update_group_entity_contrib(se);
1422 }
1423
1424 return se->avg.load_avg_contrib - old_contrib;
1425}
1426
1427static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1428 long load_contrib)
1429{
1430 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1431 cfs_rq->blocked_load_avg -= load_contrib;
1432 else
1433 cfs_rq->blocked_load_avg = 0;
1434}
1435
1436static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1437
1438/* Update a sched_entity's runnable average */
1439static inline void update_entity_load_avg(struct sched_entity *se,
1440 int update_cfs_rq)
1441{
1442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1443 long contrib_delta;
1444 u64 now;
1445
1446 /*
1447 * For a group entity we need to use their owned cfs_rq_clock_task() in
1448 * case they are the parent of a throttled hierarchy.
1449 */
1450 if (entity_is_task(se))
1451 now = cfs_rq_clock_task(cfs_rq);
1452 else
1453 now = cfs_rq_clock_task(group_cfs_rq(se));
1454
1455 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1456 return;
1457
1458 contrib_delta = __update_entity_load_avg_contrib(se);
1459
1460 if (!update_cfs_rq)
1461 return;
1462
1463 if (se->on_rq)
1464 cfs_rq->runnable_load_avg += contrib_delta;
1465 else
1466 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1467}
1468
1469/*
1470 * Decay the load contributed by all blocked children and account this so that
1471 * their contribution may be appropriately discounted when they wake up.
1472 */
1473static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1474{
1475 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1476 u64 decays;
1477
1478 decays = now - cfs_rq->last_decay;
1479 if (!decays && !force_update)
1480 return;
1481
1482 if (atomic64_read(&cfs_rq->removed_load)) {
1483 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1484 subtract_blocked_load_contrib(cfs_rq, removed_load);
1485 }
1486
1487 if (decays) {
1488 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1489 decays);
1490 atomic64_add(decays, &cfs_rq->decay_counter);
1491 cfs_rq->last_decay = now;
1492 }
1493
1494 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1495}
1496
1497static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1498{
1499 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1500 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1501}
1502
1503/* Add the load generated by se into cfs_rq's child load-average */
1504static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1505 struct sched_entity *se,
1506 int wakeup)
1507{
1508 /*
1509	 * We track migrations using entity decay_count <= 0; on a wake-up
1510 * migration we use a negative decay count to track the remote decays
1511 * accumulated while sleeping.
1512 */
1513 if (unlikely(se->avg.decay_count <= 0)) {
1514 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1515 if (se->avg.decay_count) {
1516 /*
1517 * In a wake-up migration we have to approximate the
1518 * time sleeping. This is because we can't synchronize
1519 * clock_task between the two cpus, and it is not
1520 * guaranteed to be read-safe. Instead, we can
1521 * approximate this using our carried decays, which are
1522 * explicitly atomically readable.
1523 */
1524 se->avg.last_runnable_update -= (-se->avg.decay_count)
1525 << 20;
1526 update_entity_load_avg(se, 0);
1527 /* Indicate that we're now synchronized and on-rq */
1528 se->avg.decay_count = 0;
1529 }
1530 wakeup = 0;
1531 } else {
1532 __synchronize_entity_decay(se);
1533 }
1534
1535 /* migrated tasks did not contribute to our blocked load */
1536 if (wakeup) {
1537 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1538 update_entity_load_avg(se, 0);
1539 }
1540
1541 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1542 /* we force update consideration on load-balancer moves */
1543 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1544}
1545
1546/*
1547 * Remove se's load from this cfs_rq's child load-average. If the entity is
1548 * transitioning to a blocked state we track its projected decay using
1549 * blocked_load_avg.
1550 */
1551static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1552 struct sched_entity *se,
1553 int sleep)
1554{
1555 update_entity_load_avg(se, 1);
1556 /* we force update consideration on load-balancer moves */
1557 update_cfs_rq_blocked_load(cfs_rq, !sleep);
1558
1559 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1560 if (sleep) {
1561 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564}
1565#else
1566static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {}
1568static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1569static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1570 struct sched_entity *se,
1571 int wakeup) {}
1572static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1573 struct sched_entity *se,
1574 int sleep) {}
1575static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1576 int force_update) {}
1577#endif
1578
1579static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1580{
1581#ifdef CONFIG_SCHEDSTATS
1582 struct task_struct *tsk = NULL;
1583
1584 if (entity_is_task(se))
1585 tsk = task_of(se);
1586
1587 if (se->statistics.sleep_start) {
1588 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
1589
1590 if ((s64)delta < 0)
1591 delta = 0;
1592
1593 if (unlikely(delta > se->statistics.sleep_max))
1594 se->statistics.sleep_max = delta;
1595
1596 se->statistics.sleep_start = 0;
1597 se->statistics.sum_sleep_runtime += delta;
1598
1599 if (tsk) {
1600 account_scheduler_latency(tsk, delta >> 10, 1);
1601 trace_sched_stat_sleep(tsk, delta);
1602 }
1603 }
1604 if (se->statistics.block_start) {
1605 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1606
1607 if ((s64)delta < 0)
1608 delta = 0;
1609
1610 if (unlikely(delta > se->statistics.block_max))
1611 se->statistics.block_max = delta;
1612
1613 se->statistics.block_start = 0;
1614 se->statistics.sum_sleep_runtime += delta;
1615
1616 if (tsk) {
1617 if (tsk->in_iowait) {
1618 se->statistics.iowait_sum += delta;
1619 se->statistics.iowait_count++;
1620 trace_sched_stat_iowait(tsk, delta);
1621 }
1622
1623 trace_sched_stat_blocked(tsk, delta);
1624
1625 /*
1626 * Blocking time is in units of nanosecs, so shift by
1627 * 20 to get a milliseconds-range estimation of the
1628 * amount of time that the task spent sleeping:
1629 */
1630 if (unlikely(prof_on == SLEEP_PROFILING)) {
1631 profile_hits(SLEEP_PROFILING,
1632 (void *)get_wchan(tsk),
1633 delta >> 20);
1634 }
1635 account_scheduler_latency(tsk, delta >> 10, 0);
1636 }
1637 }
1638#endif
1639}
1640
1641static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1642{
1643#ifdef CONFIG_SCHED_DEBUG
1644 s64 d = se->vruntime - cfs_rq->min_vruntime;
1645
1646 if (d < 0)
1647 d = -d;
1648
1649 if (d > 3*sysctl_sched_latency)
1650 schedstat_inc(cfs_rq, nr_spread_over);
1651#endif
1652}
1653
1654static void
1655place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1656{
1657 u64 vruntime = cfs_rq->min_vruntime;
1658
1659 /*
1660	 * The 'current' period is already promised to the current tasks;
1661	 * however, the extra weight of the new task will slow them down a
1662	 * little, so place the new task so that it fits in the slot that
1663	 * stays open at the end.
1664 */
1665 if (initial && sched_feat(START_DEBIT))
1666 vruntime += sched_vslice(cfs_rq, se);
1667
1668 /* sleeps up to a single latency don't count. */
1669 if (!initial) {
1670 unsigned long thresh = sysctl_sched_latency;
1671
1672 /*
1673 * Halve their sleep time's effect, to allow
1674 * for a gentler effect of sleepers:
1675 */
1676 if (sched_feat(GENTLE_FAIR_SLEEPERS))
1677 thresh >>= 1;
1678
1679 vruntime -= thresh;
1680 }
1681
1682 /* ensure we never gain time by being placed backwards. */
1683 vruntime = max_vruntime(se->vruntime, vruntime);
1684
1685 se->vruntime = vruntime;
1686}
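/*
 * Numeric sketch of the placement rules above, with assumed defaults:
 * a newly forked task is debited one vslice past min_vruntime
 * (START_DEBIT), while a waking sleeper is credited up to half a
 * latency before it (GENTLE_FAIR_SLEEPERS); max_vruntime() then makes
 * sure nobody is placed behind its old vruntime.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long min_vruntime = 100000000ULL;	/* assumed cfs_rq value */
	unsigned long long vslice = 3000000ULL;		/* assumed sched_vslice() */
	unsigned long long latency = 6000000ULL;	/* sysctl_sched_latency */

	printf("forked task  : %llu\n", min_vruntime + vslice);
	printf("woken sleeper: %llu\n", min_vruntime - latency / 2);
	return 0;
}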
1687
1688static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1689
1690static void
1691enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1692{
1693 /*
1694 * Update the normalized vruntime before updating min_vruntime
1695	 * through calling update_curr().
1696 */
1697 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1698 se->vruntime += cfs_rq->min_vruntime;
1699
1700 /*
1701 * Update run-time statistics of the 'current'.
1702 */
1703 update_curr(cfs_rq);
1704 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1705 account_entity_enqueue(cfs_rq, se);
1706 update_cfs_shares(cfs_rq);
1707
1708 if (flags & ENQUEUE_WAKEUP) {
1709 place_entity(cfs_rq, se, 0);
1710 enqueue_sleeper(cfs_rq, se);
1711 }
1712
1713 update_stats_enqueue(cfs_rq, se);
1714 check_spread(cfs_rq, se);
1715 if (se != cfs_rq->curr)
1716 __enqueue_entity(cfs_rq, se);
1717 se->on_rq = 1;
1718
1719 if (cfs_rq->nr_running == 1) {
1720 list_add_leaf_cfs_rq(cfs_rq);
1721 check_enqueue_throttle(cfs_rq);
1722 }
1723}
1724
1725static void __clear_buddies_last(struct sched_entity *se)
1726{
1727 for_each_sched_entity(se) {
1728 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1729 if (cfs_rq->last == se)
1730 cfs_rq->last = NULL;
1731 else
1732 break;
1733 }
1734}
1735
1736static void __clear_buddies_next(struct sched_entity *se)
1737{
1738 for_each_sched_entity(se) {
1739 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1740 if (cfs_rq->next == se)
1741 cfs_rq->next = NULL;
1742 else
1743 break;
1744 }
1745}
1746
1747static void __clear_buddies_skip(struct sched_entity *se)
1748{
1749 for_each_sched_entity(se) {
1750 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1751 if (cfs_rq->skip == se)
1752 cfs_rq->skip = NULL;
1753 else
1754 break;
1755 }
1756}
1757
1758static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1759{
1760 if (cfs_rq->last == se)
1761 __clear_buddies_last(se);
1762
1763 if (cfs_rq->next == se)
1764 __clear_buddies_next(se);
1765
1766 if (cfs_rq->skip == se)
1767 __clear_buddies_skip(se);
1768}
1769
1770static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1771
1772static void
1773dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1774{
1775 /*
1776 * Update run-time statistics of the 'current'.
1777 */
1778 update_curr(cfs_rq);
1779 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1780
1781 update_stats_dequeue(cfs_rq, se);
1782 if (flags & DEQUEUE_SLEEP) {
1783#ifdef CONFIG_SCHEDSTATS
1784 if (entity_is_task(se)) {
1785 struct task_struct *tsk = task_of(se);
1786
1787 if (tsk->state & TASK_INTERRUPTIBLE)
1788 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1789 if (tsk->state & TASK_UNINTERRUPTIBLE)
1790 se->statistics.block_start = rq_of(cfs_rq)->clock;
1791 }
1792#endif
1793 }
1794
1795 clear_buddies(cfs_rq, se);
1796
1797 if (se != cfs_rq->curr)
1798 __dequeue_entity(cfs_rq, se);
1799 se->on_rq = 0;
1800 account_entity_dequeue(cfs_rq, se);
1801
1802 /*
1803 * Normalize the entity after updating the min_vruntime because the
1804 * update can refer to the ->curr item and we need to reflect this
1805 * movement in our normalized position.
1806 */
1807 if (!(flags & DEQUEUE_SLEEP))
1808 se->vruntime -= cfs_rq->min_vruntime;
1809
1810 /* return excess runtime on last dequeue */
1811 return_cfs_rq_runtime(cfs_rq);
1812
1813 update_min_vruntime(cfs_rq);
1814 update_cfs_shares(cfs_rq);
1815}
1816
1817/*
1818 * Preempt the current task with a newly woken task if needed:
1819 */
1820static void
1821check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1822{
1823 unsigned long ideal_runtime, delta_exec;
1824 struct sched_entity *se;
1825 s64 delta;
1826
1827 ideal_runtime = sched_slice(cfs_rq, curr);
1828 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1829 if (delta_exec > ideal_runtime) {
1830 resched_task(rq_of(cfs_rq)->curr);
1831 /*
1832		 * The current task ran long enough; ensure it doesn't get
1833 * re-elected due to buddy favours.
1834 */
1835 clear_buddies(cfs_rq, curr);
1836 return;
1837 }
1838
1839 /*
1840 * Ensure that a task that missed wakeup preemption by a
1841 * narrow margin doesn't have to wait for a full slice.
1842 * This also mitigates buddy induced latencies under load.
1843 */
1844 if (delta_exec < sysctl_sched_min_granularity)
1845 return;
1846
1847 se = __pick_first_entity(cfs_rq);
1848 delta = curr->vruntime - se->vruntime;
1849
1850 if (delta < 0)
1851 return;
1852
1853 if (delta > ideal_runtime)
1854 resched_task(rq_of(cfs_rq)->curr);
1855}
1856
1857static void
1858set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1859{
1860 /* 'current' is not kept within the tree. */
1861 if (se->on_rq) {
1862 /*
1863		 * Any task has to be enqueued before it gets to execute on
1864 * a CPU. So account for the time it spent waiting on the
1865 * runqueue.
1866 */
1867 update_stats_wait_end(cfs_rq, se);
1868 __dequeue_entity(cfs_rq, se);
1869 }
1870
1871 update_stats_curr_start(cfs_rq, se);
1872 cfs_rq->curr = se;
1873#ifdef CONFIG_SCHEDSTATS
1874 /*
1875 * Track our maximum slice length, if the CPU's load is at
1876	 * least twice that of our own weight (i.e. don't track it
1877 * when there are only lesser-weight tasks around):
1878 */
1879 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1880 se->statistics.slice_max = max(se->statistics.slice_max,
1881 se->sum_exec_runtime - se->prev_sum_exec_runtime);
1882 }
1883#endif
1884 se->prev_sum_exec_runtime = se->sum_exec_runtime;
1885}
1886
1887static int
1888wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1889
1890/*
1891 * Pick the next process, keeping these things in mind, in this order:
1892 * 1) keep things fair between processes/task groups
1893 * 2) pick the "next" process, since someone really wants that to run
1894 * 3) pick the "last" process, for cache locality
1895 * 4) do not run the "skip" process, if something else is available
1896 */
1897static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1898{
1899 struct sched_entity *se = __pick_first_entity(cfs_rq);
1900 struct sched_entity *left = se;
1901
1902 /*
1903 * Avoid running the skip buddy, if running something else can
1904 * be done without getting too unfair.
1905 */
1906 if (cfs_rq->skip == se) {
1907 struct sched_entity *second = __pick_next_entity(se);
1908 if (second && wakeup_preempt_entity(second, left) < 1)
1909 se = second;
1910 }
1911
1912 /*
1913 * Prefer last buddy, try to return the CPU to a preempted task.
1914 */
1915 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1916 se = cfs_rq->last;
1917
1918 /*
1919 * Someone really wants this to run. If it's not unfair, run it.
1920 */
1921 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1922 se = cfs_rq->next;
1923
1924 clear_buddies(cfs_rq, se);
1925
1926 return se;
1927}
1928
1929static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1930
1931static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1932{
1933 /*
1934 * If still on the runqueue then deactivate_task()
1935 * was not called and update_curr() has to be done:
1936 */
1937 if (prev->on_rq)
1938 update_curr(cfs_rq);
1939
1940 /* throttle cfs_rqs exceeding runtime */
1941 check_cfs_rq_runtime(cfs_rq);
1942
1943 check_spread(cfs_rq, prev);
1944 if (prev->on_rq) {
1945 update_stats_wait_start(cfs_rq, prev);
1946 /* Put 'current' back into the tree. */
1947 __enqueue_entity(cfs_rq, prev);
1948 /* in !on_rq case, update occurred at dequeue */
1949 update_entity_load_avg(prev, 1);
1950 }
1951 cfs_rq->curr = NULL;
1952}
1953
1954static void
1955entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1956{
1957 /*
1958 * Update run-time statistics of the 'current'.
1959 */
1960 update_curr(cfs_rq);
1961
1962 /*
1963 * Ensure that runnable average is periodically updated.
1964 */
1965 update_entity_load_avg(curr, 1);
1966 update_cfs_rq_blocked_load(cfs_rq, 1);
1967
1968#ifdef CONFIG_SCHED_HRTICK
1969 /*
1970 * queued ticks are scheduled to match the slice, so don't bother
1971 * validating it and just reschedule.
1972 */
1973 if (queued) {
1974 resched_task(rq_of(cfs_rq)->curr);
1975 return;
1976 }
1977 /*
1978 * don't let the period tick interfere with the hrtick preemption
1979 */
1980 if (!sched_feat(DOUBLE_TICK) &&
1981 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1982 return;
1983#endif
1984
1985 if (cfs_rq->nr_running > 1)
1986 check_preempt_tick(cfs_rq, curr);
1987}
1988
1989
1990/**************************************************
1991 * CFS bandwidth control machinery
1992 */
1993
1994#ifdef CONFIG_CFS_BANDWIDTH
1995
1996#ifdef HAVE_JUMP_LABEL
1997static struct static_key __cfs_bandwidth_used;
1998
1999static inline bool cfs_bandwidth_used(void)
2000{
2001 return static_key_false(&__cfs_bandwidth_used);
2002}
2003
2004void account_cfs_bandwidth_used(int enabled, int was_enabled)
2005{
2006 /* only need to count groups transitioning between enabled/!enabled */
2007 if (enabled && !was_enabled)
2008 static_key_slow_inc(&__cfs_bandwidth_used);
2009 else if (!enabled && was_enabled)
2010 static_key_slow_dec(&__cfs_bandwidth_used);
2011}
2012#else /* HAVE_JUMP_LABEL */
2013static bool cfs_bandwidth_used(void)
2014{
2015 return true;
2016}
2017
2018void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2019#endif /* HAVE_JUMP_LABEL */
2020
2021/*
2022 * default period for cfs group bandwidth.
2023 * default: 0.1s, units: nanoseconds
2024 */
2025static inline u64 default_cfs_period(void)
2026{
2027 return 100000000ULL;
2028}
2029
2030static inline u64 sched_cfs_bandwidth_slice(void)
2031{
2032 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2033}
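
/*
 * Worked example (illustrative; the interface names below belong to the
 * cgroup cpu controller, not to this file): quota and period are exposed
 * as cpu.cfs_quota_us and cpu.cfs_period_us. With the 100ms default period
 * above and e.g. a 50ms quota, the group is capped at roughly half a CPU
 * per period. Runtime is then handed to individual cfs_rqs in slices of
 * sched_cfs_bandwidth_slice(), typically 5ms with the default sysctl.
 */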
2034
2035/*
2036 * Replenish runtime according to assigned quota and update expiration time.
2037 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2038 * additional synchronization around rq->lock.
2039 *
2040 * requires cfs_b->lock
2041 */
2042void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2043{
2044 u64 now;
2045
2046 if (cfs_b->quota == RUNTIME_INF)
2047 return;
2048
2049 now = sched_clock_cpu(smp_processor_id());
2050 cfs_b->runtime = cfs_b->quota;
2051 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2052}
2053
2054static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2055{
2056 return &tg->cfs_bandwidth;
2057}
2058
2059 /* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2060static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2061{
2062 if (unlikely(cfs_rq->throttle_count))
2063 return cfs_rq->throttled_clock_task;
2064
2065 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2066}
2067
2068/* returns 0 on failure to allocate runtime */
2069static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2070{
2071 struct task_group *tg = cfs_rq->tg;
2072 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2073 u64 amount = 0, min_amount, expires;
2074
2075 /* note: this is a positive sum as runtime_remaining <= 0 */
2076 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2077
2078 raw_spin_lock(&cfs_b->lock);
2079 if (cfs_b->quota == RUNTIME_INF)
2080 amount = min_amount;
2081 else {
2082 /*
2083 * If the bandwidth pool has become inactive, then at least one
2084 * period must have elapsed since the last consumption.
2085 * Refresh the global state and ensure bandwidth timer becomes
2086 * active.
2087 */
2088 if (!cfs_b->timer_active) {
2089 __refill_cfs_bandwidth_runtime(cfs_b);
2090 __start_cfs_bandwidth(cfs_b);
2091 }
2092
2093 if (cfs_b->runtime > 0) {
2094 amount = min(cfs_b->runtime, min_amount);
2095 cfs_b->runtime -= amount;
2096 cfs_b->idle = 0;
2097 }
2098 }
2099 expires = cfs_b->runtime_expires;
2100 raw_spin_unlock(&cfs_b->lock);
2101
2102 cfs_rq->runtime_remaining += amount;
2103 /*
2104 * we may have advanced our local expiration to account for allowed
2105 * spread between our sched_clock and the one on which runtime was
2106 * issued.
2107 */
2108 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2109 cfs_rq->runtime_expires = expires;
2110
2111 return cfs_rq->runtime_remaining > 0;
2112}
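
/*
 * Worked example for min_amount above (illustrative values): if this
 * cfs_rq has overdrawn its local pool by 2ms (runtime_remaining == -2ms)
 * and the slice is 5ms, we request 5ms - (-2ms) = 7ms from the global
 * pool, enough to clear the local debt and still leave one full slice.
 */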
2113
2114/*
2115 * Note: This depends on the synchronization provided by sched_clock and the
2116 * fact that rq->clock snapshots this value.
2117 */
2118static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2119{
2120 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2121 struct rq *rq = rq_of(cfs_rq);
2122
2123 /* if the deadline is ahead of our clock, nothing to do */
2124 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
2125 return;
2126
2127 if (cfs_rq->runtime_remaining < 0)
2128 return;
2129
2130 /*
2131 * If the local deadline has passed we have to consider the
2132 * possibility that our sched_clock is 'fast' and the global deadline
2133 * has not truly expired.
2134 *
2135 * Fortunately we can determine whether this is the case by checking
2136 * whether the global deadline has advanced.
2137 */
2138
2139 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2140 /* extend local deadline, drift is bounded above by 2 ticks */
2141 cfs_rq->runtime_expires += TICK_NSEC;
2142 } else {
2143 /* global deadline is ahead, expiration has passed */
2144 cfs_rq->runtime_remaining = 0;
2145 }
2146}
2147
2148static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2149 unsigned long delta_exec)
2150{
2151 /* dock delta_exec before expiring quota (as it could span periods) */
2152 cfs_rq->runtime_remaining -= delta_exec;
2153 expire_cfs_rq_runtime(cfs_rq);
2154
2155 if (likely(cfs_rq->runtime_remaining > 0))
2156 return;
2157
2158 /*
2159 * if we're unable to extend our runtime we resched so that the active
2160 * hierarchy can be throttled
2161 */
2162 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2163 resched_task(rq_of(cfs_rq)->curr);
2164}
2165
2166static __always_inline
2167void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
2168{
2169 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2170 return;
2171
2172 __account_cfs_rq_runtime(cfs_rq, delta_exec);
2173}
2174
2175static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2176{
2177 return cfs_bandwidth_used() && cfs_rq->throttled;
2178}
2179
2180/* check whether cfs_rq, or any parent, is throttled */
2181static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2182{
2183 return cfs_bandwidth_used() && cfs_rq->throttle_count;
2184}
2185
2186/*
2187 * Ensure that neither of the group entities corresponding to src_cpu or
2188 * dest_cpu are members of a throttled hierarchy when performing group
2189 * load-balance operations.
2190 */
2191static inline int throttled_lb_pair(struct task_group *tg,
2192 int src_cpu, int dest_cpu)
2193{
2194 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2195
2196 src_cfs_rq = tg->cfs_rq[src_cpu];
2197 dest_cfs_rq = tg->cfs_rq[dest_cpu];
2198
2199 return throttled_hierarchy(src_cfs_rq) ||
2200 throttled_hierarchy(dest_cfs_rq);
2201}
2202
2203/* updated child weight may affect parent so we have to do this bottom up */
2204static int tg_unthrottle_up(struct task_group *tg, void *data)
2205{
2206 struct rq *rq = data;
2207 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2208
2209 cfs_rq->throttle_count--;
2210#ifdef CONFIG_SMP
2211 if (!cfs_rq->throttle_count) {
2212 /* adjust cfs_rq_clock_task() */
2213 cfs_rq->throttled_clock_task_time += rq->clock_task -
2214 cfs_rq->throttled_clock_task;
2215 }
2216#endif
2217
2218 return 0;
2219}
2220
2221static int tg_throttle_down(struct task_group *tg, void *data)
2222{
2223 struct rq *rq = data;
2224 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2225
2226 /* group is entering throttled state, stop time */
2227 if (!cfs_rq->throttle_count)
2228 cfs_rq->throttled_clock_task = rq->clock_task;
2229 cfs_rq->throttle_count++;
2230
2231 return 0;
2232}
2233
2234static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2235{
2236 struct rq *rq = rq_of(cfs_rq);
2237 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2238 struct sched_entity *se;
2239 long task_delta, dequeue = 1;
2240
2241 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2242
2243 /* freeze hierarchy runnable averages while throttled */
2244 rcu_read_lock();
2245 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2246 rcu_read_unlock();
2247
2248 task_delta = cfs_rq->h_nr_running;
2249 for_each_sched_entity(se) {
2250 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2251 /* throttled entity or throttle-on-deactivate */
2252 if (!se->on_rq)
2253 break;
2254
2255 if (dequeue)
2256 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2257 qcfs_rq->h_nr_running -= task_delta;
2258
2259 if (qcfs_rq->load.weight)
2260 dequeue = 0;
2261 }
2262
2263 if (!se)
2264 rq->nr_running -= task_delta;
2265
2266 cfs_rq->throttled = 1;
2267 cfs_rq->throttled_clock = rq->clock;
2268 raw_spin_lock(&cfs_b->lock);
2269 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2270 raw_spin_unlock(&cfs_b->lock);
2271}
2272
2273void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2274{
2275 struct rq *rq = rq_of(cfs_rq);
2276 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2277 struct sched_entity *se;
2278 int enqueue = 1;
2279 long task_delta;
2280
2281 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2282
2283 cfs_rq->throttled = 0;
2284 raw_spin_lock(&cfs_b->lock);
2285 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
2286 list_del_rcu(&cfs_rq->throttled_list);
2287 raw_spin_unlock(&cfs_b->lock);
2288
2289 update_rq_clock(rq);
2290 /* update hierarchical throttle state */
2291 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2292
2293 if (!cfs_rq->load.weight)
2294 return;
2295
2296 task_delta = cfs_rq->h_nr_running;
2297 for_each_sched_entity(se) {
2298 if (se->on_rq)
2299 enqueue = 0;
2300
2301 cfs_rq = cfs_rq_of(se);
2302 if (enqueue)
2303 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2304 cfs_rq->h_nr_running += task_delta;
2305
2306 if (cfs_rq_throttled(cfs_rq))
2307 break;
2308 }
2309
2310 if (!se)
2311 rq->nr_running += task_delta;
2312
2313 /* determine whether we need to wake up potentially idle cpu */
2314 if (rq->curr == rq->idle && rq->cfs.nr_running)
2315 resched_task(rq->curr);
2316}
2317
2318static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2319 u64 remaining, u64 expires)
2320{
2321 struct cfs_rq *cfs_rq;
2322 u64 runtime = remaining;
2323
2324 rcu_read_lock();
2325 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2326 throttled_list) {
2327 struct rq *rq = rq_of(cfs_rq);
2328
2329 raw_spin_lock(&rq->lock);
2330 if (!cfs_rq_throttled(cfs_rq))
2331 goto next;
2332
2333 runtime = -cfs_rq->runtime_remaining + 1;
2334 if (runtime > remaining)
2335 runtime = remaining;
2336 remaining -= runtime;
2337
2338 cfs_rq->runtime_remaining += runtime;
2339 cfs_rq->runtime_expires = expires;
2340
2341 /* we check whether we're throttled above */
2342 if (cfs_rq->runtime_remaining > 0)
2343 unthrottle_cfs_rq(cfs_rq);
2344
2345next:
2346 raw_spin_unlock(&rq->lock);
2347
2348 if (!remaining)
2349 break;
2350 }
2351 rcu_read_unlock();
2352
2353 return remaining;
2354}
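
/*
 * Illustration of the top-up above (example values): a throttled cfs_rq
 * sitting at runtime_remaining == -3ms receives 3ms + 1ns (capped by what
 * is left in 'remaining'), just enough to make runtime_remaining positive
 * so that the unthrottle check succeeds; whatever is not handed out stays
 * in 'remaining' for the next throttled cfs_rq on the list.
 */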
2355
2356/*
2357 * Responsible for refilling a task_group's bandwidth and unthrottling its
2358 * cfs_rqs as appropriate. If there has been no activity within the last
2359 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2360 * used to track this state.
2361 */
2362static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2363{
2364 u64 runtime, runtime_expires;
2365 int idle = 1, throttled;
2366
2367 raw_spin_lock(&cfs_b->lock);
2368 /* no need to continue the timer with no bandwidth constraint */
2369 if (cfs_b->quota == RUNTIME_INF)
2370 goto out_unlock;
2371
2372 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2373 /* idle depends on !throttled (for the case of a large deficit) */
2374 idle = cfs_b->idle && !throttled;
2375 cfs_b->nr_periods += overrun;
2376
2377 /* if we're going inactive then everything else can be deferred */
2378 if (idle)
2379 goto out_unlock;
2380
2381 __refill_cfs_bandwidth_runtime(cfs_b);
2382
2383 if (!throttled) {
2384 /* mark as potentially idle for the upcoming period */
2385 cfs_b->idle = 1;
2386 goto out_unlock;
2387 }
2388
2389 /* account preceding periods in which throttling occurred */
2390 cfs_b->nr_throttled += overrun;
2391
2392 /*
2393 * There are throttled entities so we must first use the new bandwidth
2394 * to unthrottle them before making it generally available. This
2395 * ensures that all existing debts will be paid before a new cfs_rq is
2396 * allowed to run.
2397 */
2398 runtime = cfs_b->runtime;
2399 runtime_expires = cfs_b->runtime_expires;
2400 cfs_b->runtime = 0;
2401
2402 /*
2403 * This check is repeated as we are holding onto the new bandwidth
2404 * while we unthrottle. This can potentially race with an unthrottled
2405 * group trying to acquire new bandwidth from the global pool.
2406 */
2407 while (throttled && runtime > 0) {
2408 raw_spin_unlock(&cfs_b->lock);
2409 /* we can't nest cfs_b->lock while distributing bandwidth */
2410 runtime = distribute_cfs_runtime(cfs_b, runtime,
2411 runtime_expires);
2412 raw_spin_lock(&cfs_b->lock);
2413
2414 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2415 }
2416
2417 /* return (any) remaining runtime */
2418 cfs_b->runtime = runtime;
2419 /*
2420 * While we are ensured activity in the period following an
2421 * unthrottle, this also covers the case in which the new bandwidth is
2422 * insufficient to cover the existing bandwidth deficit. (Forcing the
2423 * timer to remain active while there are any throttled entities.)
2424 */
2425 cfs_b->idle = 0;
2426out_unlock:
2427 if (idle)
2428 cfs_b->timer_active = 0;
2429 raw_spin_unlock(&cfs_b->lock);
2430
2431 return idle;
2432}
2433
2434/* a cfs_rq won't donate quota below this amount */
2435static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2436/* minimum remaining period time to redistribute slack quota */
2437static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2438/* how long we wait to gather additional slack before distributing */
2439static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2440
2441/* are we near the end of the current quota period? */
2442static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2443{
2444 struct hrtimer *refresh_timer = &cfs_b->period_timer;
2445 u64 remaining;
2446
2447 /* if the call-back is running, a quota refresh is already occurring */
2448 if (hrtimer_callback_running(refresh_timer))
2449 return 1;
2450
2451 /* is a quota refresh about to occur? */
2452 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2453 if (remaining < min_expire)
2454 return 1;
2455
2456 return 0;
2457}
2458
2459static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2460{
2461 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2462
2463 /* if there's a quota refresh soon don't bother with slack */
2464 if (runtime_refresh_within(cfs_b, min_left))
2465 return;
2466
2467 start_bandwidth_timer(&cfs_b->slack_timer,
2468 ns_to_ktime(cfs_bandwidth_slack_period));
2469}
2470
2471/* we know any runtime found here is valid as update_curr() precedes return */
2472static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2473{
2474 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2475 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2476
2477 if (slack_runtime <= 0)
2478 return;
2479
2480 raw_spin_lock(&cfs_b->lock);
2481 if (cfs_b->quota != RUNTIME_INF &&
2482 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2483 cfs_b->runtime += slack_runtime;
2484
2485 /* we are under rq->lock, defer unthrottling using a timer */
2486 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2487 !list_empty(&cfs_b->throttled_cfs_rq))
2488 start_cfs_slack_bandwidth(cfs_b);
2489 }
2490 raw_spin_unlock(&cfs_b->lock);
2491
2492 /* even if it's not valid for return we don't want to try again */
2493 cfs_rq->runtime_remaining -= slack_runtime;
2494}
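
/*
 * Example of the slack path above (illustrative values): a cfs_rq that
 * goes idle holding 4ms of local runtime keeps min_cfs_rq_runtime (1ms)
 * and returns the other 3ms to the global pool; if that leaves the pool
 * above one bandwidth slice while something is still throttled, the slack
 * timer is armed to redistribute it after cfs_bandwidth_slack_period (5ms),
 * unless a regular quota refresh is due sooner.
 */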
2495
2496static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2497{
2498 if (!cfs_bandwidth_used())
2499 return;
2500
2501 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
2502 return;
2503
2504 __return_cfs_rq_runtime(cfs_rq);
2505}
2506
2507/*
2508 * This is done with a timer (instead of inline with bandwidth return) since
2509 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2510 */
2511static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2512{
2513 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2514 u64 expires;
2515
2516 /* confirm we're still not at a refresh boundary */
2517 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2518 return;
2519
2520 raw_spin_lock(&cfs_b->lock);
2521 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2522 runtime = cfs_b->runtime;
2523 cfs_b->runtime = 0;
2524 }
2525 expires = cfs_b->runtime_expires;
2526 raw_spin_unlock(&cfs_b->lock);
2527
2528 if (!runtime)
2529 return;
2530
2531 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2532
2533 raw_spin_lock(&cfs_b->lock);
2534 if (expires == cfs_b->runtime_expires)
2535 cfs_b->runtime = runtime;
2536 raw_spin_unlock(&cfs_b->lock);
2537}
2538
2539/*
2540 * When a group wakes up we want to make sure that its quota is not already
2541 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2542 * runtime, as update_curr() throttling can not trigger until it's on-rq.
2543 */
2544static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2545{
2546 if (!cfs_bandwidth_used())
2547 return;
2548
2549 /* an active group must be handled by the update_curr()->put() path */
2550 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2551 return;
2552
2553 /* ensure the group is not already throttled */
2554 if (cfs_rq_throttled(cfs_rq))
2555 return;
2556
2557 /* update runtime allocation */
2558 account_cfs_rq_runtime(cfs_rq, 0);
2559 if (cfs_rq->runtime_remaining <= 0)
2560 throttle_cfs_rq(cfs_rq);
2561}
2562
2563/* conditionally throttle active cfs_rq's from put_prev_entity() */
2564static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2565{
2566 if (!cfs_bandwidth_used())
2567 return;
2568
2569 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2570 return;
2571
2572 /*
2573 * it's possible for a throttled entity to be forced into a running
2574 * state (e.g. set_curr_task); in this case we're finished.
2575 */
2576 if (cfs_rq_throttled(cfs_rq))
2577 return;
2578
2579 throttle_cfs_rq(cfs_rq);
2580}
2581
2582static inline u64 default_cfs_period(void);
2583static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2584static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2585
2586static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2587{
2588 struct cfs_bandwidth *cfs_b =
2589 container_of(timer, struct cfs_bandwidth, slack_timer);
2590 do_sched_cfs_slack_timer(cfs_b);
2591
2592 return HRTIMER_NORESTART;
2593}
2594
2595static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2596{
2597 struct cfs_bandwidth *cfs_b =
2598 container_of(timer, struct cfs_bandwidth, period_timer);
2599 ktime_t now;
2600 int overrun;
2601 int idle = 0;
2602
2603 for (;;) {
2604 now = hrtimer_cb_get_time(timer);
2605 overrun = hrtimer_forward(timer, now, cfs_b->period);
2606
2607 if (!overrun)
2608 break;
2609
2610 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2611 }
2612
2613 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2614}
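
/*
 * Note on the loop above: hrtimer_forward() advances the timer by whole
 * periods past 'now' and returns how many periods were skipped, so
 * 'overrun' can exceed 1 after a long idle stretch.
 * do_sched_cfs_period_timer() uses that count to account every elapsed
 * period (nr_periods, nr_throttled) rather than just the most recent one.
 */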
2615
2616void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2617{
2618 raw_spin_lock_init(&cfs_b->lock);
2619 cfs_b->runtime = 0;
2620 cfs_b->quota = RUNTIME_INF;
2621 cfs_b->period = ns_to_ktime(default_cfs_period());
2622
2623 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2624 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2625 cfs_b->period_timer.function = sched_cfs_period_timer;
2626 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2627 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2628}
2629
2630static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2631{
2632 cfs_rq->runtime_enabled = 0;
2633 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2634}
2635
2636/* requires cfs_b->lock, may release to reprogram timer */
2637void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2638{
2639 /*
2640 * The timer may be active because we're trying to set a new bandwidth
2641 * period or because we're racing with the tear-down path
2642 * (timer_active==0 becomes visible before the hrtimer call-back
2643 * terminates). In either case we ensure that it's re-programmed
2644 */
2645 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2646 raw_spin_unlock(&cfs_b->lock);
2647 /* ensure cfs_b->lock is available while we wait */
2648 hrtimer_cancel(&cfs_b->period_timer);
2649
2650 raw_spin_lock(&cfs_b->lock);
2651 /* if someone else restarted the timer then we're done */
2652 if (cfs_b->timer_active)
2653 return;
2654 }
2655
2656 cfs_b->timer_active = 1;
2657 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2658}
2659
2660static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2661{
2662 hrtimer_cancel(&cfs_b->period_timer);
2663 hrtimer_cancel(&cfs_b->slack_timer);
2664}
2665
2666static void unthrottle_offline_cfs_rqs(struct rq *rq)
2667{
2668 struct cfs_rq *cfs_rq;
2669
2670 for_each_leaf_cfs_rq(rq, cfs_rq) {
2671 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2672
2673 if (!cfs_rq->runtime_enabled)
2674 continue;
2675
2676 /*
2677 * clock_task is not advancing so we just need to make sure
2678 * there's some valid quota amount
2679 */
2680 cfs_rq->runtime_remaining = cfs_b->quota;
2681 if (cfs_rq_throttled(cfs_rq))
2682 unthrottle_cfs_rq(cfs_rq);
2683 }
2684}
2685
2686#else /* CONFIG_CFS_BANDWIDTH */
2687static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2688{
2689 return rq_of(cfs_rq)->clock_task;
2690}
2691
2692static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2693 unsigned long delta_exec) {}
2694static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2695static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2696static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2697
2698static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2699{
2700 return 0;
2701}
2702
2703static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2704{
2705 return 0;
2706}
2707
2708static inline int throttled_lb_pair(struct task_group *tg,
2709 int src_cpu, int dest_cpu)
2710{
2711 return 0;
2712}
2713
2714void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2715
2716#ifdef CONFIG_FAIR_GROUP_SCHED
2717static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2718#endif
2719
2720static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2721{
2722 return NULL;
2723}
2724static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2725static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2726
2727#endif /* CONFIG_CFS_BANDWIDTH */
2728
2729/**************************************************
2730 * CFS operations on tasks:
2731 */
2732
2733#ifdef CONFIG_SCHED_HRTICK
2734static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2735{
2736 struct sched_entity *se = &p->se;
2737 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2738
2739 WARN_ON(task_rq(p) != rq);
2740
2741 if (cfs_rq->nr_running > 1) {
2742 u64 slice = sched_slice(cfs_rq, se);
2743 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2744 s64 delta = slice - ran;
2745
2746 if (delta < 0) {
2747 if (rq->curr == p)
2748 resched_task(p);
2749 return;
2750 }
2751
2752 /*
2753 * Don't schedule slices shorter than 10000ns, that just
2754 * doesn't make sense. Rely on vruntime for fairness.
2755 */
2756 if (rq->curr != p)
2757 delta = max_t(s64, 10000LL, delta);
2758
2759 hrtick_start(rq, delta);
2760 }
2761}
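
/*
 * Example of the arming logic above (made-up numbers): a task with a 6ms
 * slice that has run 2ms since it was last scheduled gets the hrtick
 * programmed 4ms out. If the currently running task has already overrun
 * its slice (delta < 0) it is rescheduled immediately, and for a task that
 * is not currently running the delay is clamped to at least 10us.
 */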
2762
2763/*
2764 * called from enqueue/dequeue and updates the hrtick when the
2765 * current task is from our class and nr_running is low enough
2766 * to matter.
2767 */
2768static void hrtick_update(struct rq *rq)
2769{
2770 struct task_struct *curr = rq->curr;
2771
2772 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2773 return;
2774
2775 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2776 hrtick_start_fair(rq, curr);
2777}
2778#else /* !CONFIG_SCHED_HRTICK */
2779static inline void
2780hrtick_start_fair(struct rq *rq, struct task_struct *p)
2781{
2782}
2783
2784static inline void hrtick_update(struct rq *rq)
2785{
2786}
2787#endif
2788
2789/*
2790 * The enqueue_task method is called before nr_running is
2791 * increased. Here we update the fair scheduling stats and
2792 * then put the task into the rbtree:
2793 */
2794static void
2795enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2796{
2797 struct cfs_rq *cfs_rq;
2798 struct sched_entity *se = &p->se;
2799
2800 for_each_sched_entity(se) {
2801 if (se->on_rq)
2802 break;
2803 cfs_rq = cfs_rq_of(se);
2804 enqueue_entity(cfs_rq, se, flags);
2805
2806 /*
2807 * end evaluation on encountering a throttled cfs_rq
2808 *
2809 * note: in the case of encountering a throttled cfs_rq we will
2810 * post the final h_nr_running increment below.
2811 */
2812 if (cfs_rq_throttled(cfs_rq))
2813 break;
2814 cfs_rq->h_nr_running++;
2815
2816 flags = ENQUEUE_WAKEUP;
2817 }
2818
2819 for_each_sched_entity(se) {
2820 cfs_rq = cfs_rq_of(se);
2821 cfs_rq->h_nr_running++;
2822
2823 if (cfs_rq_throttled(cfs_rq))
2824 break;
2825
2826 update_cfs_shares(cfs_rq);
2827 update_entity_load_avg(se, 1);
2828 }
2829
2830 if (!se) {
2831 update_rq_runnable_avg(rq, rq->nr_running);
2832 inc_nr_running(rq);
2833 }
2834 hrtick_update(rq);
2835}
2836
2837static void set_next_buddy(struct sched_entity *se);
2838
2839/*
2840 * The dequeue_task method is called before nr_running is
2841 * decreased. We remove the task from the rbtree and
2842 * update the fair scheduling stats:
2843 */
2844static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2845{
2846 struct cfs_rq *cfs_rq;
2847 struct sched_entity *se = &p->se;
2848 int task_sleep = flags & DEQUEUE_SLEEP;
2849
2850 for_each_sched_entity(se) {
2851 cfs_rq = cfs_rq_of(se);
2852 dequeue_entity(cfs_rq, se, flags);
2853
2854 /*
2855 * end evaluation on encountering a throttled cfs_rq
2856 *
2857 * note: in the case of encountering a throttled cfs_rq we will
2858 * post the final h_nr_running decrement below.
2859 */
2860 if (cfs_rq_throttled(cfs_rq))
2861 break;
2862 cfs_rq->h_nr_running--;
2863
2864 /* Don't dequeue parent if it has other entities besides us */
2865 if (cfs_rq->load.weight) {
2866 /*
2867 * Bias pick_next to pick a task from this cfs_rq, as
2868 * p is sleeping when it is within its sched_slice.
2869 */
2870 if (task_sleep && parent_entity(se))
2871 set_next_buddy(parent_entity(se));
2872
2873 /* avoid re-evaluating load for this entity */
2874 se = parent_entity(se);
2875 break;
2876 }
2877 flags |= DEQUEUE_SLEEP;
2878 }
2879
2880 for_each_sched_entity(se) {
2881 cfs_rq = cfs_rq_of(se);
2882 cfs_rq->h_nr_running--;
2883
2884 if (cfs_rq_throttled(cfs_rq))
2885 break;
2886
2887 update_cfs_shares(cfs_rq);
2888 update_entity_load_avg(se, 1);
2889 }
2890
2891 if (!se) {
2892 dec_nr_running(rq);
2893 update_rq_runnable_avg(rq, 1);
2894 }
2895 hrtick_update(rq);
2896}
2897
2898#ifdef CONFIG_SMP
2899/* Used instead of source_load when we know the type == 0 */
2900static unsigned long weighted_cpuload(const int cpu)
2901{
2902 return cpu_rq(cpu)->load.weight;
2903}
2904
2905/*
2906 * Return a low guess at the load of a migration-source cpu weighted
2907 * according to the scheduling class and "nice" value.
2908 *
2909 * We want to under-estimate the load of migration sources, to
2910 * balance conservatively.
2911 */
2912static unsigned long source_load(int cpu, int type)
2913{
2914 struct rq *rq = cpu_rq(cpu);
2915 unsigned long total = weighted_cpuload(cpu);
2916
2917 if (type == 0 || !sched_feat(LB_BIAS))
2918 return total;
2919
2920 return min(rq->cpu_load[type-1], total);
2921}
2922
2923/*
2924 * Return a high guess at the load of a migration-target cpu weighted
2925 * according to the scheduling class and "nice" value.
2926 */
2927static unsigned long target_load(int cpu, int type)
2928{
2929 struct rq *rq = cpu_rq(cpu);
2930 unsigned long total = weighted_cpuload(cpu);
2931
2932 if (type == 0 || !sched_feat(LB_BIAS))
2933 return total;
2934
2935 return max(rq->cpu_load[type-1], total);
2936}
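
/*
 * Example of the LB_BIAS effect above (hypothetical numbers): a cpu whose
 * decayed cpu_load[type-1] is 2048 but whose instantaneous weighted load
 * is 1024 reports 1024 as a migration source (min) and 2048 as a migration
 * target (max), so the balancer under-estimates what it can take from a
 * cpu and over-estimates what it would add, erring against moving load.
 */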
2937
2938static unsigned long power_of(int cpu)
2939{
2940 return cpu_rq(cpu)->cpu_power;
2941}
2942
2943static unsigned long cpu_avg_load_per_task(int cpu)
2944{
2945 struct rq *rq = cpu_rq(cpu);
2946 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2947
2948 if (nr_running)
2949 return rq->load.weight / nr_running;
2950
2951 return 0;
2952}
2953
2954
2955static void task_waking_fair(struct task_struct *p)
2956{
2957 struct sched_entity *se = &p->se;
2958 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2959 u64 min_vruntime;
2960
2961#ifndef CONFIG_64BIT
2962 u64 min_vruntime_copy;
2963
2964 do {
2965 min_vruntime_copy = cfs_rq->min_vruntime_copy;
2966 smp_rmb();
2967 min_vruntime = cfs_rq->min_vruntime;
2968 } while (min_vruntime != min_vruntime_copy);
2969#else
2970 min_vruntime = cfs_rq->min_vruntime;
2971#endif
2972
2973 se->vruntime -= min_vruntime;
2974}
2975
2976#ifdef CONFIG_FAIR_GROUP_SCHED
2977/*
2978 * effective_load() calculates the load change as seen from the root_task_group
2979 *
2980 * Adding load to a group doesn't make a group heavier, but can cause movement
2981 * of group shares between cpus. Assuming the shares were perfectly aligned one
2982 * can calculate the shift in shares.
2983 *
2984 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2985 * on this @cpu and results in a total addition (subtraction) of @wg to the
2986 * total group weight.
2987 *
2988 * Given a runqueue weight distribution (rw_i) we can compute a shares
2989 * distribution (s_i) using:
2990 *
2991 * s_i = rw_i / \Sum rw_j (1)
2992 *
2993 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2994 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2995 * shares distribution (s_i):
2996 *
2997 * rw_i = { 2, 4, 1, 0 }
2998 * s_i = { 2/7, 4/7, 1/7, 0 }
2999 *
3000 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
3001 * task used to run on and the CPU the waker is running on), so we need to
3002 * compute the effect of waking a task on either CPU and, in case of a sync
3003 * wakeup, compute the effect of the current task going to sleep.
3004 *
3005 * So for a change of @wl to the local @cpu with an overall group weight change
3006 * of @wl we can compute the new shares distribution (s'_i) using:
3007 *
3008 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
3009 *
3010 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3011 * differences in waking a task to CPU 0. The additional task changes the
3012 * weight and shares distributions like:
3013 *
3014 * rw'_i = { 3, 4, 1, 0 }
3015 * s'_i = { 3/8, 4/8, 1/8, 0 }
3016 *
3017 * We can then compute the difference in effective weight by using:
3018 *
3019 * dw_i = S * (s'_i - s_i) (3)
3020 *
3021 * Where 'S' is the group weight as seen by its parent.
3022 *
3023 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3024 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3025 * 4/7) times the weight of the group.
3026 */
3027static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3028{
3029 struct sched_entity *se = tg->se[cpu];
3030
3031 if (!tg->parent) /* the trivial, non-cgroup case */
3032 return wl;
3033
3034 for_each_sched_entity(se) {
3035 long w, W;
3036
3037 tg = se->my_q->tg;
3038
3039 /*
3040 * W = @wg + \Sum rw_j
3041 */
3042 W = wg + calc_tg_weight(tg, se->my_q);
3043
3044 /*
3045 * w = rw_i + @wl
3046 */
3047 w = se->my_q->load.weight + wl;
3048
3049 /*
3050 * wl = S * s'_i; see (2)
3051 */
3052 if (W > 0 && w < W)
3053 wl = (w * tg->shares) / W;
3054 else
3055 wl = tg->shares;
3056
3057 /*
3058 * Per the above, wl is the new se->load.weight value; since
3059 * those are clipped to [MIN_SHARES, ...) do so now. See
3060 * calc_cfs_shares().
3061 */
3062 if (wl < MIN_SHARES)
3063 wl = MIN_SHARES;
3064
3065 /*
3066 * wl = dw_i = S * (s'_i - s_i); see (3)
3067 */
3068 wl -= se->load.weight;
3069
3070 /*
3071 * Recursively apply this logic to all parent groups to compute
3072 * the final effective load change on the root group. Since
3073 * only the @tg group gets extra weight, all parent groups can
3074 * only redistribute existing shares. @wl is the shift in shares
3075 * resulting from this level per the above.
3076 */
3077 wg = 0;
3078 }
3079
3080 return wl;
3081}
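
/*
 * Putting the doc example above into absolute units (illustrative): if the
 * parent sees the group with weight S = 1024, the 5/56 shift on CPU 0 is
 * roughly +91 weight and the -4/56 shift on CPU 1 roughly -73 weight;
 * these are the deltas wake_affine() below feeds into its comparison.
 */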
3082#else
3083
3084static inline unsigned long effective_load(struct task_group *tg, int cpu,
3085 unsigned long wl, unsigned long wg)
3086{
3087 return wl;
3088}
3089
3090#endif
3091
3092static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3093{
3094 s64 this_load, load;
3095 int idx, this_cpu, prev_cpu;
3096 unsigned long tl_per_task;
3097 struct task_group *tg;
3098 unsigned long weight;
3099 int balanced;
3100
3101 idx = sd->wake_idx;
3102 this_cpu = smp_processor_id();
3103 prev_cpu = task_cpu(p);
3104 load = source_load(prev_cpu, idx);
3105 this_load = target_load(this_cpu, idx);
3106
3107 /*
3108 * If sync wakeup then subtract the (maximum possible)
3109 * effect of the currently running task from the load
3110 * of the current CPU:
3111 */
3112 if (sync) {
3113 tg = task_group(current);
3114 weight = current->se.load.weight;
3115
3116 this_load += effective_load(tg, this_cpu, -weight, -weight);
3117 load += effective_load(tg, prev_cpu, 0, -weight);
3118 }
3119
3120 tg = task_group(p);
3121 weight = p->se.load.weight;
3122
3123 /*
3124 * In low-load situations, where prev_cpu is idle and this_cpu is idle
3125 * due to the sync cause above having dropped this_load to 0, we'll
3126 * always have an imbalance, but there's really nothing you can do
3127 * about that, so that's good too.
3128 *
3129 * Otherwise check if either cpus are near enough in load to allow this
3130 * task to be woken on this_cpu.
3131 */
3132 if (this_load > 0) {
3133 s64 this_eff_load, prev_eff_load;
3134
3135 this_eff_load = 100;
3136 this_eff_load *= power_of(prev_cpu);
3137 this_eff_load *= this_load +
3138 effective_load(tg, this_cpu, weight, weight);
3139
3140 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3141 prev_eff_load *= power_of(this_cpu);
3142 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3143
3144 balanced = this_eff_load <= prev_eff_load;
3145 } else
3146 balanced = true;
3147
3148 /*
3149 * If the currently running task will sleep within
3150 * a reasonable amount of time then attract this newly
3151 * woken task:
3152 */
3153 if (sync && balanced)
3154 return 1;
3155
3156 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
3157 tl_per_task = cpu_avg_load_per_task(this_cpu);
3158
3159 if (balanced ||
3160 (this_load <= load &&
3161 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
3162 /*
3163 * This domain has SD_WAKE_AFFINE and
3164 * p is cache cold in this domain, and
3165 * there is no bad imbalance.
3166 */
3167 schedstat_inc(sd, ttwu_move_affine);
3168 schedstat_inc(p, se.statistics.nr_wakeups_affine);
3169
3170 return 1;
3171 }
3172 return 0;
3173}
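
/*
 * Rough numeric sketch of the balance test above (made-up values): with
 * imbalance_pct == 125 the previous cpu's side is weighted by
 * 100 + (125 - 100) / 2 == 112 against 100 for this cpu, so with equal
 * cpu power the affine wakeup is accepted only while this cpu's projected
 * load is no more than about 12% above what the previous cpu would carry.
 */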
3174
3175/*
3176 * find_idlest_group finds and returns the least busy CPU group within the
3177 * domain.
3178 */
3179static struct sched_group *
3180find_idlest_group(struct sched_domain *sd, struct task_struct *p,
3181 int this_cpu, int load_idx)
3182{
3183 struct sched_group *idlest = NULL, *group = sd->groups;
3184 unsigned long min_load = ULONG_MAX, this_load = 0;
3185 int imbalance = 100 + (sd->imbalance_pct-100)/2;
3186
3187 do {
3188 unsigned long load, avg_load;
3189 int local_group;
3190 int i;
3191
3192 /* Skip over this group if it has no CPUs allowed */
3193 if (!cpumask_intersects(sched_group_cpus(group),
3194 tsk_cpus_allowed(p)))
3195 continue;
3196
3197 local_group = cpumask_test_cpu(this_cpu,
3198 sched_group_cpus(group));
3199
3200 /* Tally up the load of all CPUs in the group */
3201 avg_load = 0;
3202
3203 for_each_cpu(i, sched_group_cpus(group)) {
3204 /* Bias balancing toward cpus of our domain */
3205 if (local_group)
3206 load = source_load(i, load_idx);
3207 else
3208 load = target_load(i, load_idx);
3209
3210 avg_load += load;
3211 }
3212
3213 /* Adjust by relative CPU power of the group */
3214 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
3215
3216 if (local_group) {
3217 this_load = avg_load;
3218 } else if (avg_load < min_load) {
3219 min_load = avg_load;
3220 idlest = group;
3221 }
3222 } while (group = group->next, group != sd->groups);
3223
3224 if (!idlest || 100*this_load < imbalance*min_load)
3225 return NULL;
3226 return idlest;
3227}
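
/*
 * Example of the final cut-off above (assuming sd->imbalance_pct == 125):
 * imbalance == 112, so a remote group is returned only when
 * 100 * this_load >= 112 * min_load, i.e. when it is lighter than the
 * local group by a little more than 10%; otherwise we stay local.
 */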
3228
3229/*
3230 * find_idlest_cpu - find the idlest cpu among the cpus in group.
3231 */
3232static int
3233find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3234{
3235 unsigned long load, min_load = ULONG_MAX;
3236 int idlest = -1;
3237 int i;
3238
3239 /* Traverse only the allowed CPUs */
3240 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3241 load = weighted_cpuload(i);
3242
3243 if (load < min_load || (load == min_load && i == this_cpu)) {
3244 min_load = load;
3245 idlest = i;
3246 }
3247 }
3248
3249 return idlest;
3250}
3251
3252/*
3253 * Try and locate an idle CPU in the sched_domain.
3254 */
3255static int select_idle_sibling(struct task_struct *p, int target)
3256{
3257 int cpu = smp_processor_id();
3258 int prev_cpu = task_cpu(p);
3259 struct sched_domain *sd;
3260 struct sched_group *sg;
3261 int i;
3262
3263 /*
3264 * If the task is going to be woken-up on this cpu and if it is
3265 * already idle, then it is the right target.
3266 */
3267 if (target == cpu && idle_cpu(cpu))
3268 return cpu;
3269
3270 /*
3271 * If the task is going to be woken-up on the cpu where it previously
2272 * ran and if it is currently idle, then it is the right target.
3273 */
3274 if (target == prev_cpu && idle_cpu(prev_cpu))
3275 return prev_cpu;
3276
3277 /*
2278 * Otherwise, iterate the domains and find an eligible idle cpu.
3279 */
3280 sd = rcu_dereference(per_cpu(sd_llc, target));
3281 for_each_lower_domain(sd) {
3282 sg = sd->groups;
3283 do {
3284 if (!cpumask_intersects(sched_group_cpus(sg),
3285 tsk_cpus_allowed(p)))
3286 goto next;
3287
3288 for_each_cpu(i, sched_group_cpus(sg)) {
3289 if (!idle_cpu(i))
3290 goto next;
3291 }
3292
3293 target = cpumask_first_and(sched_group_cpus(sg),
3294 tsk_cpus_allowed(p));
3295 goto done;
3296next:
3297 sg = sg->next;
3298 } while (sg != sd->groups);
3299 }
3300done:
3301 return target;
3302}
3303
3304/*
3305 * select_task_rq_fair: balance the current task (running on cpu) in domains
3306 * that have the relevant sd_flag set. In practice, this is SD_BALANCE_WAKE,
3307 * SD_BALANCE_FORK and SD_BALANCE_EXEC.
3308 *
3309 * Balance, ie. select the least loaded group.
3310 *
3311 * Returns the target CPU number, or the same CPU if no balancing is needed.
3312 *
3313 * preempt must be disabled.
3314 */
3315static int
3316select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
3317{
3318 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3319 int cpu = smp_processor_id();
3320 int prev_cpu = task_cpu(p);
3321 int new_cpu = cpu;
3322 int want_affine = 0;
3323 int sync = wake_flags & WF_SYNC;
3324
3325 if (p->nr_cpus_allowed == 1)
3326 return prev_cpu;
3327
3328 if (sd_flag & SD_BALANCE_WAKE) {
3329 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
3330 want_affine = 1;
3331 new_cpu = prev_cpu;
3332 }
3333
3334 rcu_read_lock();
3335 for_each_domain(cpu, tmp) {
3336 if (!(tmp->flags & SD_LOAD_BALANCE))
3337 continue;
3338
3339 /*
3340 * If both cpu and prev_cpu are part of this domain,
3341 * cpu is a valid SD_WAKE_AFFINE target.
3342 */
3343 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3344 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3345 affine_sd = tmp;
3346 break;
3347 }
3348
3349 if (tmp->flags & sd_flag)
3350 sd = tmp;
3351 }
3352
3353 if (affine_sd) {
3354 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
3355 prev_cpu = cpu;
3356
3357 new_cpu = select_idle_sibling(p, prev_cpu);
3358 goto unlock;
3359 }
3360
3361 while (sd) {
3362 int load_idx = sd->forkexec_idx;
3363 struct sched_group *group;
3364 int weight;
3365
3366 if (!(sd->flags & sd_flag)) {
3367 sd = sd->child;
3368 continue;
3369 }
3370
3371 if (sd_flag & SD_BALANCE_WAKE)
3372 load_idx = sd->wake_idx;
3373
3374 group = find_idlest_group(sd, p, cpu, load_idx);
3375 if (!group) {
3376 sd = sd->child;
3377 continue;
3378 }
3379
3380 new_cpu = find_idlest_cpu(group, p, cpu);
3381 if (new_cpu == -1 || new_cpu == cpu) {
3382 /* Now try balancing at a lower domain level of cpu */
3383 sd = sd->child;
3384 continue;
3385 }
3386
3387 /* Now try balancing at a lower domain level of new_cpu */
3388 cpu = new_cpu;
3389 weight = sd->span_weight;
3390 sd = NULL;
3391 for_each_domain(cpu, tmp) {
3392 if (weight <= tmp->span_weight)
3393 break;
3394 if (tmp->flags & sd_flag)
3395 sd = tmp;
3396 }
3397 /* while loop will break here if sd == NULL */
3398 }
3399unlock:
3400 rcu_read_unlock();
3401
3402 return new_cpu;
3403}
3404
3405/*
3406 * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below may
3407 * be removed once load-tracking is useful for applications beyond shares
3408 * distribution (e.g. load-balance).
3409 */
3410#ifdef CONFIG_FAIR_GROUP_SCHED
3411/*
3412 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3413 * cfs_rq_of(p) references at time of call are still valid and identify the
3414 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3415 * other assumptions, including the state of rq->lock, should be made.
3416 */
3417static void
3418migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3419{
3420 struct sched_entity *se = &p->se;
3421 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3422
3423 /*
3424 * Load tracking: accumulate removed load so that it can be processed
3425 * when we next update owning cfs_rq under rq->lock. Tasks contribute
3426 * to blocked load iff they have a positive decay-count. It can never
3427 * be negative here since on-rq tasks have decay-count == 0.
3428 */
3429 if (se->avg.decay_count) {
3430 se->avg.decay_count = -__synchronize_entity_decay(se);
3431 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3432 }
3433}
3434#endif
3435#endif /* CONFIG_SMP */
3436
3437static unsigned long
3438wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
3439{
3440 unsigned long gran = sysctl_sched_wakeup_granularity;
3441
3442 /*
3443 * Since it is curr that is running now, convert the gran from real-time
3444 * to virtual-time in its units.
3445 *
3446 * By using 'se' instead of 'curr' we penalize light tasks, so
3447 * they get preempted easier. That is, if 'se' < 'curr' then
3448 * the resulting gran will be larger, therefore penalizing the
3449 * lighter, if otoh 'se' > 'curr' then the resulting gran will
3450 * be smaller, again penalizing the lighter task.
3451 *
3452 * This is especially important for buddies when the leftmost
3453 * task is higher priority than the buddy.
3454 */
3455 return calc_delta_fair(gran, se);
3456}
3457
3458/*
3459 * Should 'se' preempt 'curr'.
3460 *
3461 * |s1
3462 * |s2
3463 * |s3
3464 * g
3465 * |<--->|c
3466 *
3467 * w(c, s1) = -1
3468 * w(c, s2) = 0
3469 * w(c, s3) = 1
3470 *
3471 */
3472static int
3473wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3474{
3475 s64 gran, vdiff = curr->vruntime - se->vruntime;
3476
3477 if (vdiff <= 0)
3478 return -1;
3479
3480 gran = wakeup_gran(curr, se);
3481 if (vdiff > gran)
3482 return 1;
3483
3484 return 0;
3485}
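
/*
 * Reading the diagram above with numbers (illustrative): if curr's
 * vruntime leads se's by 3ms of virtual time and the scaled granularity
 * works out to 1ms, vdiff (3ms) > gran (1ms) and we return 1 (preempt
 * curr); a 0.5ms lead would return 0 (leave curr alone), and a zero or
 * negative lead returns -1.
 */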
3486
3487static void set_last_buddy(struct sched_entity *se)
3488{
3489 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3490 return;
3491
3492 for_each_sched_entity(se)
3493 cfs_rq_of(se)->last = se;
3494}
3495
3496static void set_next_buddy(struct sched_entity *se)
3497{
3498 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3499 return;
3500
3501 for_each_sched_entity(se)
3502 cfs_rq_of(se)->next = se;
3503}
3504
3505static void set_skip_buddy(struct sched_entity *se)
3506{
3507 for_each_sched_entity(se)
3508 cfs_rq_of(se)->skip = se;
3509}
3510
3511/*
3512 * Preempt the current task with a newly woken task if needed:
3513 */
3514static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3515{
3516 struct task_struct *curr = rq->curr;
3517 struct sched_entity *se = &curr->se, *pse = &p->se;
3518 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3519 int scale = cfs_rq->nr_running >= sched_nr_latency;
3520 int next_buddy_marked = 0;
3521
3522 if (unlikely(se == pse))
3523 return;
3524
3525 /*
3526 * This is possible from callers such as move_task(), in which we
3527 * unconditionally call check_preempt_curr() after an enqueue (which may have
3528 * led to a throttle). This both saves work and prevents false
3529 * next-buddy nomination below.
3530 */
3531 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3532 return;
3533
3534 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3535 set_next_buddy(pse);
3536 next_buddy_marked = 1;
3537 }
3538
3539 /*
3540 * We can come here with TIF_NEED_RESCHED already set from the new task
3541 * wakeup path.
3542 *
3543 * Note: this also catches the edge-case of curr being in a throttled
3544 * group (e.g. via set_curr_task), since update_curr() (in the
3545 * enqueue of curr) will have resulted in resched being set. This
3546 * prevents us from potentially nominating it as a false LAST_BUDDY
3547 * below.
3548 */
3549 if (test_tsk_need_resched(curr))
3550 return;
3551
3552 /* Idle tasks are by definition preempted by non-idle tasks. */
3553 if (unlikely(curr->policy == SCHED_IDLE) &&
3554 likely(p->policy != SCHED_IDLE))
3555 goto preempt;
3556
3557 /*
3558 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3559 * is driven by the tick):
3560 */
3561 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
3562 return;
3563
3564 find_matching_se(&se, &pse);
3565 update_curr(cfs_rq_of(se));
3566 BUG_ON(!pse);
3567 if (wakeup_preempt_entity(se, pse) == 1) {
3568 /*
3569 * Bias pick_next to pick the sched entity that is
3570 * triggering this preemption.
3571 */
3572 if (!next_buddy_marked)
3573 set_next_buddy(pse);
3574 goto preempt;
3575 }
3576
3577 return;
3578
3579preempt:
3580 resched_task(curr);
3581 /*
3582 * Only set the backward buddy when the current task is still
3583 * on the rq. This can happen when a wakeup gets interleaved
3584 * with schedule on the ->pre_schedule() or idle_balance()
3585 * point, either of which can drop the rq lock.
3586 *
3587 * Also, during early boot the idle thread is in the fair class,
3588 * for obvious reasons it's a bad idea to schedule back to it.
3589 */
3590 if (unlikely(!se->on_rq || curr == rq->idle))
3591 return;
3592
3593 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3594 set_last_buddy(se);
3595}
3596
3597static struct task_struct *pick_next_task_fair(struct rq *rq)
3598{
3599 struct task_struct *p;
3600 struct cfs_rq *cfs_rq = &rq->cfs;
3601 struct sched_entity *se;
3602
3603 if (!cfs_rq->nr_running)
3604 return NULL;
3605
3606 do {
3607 se = pick_next_entity(cfs_rq);
3608 set_next_entity(cfs_rq, se);
3609 cfs_rq = group_cfs_rq(se);
3610 } while (cfs_rq);
3611
3612 p = task_of(se);
3613 if (hrtick_enabled(rq))
3614 hrtick_start_fair(rq, p);
3615
3616 return p;
3617}
3618
3619/*
3620 * Account for a descheduled task:
3621 */
3622static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3623{
3624 struct sched_entity *se = &prev->se;
3625 struct cfs_rq *cfs_rq;
3626
3627 for_each_sched_entity(se) {
3628 cfs_rq = cfs_rq_of(se);
3629 put_prev_entity(cfs_rq, se);
3630 }
3631}
3632
3633/*
3634 * sched_yield() is very simple
3635 *
3636 * The magic of dealing with the ->skip buddy is in pick_next_entity.
3637 */
3638static void yield_task_fair(struct rq *rq)
3639{
3640 struct task_struct *curr = rq->curr;
3641 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3642 struct sched_entity *se = &curr->se;
3643
3644 /*
3645 * Are we the only task in the tree?
3646 */
3647 if (unlikely(rq->nr_running == 1))
3648 return;
3649
3650 clear_buddies(cfs_rq, se);
3651
3652 if (curr->policy != SCHED_BATCH) {
3653 update_rq_clock(rq);
3654 /*
3655 * Update run-time statistics of the 'current'.
3656 */
3657 update_curr(cfs_rq);
3658 /*
3659 * Tell update_rq_clock() that we've just updated,
3660 * so we don't do microscopic update in schedule()
3661 * and double the fastpath cost.
3662 */
3663 rq->skip_clock_update = 1;
3664 }
3665
3666 set_skip_buddy(se);
3667}
3668
3669static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3670{
3671 struct sched_entity *se = &p->se;
3672
3673 /* throttled hierarchies are not runnable */
3674 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3675 return false;
3676
3677 /* Tell the scheduler that we'd really like pse to run next. */
3678 set_next_buddy(se);
3679
3680 yield_task_fair(rq);
3681
3682 return true;
3683}
3684
3685#ifdef CONFIG_SMP
3686/**************************************************
3687 * Fair scheduling class load-balancing methods.
3688 *
3689 * BASICS
3690 *
3691 * The purpose of load-balancing is to achieve the same basic fairness the
3692 * per-cpu scheduler provides, namely provide a proportional amount of compute
3693 * time to each task. This is expressed in the following equation:
3694 *
3695 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3696 *
3697 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3698 * W_i,0 is defined as:
3699 *
3700 * W_i,0 = \Sum_j w_i,j (2)
3701 *
3702 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3703 * is derived from the nice value as per prio_to_weight[].
3704 *
3705 * The weight average is an exponential decay average of the instantaneous
3706 * weight:
3707 *
3708 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3709 *
3710 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3711 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3712 * can also include other factors [XXX].
3713 *
3714 * To achieve this balance we define a measure of imbalance which follows
3715 * directly from (1):
3716 *
3717 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3718 *
3719 * We then move tasks around to minimize the imbalance. In the continuous
3720 * function space it is obvious this converges, in the discrete case we get
3721 * a few fun cases generally called infeasible weight scenarios.
3722 *
3723 * [XXX expand on:
3724 * - infeasible weights;
3725 * - local vs global optima in the discrete case. ]
3726 *
3727 *
3728 * SCHED DOMAINS
3729 *
3730 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3731 * for all i,j solution, we create a tree of cpus that follows the hardware
3732 * topology where each level pairs two lower groups (or better). This results
3733 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3734 * tree to only the first of the previous level and we decrease the frequency
3735 * of load-balance at each level inv. proportional to the number of cpus in
3736 * the groups.
3737 *
3738 * This yields:
3739 *
3740 * log_2 n 1 n
3741 * \Sum { --- * --- * 2^i } = O(n) (5)
3742 * i = 0 2^i 2^i
3743 * `- size of each group
3744 * | | `- number of cpus doing load-balance
3745 * | `- freq
3746 * `- sum over all levels
3747 *
3748 * Coupled with a limit on how many tasks we can migrate every balance pass,
3749 * this makes (5) the runtime complexity of the balancer.
3750 *
3751 * An important property here is that each CPU is still (indirectly) connected
3752 * to every other cpu in at most O(log n) steps:
3753 *
3754 * The adjacency matrix of the resulting graph is given by:
3755 *
3756 * log_2 n
3757 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
3758 * k = 0
3759 *
3760 * And you'll find that:
3761 *
3762 * A^(log_2 n)_i,j != 0 for all i,j (7)
3763 *
3764 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3765 * The task movement gives a factor of O(m), giving a convergence complexity
3766 * of:
3767 *
3768 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3769 *
3770 *
3771 * WORK CONSERVING
3772 *
3773 * In order to avoid CPUs going idle while there's still work to do, new idle
3774 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3775 * tree itself instead of relying on other CPUs to bring it work.
3776 *
3777 * This adds some complexity to both (5) and (8) but it reduces the total idle
3778 * time.
3779 *
3780 * [XXX more?]
3781 *
3782 *
3783 * CGROUPS
3784 *
3785 * Cgroups make a horror show out of (2), instead of a simple sum we get:
3786 *
3787 * s_k,i
3788 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
3789 * S_k
3790 *
3791 * Where
3792 *
3793 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3794 *
3795 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3796 *
3797 * The big problem is S_k, its a global sum needed to compute a local (W_i)
3798 * property.
3799 *
3800 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3801 * rewrite all of this once again.]
3802 */
3803
3804static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3805
3806#define LBF_ALL_PINNED 0x01
3807#define LBF_NEED_BREAK 0x02
3808#define LBF_SOME_PINNED 0x04
3809
3810struct lb_env {
3811 struct sched_domain *sd;
3812
3813 struct rq *src_rq;
3814 int src_cpu;
3815
3816 int dst_cpu;
3817 struct rq *dst_rq;
3818
3819 struct cpumask *dst_grpmask;
3820 int new_dst_cpu;
3821 enum cpu_idle_type idle;
3822 long imbalance;
3823 /* The set of CPUs under consideration for load-balancing */
3824 struct cpumask *cpus;
3825
3826 unsigned int flags;
3827
3828 unsigned int loop;
3829 unsigned int loop_break;
3830 unsigned int loop_max;
3831};
3832
3833/*
3834 * move_task - move a task from one runqueue to another runqueue.
3835 * Both runqueues must be locked.
3836 */
3837static void move_task(struct task_struct *p, struct lb_env *env)
3838{
3839 deactivate_task(env->src_rq, p, 0);
3840 set_task_cpu(p, env->dst_cpu);
3841 activate_task(env->dst_rq, p, 0);
3842 check_preempt_curr(env->dst_rq, p, 0);
3843}
3844
3845/*
3846 * Is this task likely cache-hot:
3847 */
3848static int
3849task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3850{
3851 s64 delta;
3852
3853 if (p->sched_class != &fair_sched_class)
3854 return 0;
3855
3856 if (unlikely(p->policy == SCHED_IDLE))
3857 return 0;
3858
3859 /*
3860 * Buddy candidates are cache hot:
3861 */
3862 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3863 (&p->se == cfs_rq_of(&p->se)->next ||
3864 &p->se == cfs_rq_of(&p->se)->last))
3865 return 1;
3866
3867 if (sysctl_sched_migration_cost == -1)
3868 return 1;
3869 if (sysctl_sched_migration_cost == 0)
3870 return 0;
3871
3872 delta = now - p->se.exec_start;
3873
3874 return delta < (s64)sysctl_sched_migration_cost;
3875}
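
/*
 * Example (assuming the usual sysctl_sched_migration_cost of 0.5ms): a
 * task whose exec_start lies within the last 0.5ms of 'now' is treated as
 * cache-hot and, unless the domain has already failed several balance
 * attempts, is left where it is; setting the sysctl to -1 marks every
 * task hot, setting it to 0 marks none.
 */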
3876
3877/*
3878 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3879 */
3880static
3881int can_migrate_task(struct task_struct *p, struct lb_env *env)
3882{
3883 int tsk_cache_hot = 0;
3884 /*
3885 * We do not migrate tasks that are:
3886 * 1) running (obviously), or
3887 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3888 * 3) are cache-hot on their current CPU.
3889 */
3890 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3891 int new_dst_cpu;
3892
3893 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3894
3895 /*
3896 * Remember if this task can be migrated to any other cpu in
3897 * our sched_group. We may want to revisit it if we couldn't
3898 * meet load balance goals by pulling other tasks on src_cpu.
3899 *
3900 * Also avoid computing new_dst_cpu if we have already computed
3901 * one in current iteration.
3902 */
3903 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3904 return 0;
3905
3906 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3907 tsk_cpus_allowed(p));
3908 if (new_dst_cpu < nr_cpu_ids) {
3909 env->flags |= LBF_SOME_PINNED;
3910 env->new_dst_cpu = new_dst_cpu;
3911 }
3912 return 0;
3913 }
3914
3915 /* Record that we found at least one task that could run on dst_cpu */
3916 env->flags &= ~LBF_ALL_PINNED;
3917
3918 if (task_running(env->src_rq, p)) {
3919 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3920 return 0;
3921 }
3922
3923 /*
3924 * Aggressive migration if:
3925 * 1) task is cache cold, or
3926 * 2) too many balance attempts have failed.
3927 */
3928
3929 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3930 if (!tsk_cache_hot ||
3931 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3932#ifdef CONFIG_SCHEDSTATS
3933 if (tsk_cache_hot) {
3934 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3935 schedstat_inc(p, se.statistics.nr_forced_migrations);
3936 }
3937#endif
3938 return 1;
3939 }
3940
3941 if (tsk_cache_hot) {
3942 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3943 return 0;
3944 }
3945 return 1;
3946}
3947
3948/*
3949 * move_one_task tries to move exactly one task from busiest to this_rq, as
3950 * part of active balancing operations within "domain".
3951 * Returns 1 if successful and 0 otherwise.
3952 *
3953 * Called with both runqueues locked.
3954 */
3955static int move_one_task(struct lb_env *env)
3956{
3957 struct task_struct *p, *n;
3958
3959 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3960 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3961 continue;
3962
3963 if (!can_migrate_task(p, env))
3964 continue;
3965
3966 move_task(p, env);
3967 /*
3968 * Right now, this is only the second place move_task()
3969 * is called, so we can safely collect move_task()
3970 * stats here rather than inside move_task().
3971 */
3972 schedstat_inc(env->sd, lb_gained[env->idle]);
3973 return 1;
3974 }
3975 return 0;
3976}
3977
3978static unsigned long task_h_load(struct task_struct *p);
3979
3980static const unsigned int sched_nr_migrate_break = 32;
3981
3982/*
3983 * move_tasks tries to move up to imbalance weighted load from busiest to
3984 * this_rq, as part of a balancing operation within domain "sd".
3985 * Returns 1 if successful and 0 otherwise.
3986 *
3987 * Called with both runqueues locked.
3988 */
3989static int move_tasks(struct lb_env *env)
3990{
3991 struct list_head *tasks = &env->src_rq->cfs_tasks;
3992 struct task_struct *p;
3993 unsigned long load;
3994 int pulled = 0;
3995
3996 if (env->imbalance <= 0)
3997 return 0;
3998
3999 while (!list_empty(tasks)) {
4000 p = list_first_entry(tasks, struct task_struct, se.group_node);
4001
4002 env->loop++;
4003 /* We've more or less seen every task there is, call it quits */
4004 if (env->loop > env->loop_max)
4005 break;
4006
4007 /* take a breather every nr_migrate tasks */
4008 if (env->loop > env->loop_break) {
4009 env->loop_break += sched_nr_migrate_break;
4010 env->flags |= LBF_NEED_BREAK;
4011 break;
4012 }
4013
4014 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
4015 goto next;
4016
4017 load = task_h_load(p);
4018
4019 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
4020 goto next;
4021
4022 if ((load / 2) > env->imbalance)
4023 goto next;
4024
4025 if (!can_migrate_task(p, env))
4026 goto next;
4027
4028 move_task(p, env);
4029 pulled++;
4030 env->imbalance -= load;
4031
4032#ifdef CONFIG_PREEMPT
4033 /*
4034 * NEWIDLE balancing is a source of latency, so preemptible
4035 * kernels will stop after the first task is pulled to minimize
4036 * the critical section.
4037 */
4038 if (env->idle == CPU_NEWLY_IDLE)
4039 break;
4040#endif
4041
4042 /*
4043 * We only want to steal up to the prescribed amount of
4044 * weighted load.
4045 */
4046 if (env->imbalance <= 0)
4047 break;
4048
4049 continue;
4050next:
4051 list_move_tail(&p->se.group_node, tasks);
4052 }
4053
4054 /*
4055 * Right now, this is one of only two places move_task() is called,
4056 * so we can safely collect move_task() stats here rather than
4057 * inside move_task().
4058 */
4059 schedstat_add(env->sd, lb_gained[env->idle], pulled);
4060
4061 return pulled;
4062}
4063
4064#ifdef CONFIG_FAIR_GROUP_SCHED
4065/*
4066 * update tg->load_weight by folding this cpu's load_avg
4067 */
4068static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
4069{
4070 struct sched_entity *se = tg->se[cpu];
4071 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
4072
4073 /* throttled entities do not contribute to load */
4074 if (throttled_hierarchy(cfs_rq))
4075 return;
4076
4077 update_cfs_rq_blocked_load(cfs_rq, 1);
4078
4079 if (se) {
4080 update_entity_load_avg(se, 1);
4081 /*
4082 * We pivot on our runnable average having decayed to zero for
4083 * list removal. This generally implies that all our children
4084 * have also been removed (modulo rounding error or bandwidth
4085 * control); however, such cases are rare and we can fix these
4086 * at enqueue.
4087 *
4088 * TODO: fix up out-of-order children on enqueue.
4089 */
4090 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4091 list_del_leaf_cfs_rq(cfs_rq);
4092 } else {
4093 struct rq *rq = rq_of(cfs_rq);
4094 update_rq_runnable_avg(rq, rq->nr_running);
4095 }
4096}
4097
4098static void update_blocked_averages(int cpu)
4099{
4100 struct rq *rq = cpu_rq(cpu);
4101 struct cfs_rq *cfs_rq;
4102 unsigned long flags;
4103
4104 raw_spin_lock_irqsave(&rq->lock, flags);
4105 update_rq_clock(rq);
4106 /*
4107 * Iterates the task_group tree in a bottom up fashion, see
4108 * list_add_leaf_cfs_rq() for details.
4109 */
4110 for_each_leaf_cfs_rq(rq, cfs_rq) {
4111 /*
4112 * Note: We may want to consider periodically releasing
4113 * rq->lock around these updates so that creating many task
4114 * groups does not result in continually extending hold time.
4115 */
4116 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
4117 }
4118
4119 raw_spin_unlock_irqrestore(&rq->lock, flags);
4120}
4121
4122/*
4123 * Compute the cpu's hierarchical load factor for each task group.
4124 * This needs to be done in a top-down fashion because the load of a child
4125 * group is a fraction of its parent's load.
4126 */
4127static int tg_load_down(struct task_group *tg, void *data)
4128{
4129 unsigned long load;
4130 long cpu = (long)data;
4131
4132 if (!tg->parent) {
4133 load = cpu_rq(cpu)->load.weight;
4134 } else {
4135 load = tg->parent->cfs_rq[cpu]->h_load;
4136 load *= tg->se[cpu]->load.weight;
4137 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
4138 }
4139
4140 tg->cfs_rq[cpu]->h_load = load;
4141
4142 return 0;
4143}
4144
4145static void update_h_load(long cpu)
4146{
4147 struct rq *rq = cpu_rq(cpu);
4148 unsigned long now = jiffies;
4149
4150 if (rq->h_load_throttle == now)
4151 return;
4152
4153 rq->h_load_throttle = now;
4154
4155 rcu_read_lock();
4156 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
4157 rcu_read_unlock();
4158}
4159
4160static unsigned long task_h_load(struct task_struct *p)
4161{
4162 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4163 unsigned long load;
4164
4165 load = p->se.load.weight;
4166 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
4167
4168 return load;
4169}
4170#else
4171static inline void update_blocked_averages(int cpu)
4172{
4173}
4174
4175static inline void update_h_load(long cpu)
4176{
4177}
4178
4179static unsigned long task_h_load(struct task_struct *p)
4180{
4181 return p->se.load.weight;
4182}
4183#endif
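/*
 * A worked example of the hierarchical load computed above, as a
 * user-space sketch (figures and names are illustrative, not kernel
 * API). A root runqueue carries two plain tasks and one group entity,
 * each of weight 1024; the group's own cfs_rq holds two tasks of
 * weight 1024.
 */
#include <stdio.h>

int main(void)
{
        unsigned long root_weight = 3072;       /* cpu_rq()->load.weight (also the root h_load) */
        unsigned long grp_se_weight = 1024;     /* the group entity's weight on the root rq     */
        unsigned long grp_cfs_weight = 2048;    /* weight queued on the group's cfs_rq          */
        unsigned long task_weight = 1024;       /* p->se.load.weight                            */

        /* tg_load_down(): the group's share of its parent's hierarchical load. */
        unsigned long h_load = root_weight * grp_se_weight / (root_weight + 1);

        /* task_h_load(): the task's share of the group's hierarchical load. */
        unsigned long task_h = task_weight * h_load / (grp_cfs_weight + 1);

        printf("group h_load=%lu task_h_load=%lu\n", h_load, task_h);   /* 1023 511 */
        return 0;
}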
4184
4185/********** Helpers for find_busiest_group ************************/
4186/*
4187 * sd_lb_stats - Structure to store the statistics of a sched_domain
4188 * during load balancing.
4189 */
4190struct sd_lb_stats {
4191 struct sched_group *busiest; /* Busiest group in this sd */
4192 struct sched_group *this; /* Local group in this sd */
4193 unsigned long total_load; /* Total load of all groups in sd */
4194 unsigned long total_pwr; /* Total power of all groups in sd */
4195 unsigned long avg_load; /* Average load across all groups in sd */
4196
4197 /** Statistics of this group */
4198 unsigned long this_load;
4199 unsigned long this_load_per_task;
4200 unsigned long this_nr_running;
4201 unsigned long this_has_capacity;
4202 unsigned int this_idle_cpus;
4203
4204 /* Statistics of the busiest group */
4205 unsigned int busiest_idle_cpus;
4206 unsigned long max_load;
4207 unsigned long busiest_load_per_task;
4208 unsigned long busiest_nr_running;
4209 unsigned long busiest_group_capacity;
4210 unsigned long busiest_has_capacity;
4211 unsigned int busiest_group_weight;
4212
4213 int group_imb; /* Is there imbalance in this sd */
4214};
4215
4216/*
4217 * sg_lb_stats - stats of a sched_group required for load_balancing
4218 */
4219struct sg_lb_stats {
4220 unsigned long avg_load; /*Avg load across the CPUs of the group */
4221 unsigned long group_load; /* Total load over the CPUs of the group */
4222 unsigned long sum_nr_running; /* Nr tasks running in the group */
4223 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4224 unsigned long group_capacity;
4225 unsigned long idle_cpus;
4226 unsigned long group_weight;
4227 int group_imb; /* Is there an imbalance in the group ? */
4228 int group_has_capacity; /* Is there extra capacity in the group? */
4229};
4230
4231/**
4232 * get_sd_load_idx - Obtain the load index for a given sched domain.
4233 * @sd: The sched_domain whose load_idx is to be obtained.
4234 * @idle: The idle status of the CPU whose sd load_idx is obtained.
4235 */
4236static inline int get_sd_load_idx(struct sched_domain *sd,
4237 enum cpu_idle_type idle)
4238{
4239 int load_idx;
4240
4241 switch (idle) {
4242 case CPU_NOT_IDLE:
4243 load_idx = sd->busy_idx;
4244 break;
4245
4246 case CPU_NEWLY_IDLE:
4247 load_idx = sd->newidle_idx;
4248 break;
4249 default:
4250 load_idx = sd->idle_idx;
4251 break;
4252 }
4253
4254 return load_idx;
4255}
4256
4257unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4258{
4259 return SCHED_POWER_SCALE;
4260}
4261
4262unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4263{
4264 return default_scale_freq_power(sd, cpu);
4265}
4266
4267unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4268{
4269 unsigned long weight = sd->span_weight;
4270 unsigned long smt_gain = sd->smt_gain;
4271
4272 smt_gain /= weight;
4273
4274 return smt_gain;
4275}
4276
4277unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4278{
4279 return default_scale_smt_power(sd, cpu);
4280}
4281
4282unsigned long scale_rt_power(int cpu)
4283{
4284 struct rq *rq = cpu_rq(cpu);
4285 u64 total, available, age_stamp, avg;
4286
4287 /*
4288 * Since we're reading these variables without serialization make sure
4289 * we read them once before doing sanity checks on them.
4290 */
4291 age_stamp = ACCESS_ONCE(rq->age_stamp);
4292 avg = ACCESS_ONCE(rq->rt_avg);
4293
4294 total = sched_avg_period() + (rq->clock - age_stamp);
4295
4296 if (unlikely(total < avg)) {
4297 /* Ensures that power won't end up being negative */
4298 available = 0;
4299 } else {
4300 available = total - avg;
4301 }
4302
4303 if (unlikely((s64)total < SCHED_POWER_SCALE))
4304 total = SCHED_POWER_SCALE;
4305
4306 total >>= SCHED_POWER_SHIFT;
4307
4308 return div_u64(available, total);
4309}
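/*
 * A user-space sketch of the rt scaling above, assuming
 * SCHED_POWER_SCALE == 1024 (SCHED_POWER_SHIFT == 10); the names and
 * figures are illustrative, not kernel API. With a 1ms averaging
 * period and 250us of rt activity, roughly three quarters of the
 * nominal power is left for CFS.
 */
#include <stdio.h>

#define POWER_SCALE 1024ULL
#define POWER_SHIFT 10

static unsigned long scale_rt(unsigned long long total_ns,
                              unsigned long long rt_avg_ns)
{
        unsigned long long available = total_ns > rt_avg_ns ?
                                       total_ns - rt_avg_ns : 0;

        if (total_ns < POWER_SCALE)
                total_ns = POWER_SCALE;

        return (unsigned long)(available / (total_ns >> POWER_SHIFT));
}

int main(void)
{
        printf("%lu\n", scale_rt(1000000, 250000));     /* 768 of 1024 */
        return 0;
}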
4310
4311static void update_cpu_power(struct sched_domain *sd, int cpu)
4312{
4313 unsigned long weight = sd->span_weight;
4314 unsigned long power = SCHED_POWER_SCALE;
4315 struct sched_group *sdg = sd->groups;
4316
4317 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4318 if (sched_feat(ARCH_POWER))
4319 power *= arch_scale_smt_power(sd, cpu);
4320 else
4321 power *= default_scale_smt_power(sd, cpu);
4322
4323 power >>= SCHED_POWER_SHIFT;
4324 }
4325
4326 sdg->sgp->power_orig = power;
4327
4328 if (sched_feat(ARCH_POWER))
4329 power *= arch_scale_freq_power(sd, cpu);
4330 else
4331 power *= default_scale_freq_power(sd, cpu);
4332
4333 power >>= SCHED_POWER_SHIFT;
4334
4335 power *= scale_rt_power(cpu);
4336 power >>= SCHED_POWER_SHIFT;
4337
4338 if (!power)
4339 power = 1;
4340
4341 cpu_rq(cpu)->cpu_power = power;
4342 sdg->sgp->power = power;
4343}
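/*
 * Worked composition of cpu_power for the code above, as a user-space
 * sketch. It assumes SCHED_POWER_SCALE == 1024 and an smt_gain of 1178
 * for two SMT siblings; the default frequency scaling is the identity
 * and is omitted. The rt factor of 768/1024 matches the
 * scale_rt_power() example earlier. All figures are illustrative.
 */
#include <stdio.h>

int main(void)
{
        unsigned long power = 1024;             /* SCHED_POWER_SCALE         */
        unsigned long smt_gain = 1178, weight = 2;
        unsigned long rt_factor = 768;          /* from scale_rt_power()     */

        power = power * (smt_gain / weight) >> 10;      /* siblings share a core */
        power = power * rt_factor >> 10;                /* rt/irq pressure       */

        printf("cpu_power = %lu\n", power);     /* 441 of a nominal 1024 */
        return 0;
}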
4344
4345void update_group_power(struct sched_domain *sd, int cpu)
4346{
4347 struct sched_domain *child = sd->child;
4348 struct sched_group *group, *sdg = sd->groups;
4349 unsigned long power;
4350 unsigned long interval;
4351
4352 interval = msecs_to_jiffies(sd->balance_interval);
4353 interval = clamp(interval, 1UL, max_load_balance_interval);
4354 sdg->sgp->next_update = jiffies + interval;
4355
4356 if (!child) {
4357 update_cpu_power(sd, cpu);
4358 return;
4359 }
4360
4361 power = 0;
4362
4363 if (child->flags & SD_OVERLAP) {
4364 /*
4365 * SD_OVERLAP domains cannot assume that child groups
4366 * span the current group.
4367 */
4368
4369 for_each_cpu(cpu, sched_group_cpus(sdg))
4370 power += power_of(cpu);
4371 } else {
4372 /*
4373 * !SD_OVERLAP domains can assume that child groups
4374 * span the current group.
4375 */
4376
4377 group = child->groups;
4378 do {
4379 power += group->sgp->power;
4380 group = group->next;
4381 } while (group != child->groups);
4382 }
4383
4384 sdg->sgp->power_orig = sdg->sgp->power = power;
4385}
4386
4387/*
4388 * Try to fix up capacity for tiny siblings; this is needed when
4389 * things like SD_ASYM_PACKING need f_b_g to select another sibling
4390 * which on its own isn't powerful enough.
4391 *
4392 * See update_sd_pick_busiest() and check_asym_packing().
4393 */
4394static inline int
4395fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4396{
4397 /*
4398 * Only siblings can have significantly less than SCHED_POWER_SCALE
4399 */
4400 if (!(sd->flags & SD_SHARE_CPUPOWER))
4401 return 0;
4402
4403 /*
4404 * If ~90% of the cpu_power is still there, we're good.
4405 */
4406 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
4407 return 1;
4408
4409 return 0;
4410}
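/*
 * Quick check of the ~90% threshold above (user-space sketch with
 * illustrative values): power * 32 > power_orig * 29 holds exactly
 * when power exceeds roughly 90.6% of power_orig.
 */
#include <stdio.h>

int main(void)
{
        unsigned long power_orig = 589;         /* an SMT sibling's nominal power */

        printf("%d\n", 540UL * 32 > power_orig * 29);   /* 1: ~91.7%, capacity bumped to 1 */
        printf("%d\n", 500UL * 32 > power_orig * 29);   /* 0: ~84.9%, capacity stays 0     */
        return 0;
}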
4411
4412/**
4413 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4414 * @env: The load balancing environment.
4415 * @group: sched_group whose statistics are to be updated.
4416 * @load_idx: Load index of sched_domain of this_cpu for load calc.
4417 * @local_group: Does group contain this_cpu.
4418 * @balance: Should we balance.
4419 * @sgs: variable to hold the statistics for this group.
4420 */
4421static inline void update_sg_lb_stats(struct lb_env *env,
4422 struct sched_group *group, int load_idx,
4423 int local_group, int *balance, struct sg_lb_stats *sgs)
4424{
4425 unsigned long nr_running, max_nr_running, min_nr_running;
4426 unsigned long load, max_cpu_load, min_cpu_load;
4427 unsigned int balance_cpu = -1, first_idle_cpu = 0;
4428 unsigned long avg_load_per_task = 0;
4429 int i;
4430
4431 if (local_group)
4432 balance_cpu = group_balance_cpu(group);
4433
4434 /* Tally up the load of all CPUs in the group */
4435 max_cpu_load = 0;
4436 min_cpu_load = ~0UL;
4437 max_nr_running = 0;
4438 min_nr_running = ~0UL;
4439
4440 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4441 struct rq *rq = cpu_rq(i);
4442
4443 nr_running = rq->nr_running;
4444
4445 /* Bias balancing toward cpus of our domain */
4446 if (local_group) {
4447 if (idle_cpu(i) && !first_idle_cpu &&
4448 cpumask_test_cpu(i, sched_group_mask(group))) {
4449 first_idle_cpu = 1;
4450 balance_cpu = i;
4451 }
4452
4453 load = target_load(i, load_idx);
4454 } else {
4455 load = source_load(i, load_idx);
4456 if (load > max_cpu_load)
4457 max_cpu_load = load;
4458 if (min_cpu_load > load)
4459 min_cpu_load = load;
4460
4461 if (nr_running > max_nr_running)
4462 max_nr_running = nr_running;
4463 if (min_nr_running > nr_running)
4464 min_nr_running = nr_running;
4465 }
4466
4467 sgs->group_load += load;
4468 sgs->sum_nr_running += nr_running;
4469 sgs->sum_weighted_load += weighted_cpuload(i);
4470 if (idle_cpu(i))
4471 sgs->idle_cpus++;
4472 }
4473
4474 /*
4475 * The first idle cpu, or the first (busiest) cpu in this sched group,
4476 * is eligible for doing load balancing at this domain and above.
4477 * In the newly idle case, we allow all the cpus to do the newly
4478 * idle load balance.
4479 */
4480 if (local_group) {
4481 if (env->idle != CPU_NEWLY_IDLE) {
4482 if (balance_cpu != env->dst_cpu) {
4483 *balance = 0;
4484 return;
4485 }
4486 update_group_power(env->sd, env->dst_cpu);
4487 } else if (time_after_eq(jiffies, group->sgp->next_update))
4488 update_group_power(env->sd, env->dst_cpu);
4489 }
4490
4491 /* Adjust by relative CPU power of the group */
4492 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
4493
4494 /*
4495 * Consider the group unbalanced when the imbalance is larger
4496 * than the average weight of a task.
4497 *
4498 * APZ: with cgroup the avg task weight can vary wildly and
4499 * might not be a suitable number - should we keep a
4500 * normalized nr_running number somewhere that negates
4501 * the hierarchy?
4502 */
4503 if (sgs->sum_nr_running)
4504 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4505
4506 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4507 (max_nr_running - min_nr_running) > 1)
4508 sgs->group_imb = 1;
4509
4510 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4511 SCHED_POWER_SCALE);
4512 if (!sgs->group_capacity)
4513 sgs->group_capacity = fix_small_capacity(env->sd, group);
4514 sgs->group_weight = group->group_weight;
4515
4516 if (sgs->group_capacity > sgs->sum_nr_running)
4517 sgs->group_has_capacity = 1;
4518}
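/*
 * A sketch of the per-group figures computed above (user-space,
 * illustrative values, SCHED_POWER_SCALE == 1024): a two-cpu group
 * whose summed cpu_power is 1800 and whose total weighted load is 3072.
 */
#include <stdio.h>

int main(void)
{
        unsigned long group_load = 3072;
        unsigned long group_power = 1800;

        /* avg_load: load normalised by the group's effective power. */
        unsigned long avg_load = group_load * 1024 / group_power;

        /* group_capacity: DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE). */
        unsigned long capacity = (group_power + 512) / 1024;

        printf("avg_load=%lu capacity=%lu\n", avg_load, capacity);      /* 1747 2 */
        return 0;
}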
4519
4520/**
4521 * update_sd_pick_busiest - return 1 on busiest group
4522 * @env: The load balancing environment.
4523 * @sds: sched_domain statistics
4524 * @sg: sched_group candidate to be checked for being the busiest
4525 * @sgs: sched_group statistics
4526 *
4527 * Determine if @sg is a busier group than the previously selected
4528 * busiest group.
4529 */
4530static bool update_sd_pick_busiest(struct lb_env *env,
4531 struct sd_lb_stats *sds,
4532 struct sched_group *sg,
4533 struct sg_lb_stats *sgs)
4534{
4535 if (sgs->avg_load <= sds->max_load)
4536 return false;
4537
4538 if (sgs->sum_nr_running > sgs->group_capacity)
4539 return true;
4540
4541 if (sgs->group_imb)
4542 return true;
4543
4544 /*
4545 * ASYM_PACKING needs to move all the work to the lowest
4546 * numbered CPUs in the group, therefore mark all groups
4547 * higher than ourself as busy.
4548 */
4549 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4550 env->dst_cpu < group_first_cpu(sg)) {
4551 if (!sds->busiest)
4552 return true;
4553
4554 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4555 return true;
4556 }
4557
4558 return false;
4559}
4560
4561/**
4562 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4563 * @env: The load balancing environment.
4564 * @balance: Should we balance.
4565 * @sds: variable to hold the statistics for this sched_domain.
4566 */
4567static inline void update_sd_lb_stats(struct lb_env *env,
4568 int *balance, struct sd_lb_stats *sds)
4569{
4570 struct sched_domain *child = env->sd->child;
4571 struct sched_group *sg = env->sd->groups;
4572 struct sg_lb_stats sgs;
4573 int load_idx, prefer_sibling = 0;
4574
4575 if (child && child->flags & SD_PREFER_SIBLING)
4576 prefer_sibling = 1;
4577
4578 load_idx = get_sd_load_idx(env->sd, env->idle);
4579
4580 do {
4581 int local_group;
4582
4583 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4584 memset(&sgs, 0, sizeof(sgs));
4585 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
4586
4587 if (local_group && !(*balance))
4588 return;
4589
4590 sds->total_load += sgs.group_load;
4591 sds->total_pwr += sg->sgp->power;
4592
4593 /*
4594 * In case the child domain prefers tasks go to siblings
4595 * first, lower the sg capacity to one so that we'll try
4596 * and move all the excess tasks away. We lower the capacity
4597 * of a group only if the local group has the capacity to fit
4598 * these excess tasks, i.e. nr_running < group_capacity. The
4599 * extra check prevents the case where you always pull from the
4600 * heaviest group when it is already under-utilized (possible when
4601 * a single large-weight task outweighs the other tasks on the system).
4602 */
4603 if (prefer_sibling && !local_group && sds->this_has_capacity)
4604 sgs.group_capacity = min(sgs.group_capacity, 1UL);
4605
4606 if (local_group) {
4607 sds->this_load = sgs.avg_load;
4608 sds->this = sg;
4609 sds->this_nr_running = sgs.sum_nr_running;
4610 sds->this_load_per_task = sgs.sum_weighted_load;
4611 sds->this_has_capacity = sgs.group_has_capacity;
4612 sds->this_idle_cpus = sgs.idle_cpus;
4613 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4614 sds->max_load = sgs.avg_load;
4615 sds->busiest = sg;
4616 sds->busiest_nr_running = sgs.sum_nr_running;
4617 sds->busiest_idle_cpus = sgs.idle_cpus;
4618 sds->busiest_group_capacity = sgs.group_capacity;
4619 sds->busiest_load_per_task = sgs.sum_weighted_load;
4620 sds->busiest_has_capacity = sgs.group_has_capacity;
4621 sds->busiest_group_weight = sgs.group_weight;
4622 sds->group_imb = sgs.group_imb;
4623 }
4624
4625 sg = sg->next;
4626 } while (sg != env->sd->groups);
4627}
4628
4629/**
4630 * check_asym_packing - Check to see if the group is packed into the
4631 * sched domain.
4632 *
4633 * This is primarily intended to be used at the sibling level. Some
4634 * cores like POWER7 prefer to use lower numbered SMT threads. In the
4635 * case of POWER7, it can move to lower SMT modes only when higher
4636 * threads are idle. When in lower SMT modes, the threads will
4637 * perform better since they share fewer core resources. Hence when we
4638 * have idle threads, we want them to be the higher ones.
4639 *
4640 * This packing function is run on idle threads. It checks to see if
4641 * the busiest CPU in this domain (core in the P7 case) has a higher
4642 * CPU number than the packing function is being run on. Here we are
4643 * assuming a lower CPU number will be equivalent to a lower SMT thread
4644 * number.
4645 *
4646 * Returns 1 when packing is required and a task should be moved to
4647 * this CPU. The amount of the imbalance is returned in env->imbalance.
4648 *
4649 * @env: The load balancing environment.
4650 * @sds: Statistics of the sched_domain which is to be packed
4651 */
4652static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4653{
4654 int busiest_cpu;
4655
4656 if (!(env->sd->flags & SD_ASYM_PACKING))
4657 return 0;
4658
4659 if (!sds->busiest)
4660 return 0;
4661
4662 busiest_cpu = group_first_cpu(sds->busiest);
4663 if (env->dst_cpu > busiest_cpu)
4664 return 0;
4665
4666 env->imbalance = DIV_ROUND_CLOSEST(
4667 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
4668
4669 return 1;
4670}
4671
4672/**
4673 * fix_small_imbalance - Calculate the minor imbalance that exists
4674 * amongst the groups of a sched_domain, during
4675 * load balancing.
4676 * @env: The load balancing environment.
4677 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
4678 */
4679static inline
4680void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4681{
4682 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4683 unsigned int imbn = 2;
4684 unsigned long scaled_busy_load_per_task;
4685
4686 if (sds->this_nr_running) {
4687 sds->this_load_per_task /= sds->this_nr_running;
4688 if (sds->busiest_load_per_task >
4689 sds->this_load_per_task)
4690 imbn = 1;
4691 } else {
4692 sds->this_load_per_task =
4693 cpu_avg_load_per_task(env->dst_cpu);
4694 }
4695
4696 scaled_busy_load_per_task = sds->busiest_load_per_task
4697 * SCHED_POWER_SCALE;
4698 scaled_busy_load_per_task /= sds->busiest->sgp->power;
4699
4700 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4701 (scaled_busy_load_per_task * imbn)) {
4702 env->imbalance = sds->busiest_load_per_task;
4703 return;
4704 }
4705
4706 /*
4707 * OK, we don't have enough imbalance to justify moving tasks,
4708 * however we may be able to increase total CPU power used by
4709 * moving them.
4710 */
4711
4712 pwr_now += sds->busiest->sgp->power *
4713 min(sds->busiest_load_per_task, sds->max_load);
4714 pwr_now += sds->this->sgp->power *
4715 min(sds->this_load_per_task, sds->this_load);
4716 pwr_now /= SCHED_POWER_SCALE;
4717
4718 /* Amount of load we'd subtract */
4719 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4720 sds->busiest->sgp->power;
4721 if (sds->max_load > tmp)
4722 pwr_move += sds->busiest->sgp->power *
4723 min(sds->busiest_load_per_task, sds->max_load - tmp);
4724
4725 /* Amount of load we'd add */
4726 if (sds->max_load * sds->busiest->sgp->power <
4727 sds->busiest_load_per_task * SCHED_POWER_SCALE)
4728 tmp = (sds->max_load * sds->busiest->sgp->power) /
4729 sds->this->sgp->power;
4730 else
4731 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4732 sds->this->sgp->power;
4733 pwr_move += sds->this->sgp->power *
4734 min(sds->this_load_per_task, sds->this_load + tmp);
4735 pwr_move /= SCHED_POWER_SCALE;
4736
4737 /* Move if we gain throughput */
4738 if (pwr_move > pwr_now)
4739 env->imbalance = sds->busiest_load_per_task;
4740}
4741
4742/**
4743 * calculate_imbalance - Calculate the amount of imbalance present within the
4744 * groups of a given sched_domain during load balance.
4745 * @env: load balance environment
4746 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4747 */
4748static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4749{
4750 unsigned long max_pull, load_above_capacity = ~0UL;
4751
4752 sds->busiest_load_per_task /= sds->busiest_nr_running;
4753 if (sds->group_imb) {
4754 sds->busiest_load_per_task =
4755 min(sds->busiest_load_per_task, sds->avg_load);
4756 }
4757
4758 /*
4759 * In the presence of smp nice balancing, certain scenarios can have
4760 * max load less than avg load (as we skip the groups at or below
4761 * their cpu_power while calculating max_load).
4762 */
4763 if (sds->max_load < sds->avg_load) {
4764 env->imbalance = 0;
4765 return fix_small_imbalance(env, sds);
4766 }
4767
4768 if (!sds->group_imb) {
4769 /*
4770 * Don't want to pull so many tasks that a group would go idle.
4771 */
4772 load_above_capacity = (sds->busiest_nr_running -
4773 sds->busiest_group_capacity);
4774
4775 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4776
4777 load_above_capacity /= sds->busiest->sgp->power;
4778 }
4779
4780 /*
4781 * We're trying to get all the cpus to the average_load, so we don't
4782 * want to push ourselves above the average load, nor do we wish to
4783 * reduce the max loaded cpu below the average load. At the same time,
4784 * we also don't want to reduce the group load below the group capacity
4785 * (so that we can implement power-savings policies etc). Thus we look
4786 * for the minimum possible imbalance.
4787 * Be careful of negative numbers as they'll appear as very large values
4788 * with unsigned longs.
4789 */
4790 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4791
4792 /* How much load to actually move to equalise the imbalance */
4793 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4794 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4795 / SCHED_POWER_SCALE;
4796
4797 /*
4798 * if *imbalance is less than the average load per runnable task
4799 * there is no guarantee that any tasks will be moved so we'll have
4800 * a think about bumping its value to force at least one task to be
4801 * moved
4802 */
4803 if (env->imbalance < sds->busiest_load_per_task)
4804 return fix_small_imbalance(env, sds);
4805
4806}
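/*
 * Worked example of the imbalance formula above (user-space sketch
 * with illustrative figures, SCHED_POWER_SCALE == 1024, and
 * load_above_capacity assumed not to be the limiting term): the
 * busiest group runs at avg_load 1747 with power 1800, the local
 * group at 800 with power 1024, and the domain average is 1200.
 */
#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long max_load = 1747, avg_load = 1200, this_load = 800;
        unsigned long busiest_power = 1800, this_power = 1024;

        /* Pull no more than what brings busiest down to the average ... */
        unsigned long max_pull = max_load - avg_load;
        /* ... and no more than what lifts the local group up to it. */
        unsigned long imbalance = min_ul(max_pull * busiest_power,
                                         (avg_load - this_load) * this_power)
                                  / 1024;

        printf("imbalance=%lu\n", imbalance);   /* 400 */
        return 0;
}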
4807
4808/******* find_busiest_group() helpers end here *********************/
4809
4810/**
4811 * find_busiest_group - Returns the busiest group within the sched_domain
4812 * if there is an imbalance. If there isn't an imbalance, and
4813 * the user has opted for power-savings, it returns a group whose
4814 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4815 * such a group exists.
4816 *
4817 * Also calculates the amount of weighted load which should be moved
4818 * to restore balance.
4819 *
4820 * @env: The load balancing environment.
4821 * @balance: Pointer to a variable indicating if this_cpu
4822 * is the appropriate cpu to perform load balancing at this level.
4823 *
4824 * Returns: - the busiest group if imbalance exists.
4825 * - If no imbalance and user has opted for power-savings balance,
4826 * return the least loaded group whose CPUs can be
4827 * put to idle by rebalancing its tasks onto our group.
4828 */
4829static struct sched_group *
4830find_busiest_group(struct lb_env *env, int *balance)
4831{
4832 struct sd_lb_stats sds;
4833
4834 memset(&sds, 0, sizeof(sds));
4835
4836 /*
4837 * Compute the various statistics relevant for load balancing at
4838 * this level.
4839 */
4840 update_sd_lb_stats(env, balance, &sds);
4841
4842 /*
4843 * this_cpu is not the appropriate cpu to perform load balancing at
4844 * this level.
4845 */
4846 if (!(*balance))
4847 goto ret;
4848
4849 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4850 check_asym_packing(env, &sds))
4851 return sds.busiest;
4852
4853 /* There is no busy sibling group to pull tasks from */
4854 if (!sds.busiest || sds.busiest_nr_running == 0)
4855 goto out_balanced;
4856
4857 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4858
4859 /*
4860 * If the busiest group is imbalanced the below checks don't
4861 * work because they assume all things are equal, which typically
4862 * isn't true due to cpus_allowed constraints and the like.
4863 */
4864 if (sds.group_imb)
4865 goto force_balance;
4866
4867 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4868 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4869 !sds.busiest_has_capacity)
4870 goto force_balance;
4871
4872 /*
4873 * If the local group is busier than the selected busiest group,
4874 * don't try to pull any tasks.
4875 */
4876 if (sds.this_load >= sds.max_load)
4877 goto out_balanced;
4878
4879 /*
4880 * Don't pull any tasks if this group is already above the domain
4881 * average load.
4882 */
4883 if (sds.this_load >= sds.avg_load)
4884 goto out_balanced;
4885
4886 if (env->idle == CPU_IDLE) {
4887 /*
4888 * This cpu is idle. If the busiest group doesn't have
4889 * more tasks than the number of available cpus, and there
4890 * is no imbalance between this and the busiest group with
4891 * respect to idle cpus, it is balanced.
4892 */
4893 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4894 sds.busiest_nr_running <= sds.busiest_group_weight)
4895 goto out_balanced;
4896 } else {
4897 /*
4898 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4899 * imbalance_pct to be conservative.
4900 */
4901 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4902 goto out_balanced;
4903 }
4904
4905force_balance:
4906 /* Looks like there is an imbalance. Compute it */
4907 calculate_imbalance(env, &sds);
4908 return sds.busiest;
4909
4910out_balanced:
4911ret:
4912 env->imbalance = 0;
4913 return NULL;
4914}
4915
4916/*
4917 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4918 */
4919static struct rq *find_busiest_queue(struct lb_env *env,
4920 struct sched_group *group)
4921{
4922 struct rq *busiest = NULL, *rq;
4923 unsigned long max_load = 0;
4924 int i;
4925
4926 for_each_cpu(i, sched_group_cpus(group)) {
4927 unsigned long power = power_of(i);
4928 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4929 SCHED_POWER_SCALE);
4930 unsigned long wl;
4931
4932 if (!capacity)
4933 capacity = fix_small_capacity(env->sd, group);
4934
4935 if (!cpumask_test_cpu(i, env->cpus))
4936 continue;
4937
4938 rq = cpu_rq(i);
4939 wl = weighted_cpuload(i);
4940
4941 /*
4942 * When comparing with imbalance, use weighted_cpuload()
4943 * which is not scaled with the cpu power.
4944 */
4945 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4946 continue;
4947
4948 /*
4949 * For the load comparisons with the other cpus, consider
4950 * the weighted_cpuload() scaled with the cpu power, so that
4951 * the load can be moved away from the cpu that is potentially
4952 * running at a lower capacity.
4953 */
4954 wl = (wl * SCHED_POWER_SCALE) / power;
4955
4956 if (wl > max_load) {
4957 max_load = wl;
4958 busiest = rq;
4959 }
4960 }
4961
4962 return busiest;
4963}
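/*
 * A sketch of the power-scaled load comparison above (user-space,
 * illustrative figures, SCHED_POWER_SCALE == 1024): the same raw
 * weighted load of 2048 looks considerably heavier on a cpu whose
 * effective power has dropped to 600.
 */
#include <stdio.h>

int main(void)
{
        unsigned long wl = 2048;

        printf("full power: %lu\n", wl * 1024 / 1024);  /* 2048 */
        printf("power 600 : %lu\n", wl * 1024 / 600);   /* 3495 */
        return 0;
}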
4964
4965/*
4966 * Max backoff if we encounter pinned tasks. The exact value is fairly
4967 * arbitrary, so long as it is large enough.
4968 */
4969#define MAX_PINNED_INTERVAL 512
4970
4971/* Working cpumask for load_balance and load_balance_newidle. */
4972DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4973
4974static int need_active_balance(struct lb_env *env)
4975{
4976 struct sched_domain *sd = env->sd;
4977
4978 if (env->idle == CPU_NEWLY_IDLE) {
4979
4980 /*
4981 * ASYM_PACKING needs to force migrate tasks from busy but
4982 * higher numbered CPUs in order to pack all tasks in the
4983 * lowest numbered CPUs.
4984 */
4985 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4986 return 1;
4987 }
4988
4989 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4990}
4991
4992static int active_load_balance_cpu_stop(void *data);
4993
4994/*
4995 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4996 * tasks if there is an imbalance.
4997 */
4998static int load_balance(int this_cpu, struct rq *this_rq,
4999 struct sched_domain *sd, enum cpu_idle_type idle,
5000 int *balance)
5001{
5002 int ld_moved, cur_ld_moved, active_balance = 0;
5003 int lb_iterations, max_lb_iterations;
5004 struct sched_group *group;
5005 struct rq *busiest;
5006 unsigned long flags;
5007 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
5008
5009 struct lb_env env = {
5010 .sd = sd,
5011 .dst_cpu = this_cpu,
5012 .dst_rq = this_rq,
5013 .dst_grpmask = sched_group_cpus(sd->groups),
5014 .idle = idle,
5015 .loop_break = sched_nr_migrate_break,
5016 .cpus = cpus,
5017 };
5018
5019 cpumask_copy(cpus, cpu_active_mask);
5020 max_lb_iterations = cpumask_weight(env.dst_grpmask);
5021
5022 schedstat_inc(sd, lb_count[idle]);
5023
5024redo:
5025 group = find_busiest_group(&env, balance);
5026
5027 if (*balance == 0)
5028 goto out_balanced;
5029
5030 if (!group) {
5031 schedstat_inc(sd, lb_nobusyg[idle]);
5032 goto out_balanced;
5033 }
5034
5035 busiest = find_busiest_queue(&env, group);
5036 if (!busiest) {
5037 schedstat_inc(sd, lb_nobusyq[idle]);
5038 goto out_balanced;
5039 }
5040
5041 BUG_ON(busiest == env.dst_rq);
5042
5043 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5044
5045 ld_moved = 0;
5046 lb_iterations = 1;
5047 if (busiest->nr_running > 1) {
5048 /*
5049 * Attempt to move tasks. If find_busiest_group has found
5050 * an imbalance but busiest->nr_running <= 1, the group is
5051 * still unbalanced. ld_moved simply stays zero, so it is
5052 * correctly treated as an imbalance.
5053 */
5054 env.flags |= LBF_ALL_PINNED;
5055 env.src_cpu = busiest->cpu;
5056 env.src_rq = busiest;
5057 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
5058
5059 update_h_load(env.src_cpu);
5060more_balance:
5061 local_irq_save(flags);
5062 double_rq_lock(env.dst_rq, busiest);
5063
5064 /*
5065 * cur_ld_moved - load moved in current iteration
5066 * ld_moved - cumulative load moved across iterations
5067 */
5068 cur_ld_moved = move_tasks(&env);
5069 ld_moved += cur_ld_moved;
5070 double_rq_unlock(env.dst_rq, busiest);
5071 local_irq_restore(flags);
5072
5073 if (env.flags & LBF_NEED_BREAK) {
5074 env.flags &= ~LBF_NEED_BREAK;
5075 goto more_balance;
5076 }
5077
5078 /*
5079 * some other cpu did the load balance for us.
5080 */
5081 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5082 resched_cpu(env.dst_cpu);
5083
5084 /*
5085 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5086 * us and move them to an alternate dst_cpu in our sched_group
5087 * where they can run. The upper limit on how many times we
5088 * iterate on the same src_cpu depends on the number of cpus in our
5089 * sched_group.
5090 *
5091 * This changes load balance semantics a bit on who can move
5092 * load to a given_cpu. In addition to the given_cpu itself
5093 * (or an ilb_cpu acting on its behalf where given_cpu is
5094 * nohz-idle), we now have balance_cpu in a position to move
5095 * load to given_cpu. In rare situations, this may cause
5096 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
5097 * _independently_ and at the _same_ time to move some load to
5098 * given_cpu), causing excess load to be moved to given_cpu.
5099 * This, however, should not happen often in practice, and
5100 * subsequent load balance cycles should correct the
5101 * excess load moved.
5102 */
5103 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
5104 lb_iterations++ < max_lb_iterations) {
5105
5106 env.dst_rq = cpu_rq(env.new_dst_cpu);
5107 env.dst_cpu = env.new_dst_cpu;
5108 env.flags &= ~LBF_SOME_PINNED;
5109 env.loop = 0;
5110 env.loop_break = sched_nr_migrate_break;
5111 /*
5112 * Go back to "more_balance" rather than "redo" since we
5113 * need to continue with same src_cpu.
5114 */
5115 goto more_balance;
5116 }
5117
5118 /* All tasks on this runqueue were pinned by CPU affinity */
5119 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5120 cpumask_clear_cpu(cpu_of(busiest), cpus);
5121 if (!cpumask_empty(cpus)) {
5122 env.loop = 0;
5123 env.loop_break = sched_nr_migrate_break;
5124 goto redo;
5125 }
5126 goto out_balanced;
5127 }
5128 }
5129
5130 if (!ld_moved) {
5131 schedstat_inc(sd, lb_failed[idle]);
5132 /*
5133 * Increment the failure counter only on periodic balance.
5134 * We do not want newidle balance, which can be very
5135 * frequent, pollute the failure counter causing
5136 * excessive cache_hot migrations and active balances.
5137 */
5138 if (idle != CPU_NEWLY_IDLE)
5139 sd->nr_balance_failed++;
5140
5141 if (need_active_balance(&env)) {
5142 raw_spin_lock_irqsave(&busiest->lock, flags);
5143
5144 /* don't kick the active_load_balance_cpu_stop,
5145 * if the curr task on busiest cpu can't be
5146 * moved to this_cpu
5147 */
5148 if (!cpumask_test_cpu(this_cpu,
5149 tsk_cpus_allowed(busiest->curr))) {
5150 raw_spin_unlock_irqrestore(&busiest->lock,
5151 flags);
5152 env.flags |= LBF_ALL_PINNED;
5153 goto out_one_pinned;
5154 }
5155
5156 /*
5157 * ->active_balance synchronizes accesses to
5158 * ->active_balance_work. Once set, it's cleared
5159 * only after active load balance is finished.
5160 */
5161 if (!busiest->active_balance) {
5162 busiest->active_balance = 1;
5163 busiest->push_cpu = this_cpu;
5164 active_balance = 1;
5165 }
5166 raw_spin_unlock_irqrestore(&busiest->lock, flags);
5167
5168 if (active_balance) {
5169 stop_one_cpu_nowait(cpu_of(busiest),
5170 active_load_balance_cpu_stop, busiest,
5171 &busiest->active_balance_work);
5172 }
5173
5174 /*
5175 * We've kicked active balancing, reset the failure
5176 * counter.
5177 */
5178 sd->nr_balance_failed = sd->cache_nice_tries+1;
5179 }
5180 } else
5181 sd->nr_balance_failed = 0;
5182
5183 if (likely(!active_balance)) {
5184 /* We were unbalanced, so reset the balancing interval */
5185 sd->balance_interval = sd->min_interval;
5186 } else {
5187 /*
5188 * If we've begun active balancing, start to back off. This
5189 * case may not be covered by the all_pinned logic if there
5190 * is only 1 task on the busy runqueue (because we don't call
5191 * move_tasks).
5192 */
5193 if (sd->balance_interval < sd->max_interval)
5194 sd->balance_interval *= 2;
5195 }
5196
5197 goto out;
5198
5199out_balanced:
5200 schedstat_inc(sd, lb_balanced[idle]);
5201
5202 sd->nr_balance_failed = 0;
5203
5204out_one_pinned:
5205 /* tune up the balancing interval */
5206 if (((env.flags & LBF_ALL_PINNED) &&
5207 sd->balance_interval < MAX_PINNED_INTERVAL) ||
5208 (sd->balance_interval < sd->max_interval))
5209 sd->balance_interval *= 2;
5210
5211 ld_moved = 0;
5212out:
5213 return ld_moved;
5214}
5215
5216/*
5217 * idle_balance is called by schedule() if this_cpu is about to become
5218 * idle. Attempts to pull tasks from other CPUs.
5219 */
5220void idle_balance(int this_cpu, struct rq *this_rq)
5221{
5222 struct sched_domain *sd;
5223 int pulled_task = 0;
5224 unsigned long next_balance = jiffies + HZ;
5225
5226 this_rq->idle_stamp = this_rq->clock;
5227
5228 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5229 return;
5230
5231 update_rq_runnable_avg(this_rq, 1);
5232
5233 /*
5234 * Drop the rq->lock, but keep IRQ/preempt disabled.
5235 */
5236 raw_spin_unlock(&this_rq->lock);
5237
5238 update_blocked_averages(this_cpu);
5239 rcu_read_lock();
5240 for_each_domain(this_cpu, sd) {
5241 unsigned long interval;
5242 int balance = 1;
5243
5244 if (!(sd->flags & SD_LOAD_BALANCE))
5245 continue;
5246
5247 if (sd->flags & SD_BALANCE_NEWIDLE) {
5248 /* If we've pulled tasks over stop searching: */
5249 pulled_task = load_balance(this_cpu, this_rq,
5250 sd, CPU_NEWLY_IDLE, &balance);
5251 }
5252
5253 interval = msecs_to_jiffies(sd->balance_interval);
5254 if (time_after(next_balance, sd->last_balance + interval))
5255 next_balance = sd->last_balance + interval;
5256 if (pulled_task) {
5257 this_rq->idle_stamp = 0;
5258 break;
5259 }
5260 }
5261 rcu_read_unlock();
5262
5263 raw_spin_lock(&this_rq->lock);
5264
5265 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
5266 /*
5267 * We are going idle. next_balance may be set based on
5268 * a busy processor. So reset next_balance.
5269 */
5270 this_rq->next_balance = next_balance;
5271 }
5272}
5273
5274/*
5275 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
5276 * running tasks off the busiest CPU onto idle CPUs. It requires at
5277 * least 1 task to be running on each physical CPU where possible, and
5278 * avoids physical / logical imbalances.
5279 */
5280static int active_load_balance_cpu_stop(void *data)
5281{
5282 struct rq *busiest_rq = data;
5283 int busiest_cpu = cpu_of(busiest_rq);
5284 int target_cpu = busiest_rq->push_cpu;
5285 struct rq *target_rq = cpu_rq(target_cpu);
5286 struct sched_domain *sd;
5287
5288 raw_spin_lock_irq(&busiest_rq->lock);
5289
5290 /* make sure the requested cpu hasn't gone down in the meantime */
5291 if (unlikely(busiest_cpu != smp_processor_id() ||
5292 !busiest_rq->active_balance))
5293 goto out_unlock;
5294
5295 /* Is there any task to move? */
5296 if (busiest_rq->nr_running <= 1)
5297 goto out_unlock;
5298
5299 /*
5300 * This condition is "impossible"; if it occurs
5301 * we need to fix it. Originally reported by
5302 * Bjorn Helgaas on a 128-cpu setup.
5303 */
5304 BUG_ON(busiest_rq == target_rq);
5305
5306 /* move a task from busiest_rq to target_rq */
5307 double_lock_balance(busiest_rq, target_rq);
5308
5309 /* Search for an sd spanning us and the target CPU. */
5310 rcu_read_lock();
5311 for_each_domain(target_cpu, sd) {
5312 if ((sd->flags & SD_LOAD_BALANCE) &&
5313 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
5314 break;
5315 }
5316
5317 if (likely(sd)) {
5318 struct lb_env env = {
5319 .sd = sd,
5320 .dst_cpu = target_cpu,
5321 .dst_rq = target_rq,
5322 .src_cpu = busiest_rq->cpu,
5323 .src_rq = busiest_rq,
5324 .idle = CPU_IDLE,
5325 };
5326
5327 schedstat_inc(sd, alb_count);
5328
5329 if (move_one_task(&env))
5330 schedstat_inc(sd, alb_pushed);
5331 else
5332 schedstat_inc(sd, alb_failed);
5333 }
5334 rcu_read_unlock();
5335 double_unlock_balance(busiest_rq, target_rq);
5336out_unlock:
5337 busiest_rq->active_balance = 0;
5338 raw_spin_unlock_irq(&busiest_rq->lock);
5339 return 0;
5340}
5341
5342#ifdef CONFIG_NO_HZ
5343/*
5344 * idle load balancing details
5345 * - When one of the busy CPUs notices that idle rebalancing may be
5346 * needed, it kicks the idle load balancer, which then does idle
5347 * load balancing for all the idle CPUs.
5348 */
5349static struct {
5350 cpumask_var_t idle_cpus_mask;
5351 atomic_t nr_cpus;
5352 unsigned long next_balance; /* in jiffy units */
5353} nohz ____cacheline_aligned;
5354
5355static inline int find_new_ilb(int call_cpu)
5356{
5357 int ilb = cpumask_first(nohz.idle_cpus_mask);
5358
5359 if (ilb < nr_cpu_ids && idle_cpu(ilb))
5360 return ilb;
5361
5362 return nr_cpu_ids;
5363}
5364
5365/*
5366 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
5367 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
5368 * CPU (if there is one).
5369 */
5370static void nohz_balancer_kick(int cpu)
5371{
5372 int ilb_cpu;
5373
5374 nohz.next_balance++;
5375
5376 ilb_cpu = find_new_ilb(cpu);
5377
5378 if (ilb_cpu >= nr_cpu_ids)
5379 return;
5380
5381 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
5382 return;
5383 /*
5384 * Use smp_send_reschedule() instead of resched_cpu().
5385 * This way we generate a sched IPI on the target cpu which
5386 * is idle. And the softirq performing nohz idle load balance
5387 * will be run before returning from the IPI.
5388 */
5389 smp_send_reschedule(ilb_cpu);
5390 return;
5391}
5392
5393static inline void nohz_balance_exit_idle(int cpu)
5394{
5395 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5396 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5397 atomic_dec(&nohz.nr_cpus);
5398 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5399 }
5400}
5401
5402static inline void set_cpu_sd_state_busy(void)
5403{
5404 struct sched_domain *sd;
5405 int cpu = smp_processor_id();
5406
5407 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5408 return;
5409 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5410
5411 rcu_read_lock();
5412 for_each_domain(cpu, sd)
5413 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5414 rcu_read_unlock();
5415}
5416
5417void set_cpu_sd_state_idle(void)
5418{
5419 struct sched_domain *sd;
5420 int cpu = smp_processor_id();
5421
5422 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5423 return;
5424 set_bit(NOHZ_IDLE, nohz_flags(cpu));
5425
5426 rcu_read_lock();
5427 for_each_domain(cpu, sd)
5428 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5429 rcu_read_unlock();
5430}
5431
5432/*
5433 * This routine will record that the cpu is going idle with tick stopped.
5434 * This info will be used in performing idle load balancing in the future.
5435 */
5436void nohz_balance_enter_idle(int cpu)
5437{
5438 /*
5439 * If this cpu is going down, then nothing needs to be done.
5440 */
5441 if (!cpu_active(cpu))
5442 return;
5443
5444 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
5445 return;
5446
5447 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
5448 atomic_inc(&nohz.nr_cpus);
5449 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5450}
5451
5452static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
5453 unsigned long action, void *hcpu)
5454{
5455 switch (action & ~CPU_TASKS_FROZEN) {
5456 case CPU_DYING:
5457 nohz_balance_exit_idle(smp_processor_id());
5458 return NOTIFY_OK;
5459 default:
5460 return NOTIFY_DONE;
5461 }
5462}
5463#endif
5464
5465static DEFINE_SPINLOCK(balancing);
5466
5467/*
5468 * Scale the max load_balance interval with the number of CPUs in the system.
5469 * This trades load-balance latency on larger machines for less cross talk.
5470 */
5471void update_max_interval(void)
5472{
5473 max_load_balance_interval = HZ*num_online_cpus()/10;
5474}
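/*
 * Quick arithmetic for the scaling above (user-space sketch with
 * illustrative values): with HZ == 250 and 8 cpus online the cap
 * works out to 200 jiffies, i.e. at most 800ms between attempts.
 */
#include <stdio.h>

int main(void)
{
        unsigned int hz = 250, online = 8;

        printf("max interval = %u jiffies\n", hz * online / 10);        /* 200 */
        return 0;
}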
5475
5476/*
5477 * It checks each scheduling domain to see if it is due to be balanced,
5478 * and initiates a balancing operation if so.
5479 *
5480 * Balancing parameters are set up in arch_init_sched_domains.
5481 */
5482static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5483{
5484 int balance = 1;
5485 struct rq *rq = cpu_rq(cpu);
5486 unsigned long interval;
5487 struct sched_domain *sd;
5488 /* Earliest time when we have to do rebalance again */
5489 unsigned long next_balance = jiffies + 60*HZ;
5490 int update_next_balance = 0;
5491 int need_serialize;
5492
5493 update_blocked_averages(cpu);
5494
5495 rcu_read_lock();
5496 for_each_domain(cpu, sd) {
5497 if (!(sd->flags & SD_LOAD_BALANCE))
5498 continue;
5499
5500 interval = sd->balance_interval;
5501 if (idle != CPU_IDLE)
5502 interval *= sd->busy_factor;
5503
5504 /* scale ms to jiffies */
5505 interval = msecs_to_jiffies(interval);
5506 interval = clamp(interval, 1UL, max_load_balance_interval);
5507
5508 need_serialize = sd->flags & SD_SERIALIZE;
5509
5510 if (need_serialize) {
5511 if (!spin_trylock(&balancing))
5512 goto out;
5513 }
5514
5515 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5516 if (load_balance(cpu, rq, sd, idle, &balance)) {
5517 /*
5518 * We've pulled tasks over, so we're no
5519 * longer idle.
5520 */
5521 idle = CPU_NOT_IDLE;
5522 }
5523 sd->last_balance = jiffies;
5524 }
5525 if (need_serialize)
5526 spin_unlock(&balancing);
5527out:
5528 if (time_after(next_balance, sd->last_balance + interval)) {
5529 next_balance = sd->last_balance + interval;
5530 update_next_balance = 1;
5531 }
5532
5533 /*
5534 * Stop the load balance at this level. There is another
5535 * CPU in our sched group which is doing load balancing more
5536 * actively.
5537 */
5538 if (!balance)
5539 break;
5540 }
5541 rcu_read_unlock();
5542
5543 /*
5544 * next_balance will be updated only when there is a need.
5545 * When the cpu is attached to null domain for ex, it will not be
5546 * updated.
5547 */
5548 if (likely(update_next_balance))
5549 rq->next_balance = next_balance;
5550}
5551
5552#ifdef CONFIG_NO_HZ
5553/*
5554 * In the CONFIG_NO_HZ case, the idle balance kickee will do the
5555 * rebalancing for all the cpus whose scheduler ticks are stopped.
5556 */
5557static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5558{
5559 struct rq *this_rq = cpu_rq(this_cpu);
5560 struct rq *rq;
5561 int balance_cpu;
5562
5563 if (idle != CPU_IDLE ||
5564 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5565 goto end;
5566
5567 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
5568 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
5569 continue;
5570
5571 /*
5572 * If this cpu gets work to do, stop the load balancing
5573 * work being done for other cpus. Next load
5574 * balancing owner will pick it up.
5575 */
5576 if (need_resched())
5577 break;
5578
5579 rq = cpu_rq(balance_cpu);
5580
5581 raw_spin_lock_irq(&rq->lock);
5582 update_rq_clock(rq);
5583 update_idle_cpu_load(rq);
5584 raw_spin_unlock_irq(&rq->lock);
5585
5586 rebalance_domains(balance_cpu, CPU_IDLE);
5587
5588 if (time_after(this_rq->next_balance, rq->next_balance))
5589 this_rq->next_balance = rq->next_balance;
5590 }
5591 nohz.next_balance = this_rq->next_balance;
5592end:
5593 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
5594}
5595
5596/*
5597 * Current heuristic for kicking the idle load balancer in the presence
5598 * of an idle cpu is the system.
5599 * - This rq has more than one task.
5600 * - At any scheduler domain level, this cpu's scheduler group has multiple
5601 * busy cpus exceeding the group's power.
5602 * - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
5603 * domain span are idle.
5604 */
5605static inline int nohz_kick_needed(struct rq *rq, int cpu)
5606{
5607 unsigned long now = jiffies;
5608 struct sched_domain *sd;
5609
5610 if (unlikely(idle_cpu(cpu)))
5611 return 0;
5612
5613 /*
5614 * We may recently have been in ticked or tickless idle mode. At the first
5615 * busy tick after returning from idle, we will update the busy stats.
5616 */
5617 set_cpu_sd_state_busy();
5618 nohz_balance_exit_idle(cpu);
5619
5620 /*
5621 * None are in tickless mode and hence no need for NOHZ idle load
5622 * balancing.
5623 */
5624 if (likely(!atomic_read(&nohz.nr_cpus)))
5625 return 0;
5626
5627 if (time_before(now, nohz.next_balance))
5628 return 0;
5629
5630 if (rq->nr_running >= 2)
5631 goto need_kick;
5632
5633 rcu_read_lock();
5634 for_each_domain(cpu, sd) {
5635 struct sched_group *sg = sd->groups;
5636 struct sched_group_power *sgp = sg->sgp;
5637 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5638
5639 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
5640 goto need_kick_unlock;
5641
5642 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5643 && (cpumask_first_and(nohz.idle_cpus_mask,
5644 sched_domain_span(sd)) < cpu))
5645 goto need_kick_unlock;
5646
5647 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5648 break;
5649 }
5650 rcu_read_unlock();
5651 return 0;
5652
5653need_kick_unlock:
5654 rcu_read_unlock();
5655need_kick:
5656 return 1;
5657}
5658#else
5659static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5660#endif
5661
5662/*
5663 * run_rebalance_domains is triggered when needed from the scheduler tick.
5664 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
5665 */
5666static void run_rebalance_domains(struct softirq_action *h)
5667{
5668 int this_cpu = smp_processor_id();
5669 struct rq *this_rq = cpu_rq(this_cpu);
5670 enum cpu_idle_type idle = this_rq->idle_balance ?
5671 CPU_IDLE : CPU_NOT_IDLE;
5672
5673 rebalance_domains(this_cpu, idle);
5674
5675 /*
5676 * If this cpu has a pending nohz_balance_kick, then do the
5677 * balancing on behalf of the other idle cpus whose ticks are
5678 * stopped.
5679 */
5680 nohz_idle_balance(this_cpu, idle);
5681}
5682
5683static inline int on_null_domain(int cpu)
5684{
5685 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
5686}
5687
5688/*
5689 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
5690 */
5691void trigger_load_balance(struct rq *rq, int cpu)
5692{
5693 /* Don't need to rebalance while attached to NULL domain */
5694 if (time_after_eq(jiffies, rq->next_balance) &&
5695 likely(!on_null_domain(cpu)))
5696 raise_softirq(SCHED_SOFTIRQ);
5697#ifdef CONFIG_NO_HZ
5698 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5699 nohz_balancer_kick(cpu);
5700#endif
5701}
5702
5703static void rq_online_fair(struct rq *rq)
5704{
5705 update_sysctl();
5706}
5707
5708static void rq_offline_fair(struct rq *rq)
5709{
5710 update_sysctl();
5711
5712 /* Ensure any throttled groups are reachable by pick_next_task */
5713 unthrottle_offline_cfs_rqs(rq);
5714}
5715
5716#endif /* CONFIG_SMP */
5717
5718/*
5719 * scheduler tick hitting a task of our scheduling class:
5720 */
5721static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5722{
5723 struct cfs_rq *cfs_rq;
5724 struct sched_entity *se = &curr->se;
5725
5726 for_each_sched_entity(se) {
5727 cfs_rq = cfs_rq_of(se);
5728 entity_tick(cfs_rq, se, queued);
5729 }
5730
5731 if (sched_feat_numa(NUMA))
5732 task_tick_numa(rq, curr);
5733
5734 update_rq_runnable_avg(rq, 1);
5735}
5736
5737/*
5738 * called on fork with the child task as argument from the parent's context
5739 * - child not yet on the tasklist
5740 * - preemption disabled
5741 */
5742static void task_fork_fair(struct task_struct *p)
5743{
5744 struct cfs_rq *cfs_rq;
5745 struct sched_entity *se = &p->se, *curr;
5746 int this_cpu = smp_processor_id();
5747 struct rq *rq = this_rq();
5748 unsigned long flags;
5749
5750 raw_spin_lock_irqsave(&rq->lock, flags);
5751
5752 update_rq_clock(rq);
5753
5754 cfs_rq = task_cfs_rq(current);
5755 curr = cfs_rq->curr;
5756
5757 if (unlikely(task_cpu(p) != this_cpu)) {
5758 rcu_read_lock();
5759 __set_task_cpu(p, this_cpu);
5760 rcu_read_unlock();
5761 }
5762
5763 update_curr(cfs_rq);
5764
5765 if (curr)
5766 se->vruntime = curr->vruntime;
5767 place_entity(cfs_rq, se, 1);
5768
5769 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
5770 /*
5771 * Upon rescheduling, sched_class::put_prev_task() will place
5772 * 'current' within the tree based on its new key value.
5773 */
5774 swap(curr->vruntime, se->vruntime);
5775 resched_task(rq->curr);
5776 }
5777
5778 se->vruntime -= cfs_rq->min_vruntime;
5779
5780 raw_spin_unlock_irqrestore(&rq->lock, flags);
5781}
5782
5783/*
5784 * Priority of the task has changed. Check to see if we preempt
5785 * the current task.
5786 */
5787static void
5788prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
5789{
5790 if (!p->se.on_rq)
5791 return;
5792
5793 /*
5794 * Reschedule if we are currently running on this runqueue and
5795 * our priority decreased, or if we are not currently running on
5796 * this runqueue and our priority is higher than the current's
5797 */
5798 if (rq->curr == p) {
5799 if (p->prio > oldprio)
5800 resched_task(rq->curr);
5801 } else
5802 check_preempt_curr(rq, p, 0);
5803}
5804
5805static void switched_from_fair(struct rq *rq, struct task_struct *p)
5806{
5807 struct sched_entity *se = &p->se;
5808 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5809
5810 /*
5811 * Ensure the task's vruntime is normalized, so that when its
5812 * switched back to the fair class the enqueue_entity(.flags=0) will
5813 * do the right thing.
5814 *
5815 * If it was on_rq, then the dequeue_entity(.flags=0) will already
5816 * have normalized the vruntime, if it was !on_rq, then only when
5817 * the task is sleeping will it still have non-normalized vruntime.
5818 */
5819 if (!se->on_rq && p->state != TASK_RUNNING) {
5820 /*
5821 * Fix up our vruntime so that the current sleep doesn't
5822 * cause 'unlimited' sleep bonus.
5823 */
5824 place_entity(cfs_rq, se, 0);
5825 se->vruntime -= cfs_rq->min_vruntime;
5826 }
5827
5828#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5829 /*
5830 * Remove our load from contribution when we leave sched_fair
5831 * and ensure we don't carry in an old decay_count if we
5832 * switch back.
5833 */
5834 if (p->se.avg.decay_count) {
5835 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5836 __synchronize_entity_decay(&p->se);
5837 subtract_blocked_load_contrib(cfs_rq,
5838 p->se.avg.load_avg_contrib);
5839 }
5840#endif
5841}
5842
5843/*
5844 * We switched to the sched_fair class.
5845 */
5846static void switched_to_fair(struct rq *rq, struct task_struct *p)
5847{
5848 if (!p->se.on_rq)
5849 return;
5850
5851 /*
5852 * We were most likely switched from sched_rt, so
5853 * kick off the schedule if running, otherwise just see
5854 * if we can still preempt the current task.
5855 */
5856 if (rq->curr == p)
5857 resched_task(rq->curr);
5858 else
5859 check_preempt_curr(rq, p, 0);
5860}
5861
5862/* Account for a task changing its policy or group.
5863 *
5864 * This routine is mostly called to set cfs_rq->curr field when a task
5865 * migrates between groups/classes.
5866 */
5867static void set_curr_task_fair(struct rq *rq)
5868{
5869 struct sched_entity *se = &rq->curr->se;
5870
5871 for_each_sched_entity(se) {
5872 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5873
5874 set_next_entity(cfs_rq, se);
5875 /* ensure bandwidth has been allocated on our new cfs_rq */
5876 account_cfs_rq_runtime(cfs_rq, 0);
5877 }
5878}
5879
5880void init_cfs_rq(struct cfs_rq *cfs_rq)
5881{
5882 cfs_rq->tasks_timeline = RB_ROOT;
5883 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5884#ifndef CONFIG_64BIT
5885 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5886#endif
5887#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5888 atomic64_set(&cfs_rq->decay_counter, 1);
5889 atomic64_set(&cfs_rq->removed_load, 0);
5890#endif
5891}
5892
5893#ifdef CONFIG_FAIR_GROUP_SCHED
5894static void task_move_group_fair(struct task_struct *p, int on_rq)
5895{
5896 struct cfs_rq *cfs_rq;
5897 /*
5898 * If the task was not on the rq at the time of this cgroup movement
 5899 * it must have been asleep; sleeping tasks keep their ->vruntime
5900 * absolute on their old rq until wakeup (needed for the fair sleeper
5901 * bonus in place_entity()).
5902 *
5903 * If it was on the rq, we've just 'preempted' it, which does convert
5904 * ->vruntime to a relative base.
5905 *
5906 * Make sure both cases convert their relative position when migrating
5907 * to another cgroup's rq. This does somewhat interfere with the
5908 * fair sleeper stuff for the first placement, but who cares.
5909 */
5910 /*
5911 * When !on_rq, vruntime of the task has usually NOT been normalized.
5912 * But there are some cases where it has already been normalized:
5913 *
5914 * - Moving a forked child which is waiting for being woken up by
5915 * wake_up_new_task().
5916 * - Moving a task which has been woken up by try_to_wake_up() and
5917 * waiting for actually being woken up by sched_ttwu_pending().
5918 *
5919 * To prevent boost or penalty in the new cfs_rq caused by delta
5920 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5921 */
5922 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5923 on_rq = 1;
5924
5925 if (!on_rq)
5926 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5927 set_task_rq(p, task_cpu(p));
5928 if (!on_rq) {
5929 cfs_rq = cfs_rq_of(&p->se);
5930 p->se.vruntime += cfs_rq->min_vruntime;
5931#ifdef CONFIG_SMP
5932 /*
5933 * migrate_task_rq_fair() will have removed our previous
5934 * contribution, but we must synchronize for ongoing future
5935 * decay.
5936 */
5937 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5938 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5939#endif
5940 }
5941}
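
/*
 * Editor's note -- illustrative sketch, not part of this file: the
 * renormalization task_move_group_fair() performs for a task that is not
 * on the rq, reduced to its arithmetic.  'old_min' and 'new_min' are
 * hypothetical names for the min_vruntime of the source and destination
 * cfs_rq.
 */
static inline u64 cgroup_move_vruntime(u64 vruntime, u64 old_min, u64 new_min)
{
	/* keep the task's position relative to its queue's min_vruntime */
	return vruntime - old_min + new_min;
}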
5942
5943void free_fair_sched_group(struct task_group *tg)
5944{
5945 int i;
5946
5947 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5948
5949 for_each_possible_cpu(i) {
5950 if (tg->cfs_rq)
5951 kfree(tg->cfs_rq[i]);
5952 if (tg->se)
5953 kfree(tg->se[i]);
5954 }
5955
5956 kfree(tg->cfs_rq);
5957 kfree(tg->se);
5958}
5959
5960int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5961{
5962 struct cfs_rq *cfs_rq;
5963 struct sched_entity *se;
5964 int i;
5965
5966 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5967 if (!tg->cfs_rq)
5968 goto err;
5969 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5970 if (!tg->se)
5971 goto err;
5972
5973 tg->shares = NICE_0_LOAD;
5974
5975 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5976
5977 for_each_possible_cpu(i) {
5978 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5979 GFP_KERNEL, cpu_to_node(i));
5980 if (!cfs_rq)
5981 goto err;
5982
5983 se = kzalloc_node(sizeof(struct sched_entity),
5984 GFP_KERNEL, cpu_to_node(i));
5985 if (!se)
5986 goto err_free_rq;
5987
5988 init_cfs_rq(cfs_rq);
5989 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5990 }
5991
5992 return 1;
5993
5994err_free_rq:
5995 kfree(cfs_rq);
5996err:
5997 return 0;
5998}
5999
6000void unregister_fair_sched_group(struct task_group *tg, int cpu)
6001{
6002 struct rq *rq = cpu_rq(cpu);
6003 unsigned long flags;
6004
6005 /*
6006 * Only empty task groups can be destroyed; so we can speculatively
6007 * check on_list without danger of it being re-added.
6008 */
6009 if (!tg->cfs_rq[cpu]->on_list)
6010 return;
6011
6012 raw_spin_lock_irqsave(&rq->lock, flags);
6013 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
6014 raw_spin_unlock_irqrestore(&rq->lock, flags);
6015}
6016
6017void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6018 struct sched_entity *se, int cpu,
6019 struct sched_entity *parent)
6020{
6021 struct rq *rq = cpu_rq(cpu);
6022
6023 cfs_rq->tg = tg;
6024 cfs_rq->rq = rq;
6025 init_cfs_rq_runtime(cfs_rq);
6026
6027 tg->cfs_rq[cpu] = cfs_rq;
6028 tg->se[cpu] = se;
6029
6030 /* se could be NULL for root_task_group */
6031 if (!se)
6032 return;
6033
6034 if (!parent)
6035 se->cfs_rq = &rq->cfs;
6036 else
6037 se->cfs_rq = parent->my_q;
6038
6039 se->my_q = cfs_rq;
6040 update_load_set(&se->load, 0);
6041 se->parent = parent;
6042}
6043
6044static DEFINE_MUTEX(shares_mutex);
6045
6046int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6047{
6048 int i;
6049 unsigned long flags;
6050
6051 /*
6052 * We can't change the weight of the root cgroup.
6053 */
6054 if (!tg->se[0])
6055 return -EINVAL;
6056
6057 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
6058
6059 mutex_lock(&shares_mutex);
6060 if (tg->shares == shares)
6061 goto done;
6062
6063 tg->shares = shares;
6064 for_each_possible_cpu(i) {
6065 struct rq *rq = cpu_rq(i);
6066 struct sched_entity *se;
6067
6068 se = tg->se[i];
6069 /* Propagate contribution to hierarchy */
6070 raw_spin_lock_irqsave(&rq->lock, flags);
6071 for_each_sched_entity(se)
6072 update_cfs_shares(group_cfs_rq(se));
6073 raw_spin_unlock_irqrestore(&rq->lock, flags);
6074 }
6075
6076done:
6077 mutex_unlock(&shares_mutex);
6078 return 0;
6079}
6080#else /* CONFIG_FAIR_GROUP_SCHED */
6081
6082void free_fair_sched_group(struct task_group *tg) { }
6083
6084int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6085{
6086 return 1;
6087}
6088
6089void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
6090
6091#endif /* CONFIG_FAIR_GROUP_SCHED */
6092
6093
6094static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
6095{
6096 struct sched_entity *se = &task->se;
6097 unsigned int rr_interval = 0;
6098
6099 /*
6100 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
6101 * idle runqueue:
6102 */
6103 if (rq->cfs.load.weight)
6104 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6105
6106 return rr_interval;
6107}
6108
6109/*
6110 * All the scheduling class methods:
6111 */
6112const struct sched_class fair_sched_class = {
6113 .next = &idle_sched_class,
6114 .enqueue_task = enqueue_task_fair,
6115 .dequeue_task = dequeue_task_fair,
6116 .yield_task = yield_task_fair,
6117 .yield_to_task = yield_to_task_fair,
6118
6119 .check_preempt_curr = check_preempt_wakeup,
6120
6121 .pick_next_task = pick_next_task_fair,
6122 .put_prev_task = put_prev_task_fair,
6123
6124#ifdef CONFIG_SMP
6125 .select_task_rq = select_task_rq_fair,
6126#ifdef CONFIG_FAIR_GROUP_SCHED
6127 .migrate_task_rq = migrate_task_rq_fair,
6128#endif
6129 .rq_online = rq_online_fair,
6130 .rq_offline = rq_offline_fair,
6131
6132 .task_waking = task_waking_fair,
6133#endif
6134
6135 .set_curr_task = set_curr_task_fair,
6136 .task_tick = task_tick_fair,
6137 .task_fork = task_fork_fair,
6138
6139 .prio_changed = prio_changed_fair,
6140 .switched_from = switched_from_fair,
6141 .switched_to = switched_to_fair,
6142
6143 .get_rr_interval = get_rr_interval_fair,
6144
6145#ifdef CONFIG_FAIR_GROUP_SCHED
6146 .task_move_group = task_move_group_fair,
6147#endif
6148};
6149
6150#ifdef CONFIG_SCHED_DEBUG
6151void print_cfs_stats(struct seq_file *m, int cpu)
6152{
6153 struct cfs_rq *cfs_rq;
6154
6155 rcu_read_lock();
6156 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
6157 print_cfs_rq(m, cpu, cfs_rq);
6158 rcu_read_unlock();
6159}
6160#endif
6161
6162__init void init_sched_fair_class(void)
6163{
6164#ifdef CONFIG_SMP
6165 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6166
6167#ifdef CONFIG_NO_HZ
6168 nohz.next_balance = jiffies;
6169 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
6170 cpu_notifier(sched_ilb_notifier, 0);
6171#endif
6172#endif /* SMP */
6173
6174}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
deleted file mode 100644
index 1ad1d2b5395..00000000000
--- a/kernel/sched/features.h
+++ /dev/null
@@ -1,79 +0,0 @@
1/*
2 * Only give sleepers 50% of their service deficit. This allows
3 * them to run sooner, but does not allow tons of sleepers to
4 * rip the spread apart.
5 */
6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
7
8/*
9 * Place new tasks ahead so that they do not starve already running
10 * tasks
11 */
12SCHED_FEAT(START_DEBIT, true)
13
14/*
15 * Prefer to schedule the task we woke last (assuming it failed
16 * wakeup-preemption), since it's likely going to consume data we
17 * touched; this increases cache locality.
18 */
19SCHED_FEAT(NEXT_BUDDY, false)
20
21/*
22 * Prefer to schedule the task that ran last (when we did
23 * wake-preempt) as that will likely touch the same data; this increases
24 * cache locality.
25 */
26SCHED_FEAT(LAST_BUDDY, true)
27
28/*
29 * Consider buddies to be cache hot; this decreases the likelihood of a
30 * cache buddy being migrated away and increases cache locality.
31 */
32SCHED_FEAT(CACHE_HOT_BUDDY, true)
33
34/*
35 * Allow wakeup-time preemption of the current task:
36 */
37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38
39/*
40 * Use arch dependent cpu power functions
41 */
42SCHED_FEAT(ARCH_POWER, true)
43
44SCHED_FEAT(HRTICK, false)
45SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true)
47
48/*
49 * Spin-wait on mutex acquisition when the mutex owner is running on
50 * another cpu -- assumes that when the owner is running, it will soon
51 * release the lock. Decreases scheduling overhead.
52 */
53SCHED_FEAT(OWNER_SPIN, true)
54
55/*
56 * Decrement CPU power based on time not spent running tasks
57 */
58SCHED_FEAT(NONTASK_POWER, true)
59
60/*
61 * Queue remote wakeups on the target CPU and process them
62 * using the scheduler IPI. Reduces rq->lock contention/bounces.
63 */
64SCHED_FEAT(TTWU_QUEUE, true)
65
66SCHED_FEAT(FORCE_SD_OVERLAP, false)
67SCHED_FEAT(RT_RUNTIME_SHARE, true)
68SCHED_FEAT(LB_MIN, false)
69
70/*
71 * Apply the automatic NUMA scheduling policy. Enabled automatically
72 * at runtime if running on a NUMA machine. Can be controlled via
73 * numa_balancing=. Allow PTE scanning to be forced on UMA machines
74 * for debugging the core machinery.
75 */
76#ifdef CONFIG_NUMA_BALANCING
77SCHED_FEAT(NUMA, false)
78SCHED_FEAT(NUMA_FORCE, false)
79#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
deleted file mode 100644
index b6baf370cae..00000000000
--- a/kernel/sched/idle_task.c
+++ /dev/null
@@ -1,98 +0,0 @@
1#include "sched.h"
2
3/*
4 * idle-task scheduling class.
5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched/fair.c)
8 */
9
10#ifdef CONFIG_SMP
11static int
12select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
13{
14 return task_cpu(p); /* IDLE tasks are never migrated */
15}
16#endif /* CONFIG_SMP */
17/*
18 * Idle tasks are unconditionally rescheduled:
19 */
20static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
21{
22 resched_task(rq->idle);
23}
24
25static struct task_struct *pick_next_task_idle(struct rq *rq)
26{
27 schedstat_inc(rq, sched_goidle);
28 return rq->idle;
29}
30
31/*
32 * It is not legal to sleep in the idle task - print a warning
33 * message if some code attempts to do it:
34 */
35static void
36dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
37{
38 raw_spin_unlock_irq(&rq->lock);
39 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
40 dump_stack();
41 raw_spin_lock_irq(&rq->lock);
42}
43
44static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
45{
46}
47
48static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
49{
50}
51
52static void set_curr_task_idle(struct rq *rq)
53{
54}
55
56static void switched_to_idle(struct rq *rq, struct task_struct *p)
57{
58 BUG();
59}
60
61static void
62prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
63{
64 BUG();
65}
66
67static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
68{
69 return 0;
70}
71
72/*
73 * Simple, special scheduling class for the per-CPU idle tasks:
74 */
75const struct sched_class idle_sched_class = {
76 /* .next is NULL */
77 /* no enqueue/yield_task for idle tasks */
78
79 /* dequeue is not valid, we print a debug message there: */
80 .dequeue_task = dequeue_task_idle,
81
82 .check_preempt_curr = check_preempt_curr_idle,
83
84 .pick_next_task = pick_next_task_idle,
85 .put_prev_task = put_prev_task_idle,
86
87#ifdef CONFIG_SMP
88 .select_task_rq = select_task_rq_idle,
89#endif
90
91 .set_curr_task = set_curr_task_idle,
92 .task_tick = task_tick_idle,
93
94 .get_rr_interval = get_rr_interval_idle,
95
96 .prio_changed = prio_changed_idle,
97 .switched_to = switched_to_idle,
98};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
deleted file mode 100644
index 418feb01344..00000000000
--- a/kernel/sched/rt.c
+++ /dev/null
@@ -1,2094 +0,0 @@
1/*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies)
4 */
5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
34
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
92
93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
94
95static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
96{
97#ifdef CONFIG_SCHED_DEBUG
98 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
99#endif
100 return container_of(rt_se, struct task_struct, rt);
101}
102
103static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
104{
105 return rt_rq->rq;
106}
107
108static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
109{
110 return rt_se->rt_rq;
111}
112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
198#else /* CONFIG_RT_GROUP_SCHED */
199
200#define rt_entity_is_task(rt_se) (1)
201
202static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
203{
204 return container_of(rt_se, struct task_struct, rt);
205}
206
207static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
208{
209 return container_of(rt_rq, struct rq, rt);
210}
211
212static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
213{
214 struct task_struct *p = rt_task_of(rt_se);
215 struct rq *rq = task_rq(p);
216
217 return &rq->rt;
218}
219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
226#endif /* CONFIG_RT_GROUP_SCHED */
227
228#ifdef CONFIG_SMP
229
230static inline int rt_overloaded(struct rq *rq)
231{
232 return atomic_read(&rq->rd->rto_count);
233}
234
235static inline void rt_set_overload(struct rq *rq)
236{
237 if (!rq->online)
238 return;
239
240 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
241 /*
242 * Make sure the mask is visible before we set
243 * the overload count. That is checked to determine
244 * if we should look at the mask. It would be a shame
245 * if we looked at the mask, but the mask was not
246 * updated yet.
247 */
248 wmb();
249 atomic_inc(&rq->rd->rto_count);
250}
251
252static inline void rt_clear_overload(struct rq *rq)
253{
254 if (!rq->online)
255 return;
256
257 /* the order here really doesn't matter */
258 atomic_dec(&rq->rd->rto_count);
259 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
260}
261
262static void update_rt_migration(struct rt_rq *rt_rq)
263{
264 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
265 if (!rt_rq->overloaded) {
266 rt_set_overload(rq_of_rt_rq(rt_rq));
267 rt_rq->overloaded = 1;
268 }
269 } else if (rt_rq->overloaded) {
270 rt_clear_overload(rq_of_rt_rq(rt_rq));
271 rt_rq->overloaded = 0;
272 }
273}
274
275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
276{
277 struct task_struct *p;
278
279 if (!rt_entity_is_task(rt_se))
280 return;
281
282 p = rt_task_of(rt_se);
283 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
284
285 rt_rq->rt_nr_total++;
286 if (p->nr_cpus_allowed > 1)
287 rt_rq->rt_nr_migratory++;
288
289 update_rt_migration(rt_rq);
290}
291
292static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
293{
294 struct task_struct *p;
295
296 if (!rt_entity_is_task(rt_se))
297 return;
298
299 p = rt_task_of(rt_se);
300 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
301
302 rt_rq->rt_nr_total--;
303 if (p->nr_cpus_allowed > 1)
304 rt_rq->rt_nr_migratory--;
305
306 update_rt_migration(rt_rq);
307}
308
309static inline int has_pushable_tasks(struct rq *rq)
310{
311 return !plist_head_empty(&rq->rt.pushable_tasks);
312}
313
314static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
315{
316 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
317 plist_node_init(&p->pushable_tasks, p->prio);
318 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
319
320 /* Update the highest prio pushable task */
321 if (p->prio < rq->rt.highest_prio.next)
322 rq->rt.highest_prio.next = p->prio;
323}
324
325static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
326{
327 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
328
329 /* Update the new highest prio pushable task */
330 if (has_pushable_tasks(rq)) {
331 p = plist_first_entry(&rq->rt.pushable_tasks,
332 struct task_struct, pushable_tasks);
333 rq->rt.highest_prio.next = p->prio;
334 } else
335 rq->rt.highest_prio.next = MAX_RT_PRIO;
336}
337
338#else
339
340static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
341{
342}
343
344static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
345{
346}
347
348static inline
349void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
350{
351}
352
353static inline
354void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
355{
356}
357
358#endif /* CONFIG_SMP */
359
360static inline int on_rt_rq(struct sched_rt_entity *rt_se)
361{
362 return !list_empty(&rt_se->run_list);
363}
364
365#ifdef CONFIG_RT_GROUP_SCHED
366
367static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
368{
369 if (!rt_rq->tg)
370 return RUNTIME_INF;
371
372 return rt_rq->rt_runtime;
373}
374
375static inline u64 sched_rt_period(struct rt_rq *rt_rq)
376{
377 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
378}
379
380typedef struct task_group *rt_rq_iter_t;
381
382static inline struct task_group *next_task_group(struct task_group *tg)
383{
384 do {
385 tg = list_entry_rcu(tg->list.next,
386 typeof(struct task_group), list);
387 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
388
389 if (&tg->list == &task_groups)
390 tg = NULL;
391
392 return tg;
393}
394
395#define for_each_rt_rq(rt_rq, iter, rq) \
396 for (iter = container_of(&task_groups, typeof(*iter), list); \
397 (iter = next_task_group(iter)) && \
398 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
399
400static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
401{
402 list_add_rcu(&rt_rq->leaf_rt_rq_list,
403 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
404}
405
406static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
407{
408 list_del_rcu(&rt_rq->leaf_rt_rq_list);
409}
410
411#define for_each_leaf_rt_rq(rt_rq, rq) \
412 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
413
414#define for_each_sched_rt_entity(rt_se) \
415 for (; rt_se; rt_se = rt_se->parent)
416
417static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
418{
419 return rt_se->my_q;
420}
421
422static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
423static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
424
425static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
426{
427 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
428 struct sched_rt_entity *rt_se;
429
430 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
431
432 rt_se = rt_rq->tg->rt_se[cpu];
433
434 if (rt_rq->rt_nr_running) {
435 if (rt_se && !on_rt_rq(rt_se))
436 enqueue_rt_entity(rt_se, false);
437 if (rt_rq->highest_prio.curr < curr->prio)
438 resched_task(curr);
439 }
440}
441
442static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
443{
444 struct sched_rt_entity *rt_se;
445 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
446
447 rt_se = rt_rq->tg->rt_se[cpu];
448
449 if (rt_se && on_rt_rq(rt_se))
450 dequeue_rt_entity(rt_se);
451}
452
453static inline int rt_rq_throttled(struct rt_rq *rt_rq)
454{
455 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
456}
457
458static int rt_se_boosted(struct sched_rt_entity *rt_se)
459{
460 struct rt_rq *rt_rq = group_rt_rq(rt_se);
461 struct task_struct *p;
462
463 if (rt_rq)
464 return !!rt_rq->rt_nr_boosted;
465
466 p = rt_task_of(rt_se);
467 return p->prio != p->normal_prio;
468}
469
470#ifdef CONFIG_SMP
471static inline const struct cpumask *sched_rt_period_mask(void)
472{
473 return cpu_rq(smp_processor_id())->rd->span;
474}
475#else
476static inline const struct cpumask *sched_rt_period_mask(void)
477{
478 return cpu_online_mask;
479}
480#endif
481
482static inline
483struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
484{
485 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
486}
487
488static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
489{
490 return &rt_rq->tg->rt_bandwidth;
491}
492
493#else /* !CONFIG_RT_GROUP_SCHED */
494
495static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
496{
497 return rt_rq->rt_runtime;
498}
499
500static inline u64 sched_rt_period(struct rt_rq *rt_rq)
501{
502 return ktime_to_ns(def_rt_bandwidth.rt_period);
503}
504
505typedef struct rt_rq *rt_rq_iter_t;
506
507#define for_each_rt_rq(rt_rq, iter, rq) \
508 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
509
510static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
511{
512}
513
514static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
515{
516}
517
518#define for_each_leaf_rt_rq(rt_rq, rq) \
519 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
520
521#define for_each_sched_rt_entity(rt_se) \
522 for (; rt_se; rt_se = NULL)
523
524static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
525{
526 return NULL;
527}
528
529static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
530{
531 if (rt_rq->rt_nr_running)
532 resched_task(rq_of_rt_rq(rt_rq)->curr);
533}
534
535static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
536{
537}
538
539static inline int rt_rq_throttled(struct rt_rq *rt_rq)
540{
541 return rt_rq->rt_throttled;
542}
543
544static inline const struct cpumask *sched_rt_period_mask(void)
545{
546 return cpu_online_mask;
547}
548
549static inline
550struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
551{
552 return &cpu_rq(cpu)->rt;
553}
554
555static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
556{
557 return &def_rt_bandwidth;
558}
559
560#endif /* CONFIG_RT_GROUP_SCHED */
561
562#ifdef CONFIG_SMP
563/*
564 * We ran out of runtime, see if we can borrow some from our neighbours.
565 */
566static int do_balance_runtime(struct rt_rq *rt_rq)
567{
568 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
569 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
570 int i, weight, more = 0;
571 u64 rt_period;
572
573 weight = cpumask_weight(rd->span);
574
575 raw_spin_lock(&rt_b->rt_runtime_lock);
576 rt_period = ktime_to_ns(rt_b->rt_period);
577 for_each_cpu(i, rd->span) {
578 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
579 s64 diff;
580
581 if (iter == rt_rq)
582 continue;
583
584 raw_spin_lock(&iter->rt_runtime_lock);
585 /*
586 * Either all rqs have inf runtime and there's nothing to steal
587 * or __disable_runtime() below sets a specific rq to inf to
588 * indicate it's been disabled and disallow stealing.
589 */
590 if (iter->rt_runtime == RUNTIME_INF)
591 goto next;
592
593 /*
594 * From runqueues with spare time, take 1/n part of their
595 * spare time, but no more than our period.
596 */
597 diff = iter->rt_runtime - iter->rt_time;
598 if (diff > 0) {
599 diff = div_u64((u64)diff, weight);
600 if (rt_rq->rt_runtime + diff > rt_period)
601 diff = rt_period - rt_rq->rt_runtime;
602 iter->rt_runtime -= diff;
603 rt_rq->rt_runtime += diff;
604 more = 1;
605 if (rt_rq->rt_runtime == rt_period) {
606 raw_spin_unlock(&iter->rt_runtime_lock);
607 break;
608 }
609 }
610next:
611 raw_spin_unlock(&iter->rt_runtime_lock);
612 }
613 raw_spin_unlock(&rt_b->rt_runtime_lock);
614
615 return more;
616}
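
/*
 * Editor's note -- illustrative sketch, not part of this file: the
 * "take 1/n of a neighbour's spare time, capped at our period" step of
 * do_balance_runtime(), in isolation.  Parameter names are hypothetical;
 * the caller is assumed to have checked that 'spare' is positive.
 */
static inline s64 borrow_from_neighbour(s64 spare, int weight,
					u64 have, u64 rt_period)
{
	s64 diff = div_u64((u64)spare, weight);	/* 1/n of the spare time */

	if (have + diff > rt_period)		/* never exceed our own period */
		diff = rt_period - have;
	return diff;
}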
617
618/*
619 * Ensure this RQ takes back all the runtime it lent to its neighbours.
620 */
621static void __disable_runtime(struct rq *rq)
622{
623 struct root_domain *rd = rq->rd;
624 rt_rq_iter_t iter;
625 struct rt_rq *rt_rq;
626
627 if (unlikely(!scheduler_running))
628 return;
629
630 for_each_rt_rq(rt_rq, iter, rq) {
631 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
632 s64 want;
633 int i;
634
635 raw_spin_lock(&rt_b->rt_runtime_lock);
636 raw_spin_lock(&rt_rq->rt_runtime_lock);
637 /*
638 * Either we're all inf and nobody needs to borrow, or we're
639 * already disabled and thus have nothing to do, or we have
640 * exactly the right amount of runtime to take out.
641 */
642 if (rt_rq->rt_runtime == RUNTIME_INF ||
643 rt_rq->rt_runtime == rt_b->rt_runtime)
644 goto balanced;
645 raw_spin_unlock(&rt_rq->rt_runtime_lock);
646
647 /*
648 * Calculate the difference between what we started out with
649 * and what we currently have; that's the amount of runtime
650 * we lent out and now have to reclaim.
651 */
652 want = rt_b->rt_runtime - rt_rq->rt_runtime;
653
654 /*
655 * Greedy reclaim, take back as much as we can.
656 */
657 for_each_cpu(i, rd->span) {
658 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
659 s64 diff;
660
661 /*
662 * Can't reclaim from ourselves or disabled runqueues.
663 */
664 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
665 continue;
666
667 raw_spin_lock(&iter->rt_runtime_lock);
668 if (want > 0) {
669 diff = min_t(s64, iter->rt_runtime, want);
670 iter->rt_runtime -= diff;
671 want -= diff;
672 } else {
673 iter->rt_runtime -= want;
674 want -= want;
675 }
676 raw_spin_unlock(&iter->rt_runtime_lock);
677
678 if (!want)
679 break;
680 }
681
682 raw_spin_lock(&rt_rq->rt_runtime_lock);
683 /*
684 * We cannot be left wanting - that would mean some runtime
685 * leaked out of the system.
686 */
687 BUG_ON(want);
688balanced:
689 /*
690 * Disable all the borrow logic by pretending we have inf
691 * runtime - in which case borrowing doesn't make sense.
692 */
693 rt_rq->rt_runtime = RUNTIME_INF;
694 rt_rq->rt_throttled = 0;
695 raw_spin_unlock(&rt_rq->rt_runtime_lock);
696 raw_spin_unlock(&rt_b->rt_runtime_lock);
697 }
698}
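
/*
 * Editor's note -- illustrative, not part of this file: if this rt_rq
 * started the period with 950ms of runtime but currently holds only
 * 750ms, then want = 200ms; the greedy loop above might take, say, 150ms
 * from one neighbour and the remaining 50ms from the next, after which
 * want reaches zero and the BUG_ON() cannot trigger.
 */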
699
700static void disable_runtime(struct rq *rq)
701{
702 unsigned long flags;
703
704 raw_spin_lock_irqsave(&rq->lock, flags);
705 __disable_runtime(rq);
706 raw_spin_unlock_irqrestore(&rq->lock, flags);
707}
708
709static void __enable_runtime(struct rq *rq)
710{
711 rt_rq_iter_t iter;
712 struct rt_rq *rt_rq;
713
714 if (unlikely(!scheduler_running))
715 return;
716
717 /*
718 * Reset each runqueue's bandwidth settings
719 */
720 for_each_rt_rq(rt_rq, iter, rq) {
721 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
722
723 raw_spin_lock(&rt_b->rt_runtime_lock);
724 raw_spin_lock(&rt_rq->rt_runtime_lock);
725 rt_rq->rt_runtime = rt_b->rt_runtime;
726 rt_rq->rt_time = 0;
727 rt_rq->rt_throttled = 0;
728 raw_spin_unlock(&rt_rq->rt_runtime_lock);
729 raw_spin_unlock(&rt_b->rt_runtime_lock);
730 }
731}
732
733static void enable_runtime(struct rq *rq)
734{
735 unsigned long flags;
736
737 raw_spin_lock_irqsave(&rq->lock, flags);
738 __enable_runtime(rq);
739 raw_spin_unlock_irqrestore(&rq->lock, flags);
740}
741
742int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
743{
744 int cpu = (int)(long)hcpu;
745
746 switch (action) {
747 case CPU_DOWN_PREPARE:
748 case CPU_DOWN_PREPARE_FROZEN:
749 disable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 case CPU_DOWN_FAILED:
753 case CPU_DOWN_FAILED_FROZEN:
754 case CPU_ONLINE:
755 case CPU_ONLINE_FROZEN:
756 enable_runtime(cpu_rq(cpu));
757 return NOTIFY_OK;
758
759 default:
760 return NOTIFY_DONE;
761 }
762}
763
764static int balance_runtime(struct rt_rq *rt_rq)
765{
766 int more = 0;
767
768 if (!sched_feat(RT_RUNTIME_SHARE))
769 return more;
770
771 if (rt_rq->rt_time > rt_rq->rt_runtime) {
772 raw_spin_unlock(&rt_rq->rt_runtime_lock);
773 more = do_balance_runtime(rt_rq);
774 raw_spin_lock(&rt_rq->rt_runtime_lock);
775 }
776
777 return more;
778}
779#else /* !CONFIG_SMP */
780static inline int balance_runtime(struct rt_rq *rt_rq)
781{
782 return 0;
783}
784#endif /* CONFIG_SMP */
785
786static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
787{
788 int i, idle = 1, throttled = 0;
789 const struct cpumask *span;
790
791 span = sched_rt_period_mask();
792#ifdef CONFIG_RT_GROUP_SCHED
793 /*
794 * FIXME: isolated CPUs should really leave the root task group,
795 * whether they are isolcpus or were isolated via cpusets, lest
796 * the timer run on a CPU which does not service all runqueues,
797 * potentially leaving other CPUs indefinitely throttled. If
798 * isolation is really required, the user will turn the throttle
799 * off to kill the perturbations it causes anyway. Meanwhile,
800 * this maintains functionality for boot and/or troubleshooting.
801 */
802 if (rt_b == &root_task_group.rt_bandwidth)
803 span = cpu_online_mask;
804#endif
805 for_each_cpu(i, span) {
806 int enqueue = 0;
807 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
808 struct rq *rq = rq_of_rt_rq(rt_rq);
809
810 raw_spin_lock(&rq->lock);
811 if (rt_rq->rt_time) {
812 u64 runtime;
813
814 raw_spin_lock(&rt_rq->rt_runtime_lock);
815 if (rt_rq->rt_throttled)
816 balance_runtime(rt_rq);
817 runtime = rt_rq->rt_runtime;
818 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
819 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
820 rt_rq->rt_throttled = 0;
821 enqueue = 1;
822
823 /*
824 * Force a clock update if the CPU was idle,
825 * lest wakeup -> unthrottle time accumulate.
826 */
827 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
828 rq->skip_clock_update = -1;
829 }
830 if (rt_rq->rt_time || rt_rq->rt_nr_running)
831 idle = 0;
832 raw_spin_unlock(&rt_rq->rt_runtime_lock);
833 } else if (rt_rq->rt_nr_running) {
834 idle = 0;
835 if (!rt_rq_throttled(rt_rq))
836 enqueue = 1;
837 }
838 if (rt_rq->rt_throttled)
839 throttled = 1;
840
841 if (enqueue)
842 sched_rt_rq_enqueue(rt_rq);
843 raw_spin_unlock(&rq->lock);
844 }
845
846 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
847 return 1;
848
849 return idle;
850}
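
/*
 * Editor's note -- illustrative, not part of this file: each expiry of
 * the period timer above forgives up to overrun * runtime worth of
 * accumulated rt_time per rt_rq, so a throttled queue is unthrottled as
 * soon as its rt_time drops back below its (possibly re-balanced) runtime.
 */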
851
852static inline int rt_se_prio(struct sched_rt_entity *rt_se)
853{
854#ifdef CONFIG_RT_GROUP_SCHED
855 struct rt_rq *rt_rq = group_rt_rq(rt_se);
856
857 if (rt_rq)
858 return rt_rq->highest_prio.curr;
859#endif
860
861 return rt_task_of(rt_se)->prio;
862}
863
864static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
865{
866 u64 runtime = sched_rt_runtime(rt_rq);
867
868 if (rt_rq->rt_throttled)
869 return rt_rq_throttled(rt_rq);
870
871 if (runtime >= sched_rt_period(rt_rq))
872 return 0;
873
874 balance_runtime(rt_rq);
875 runtime = sched_rt_runtime(rt_rq);
876 if (runtime == RUNTIME_INF)
877 return 0;
878
879 if (rt_rq->rt_time > runtime) {
880 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
881
882 /*
883 * Don't actually throttle groups that have no runtime assigned
884 * but accrue some time due to boosting.
885 */
886 if (likely(rt_b->rt_runtime)) {
887 static bool once = false;
888
889 rt_rq->rt_throttled = 1;
890
891 if (!once) {
892 once = true;
893 printk_sched("sched: RT throttling activated\n");
894 }
895 } else {
896 /*
897 * In case we did anyway, make it go away;
898 * replenishment is a joke, since it will replenish us
899 * with exactly 0 ns.
900 */
901 rt_rq->rt_time = 0;
902 }
903
904 if (rt_rq_throttled(rt_rq)) {
905 sched_rt_rq_dequeue(rt_rq);
906 return 1;
907 }
908 }
909
910 return 0;
911}
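
/*
 * Editor's note -- illustrative, not part of this file: with the default
 * sched_rt_runtime_us = 950000 and sched_rt_period_us = 1000000, an
 * rt_rq that accumulates more than 950ms of rt_time within a 1s period
 * (and cannot borrow enough from its neighbours) is throttled here until
 * do_sched_rt_period_timer() replenishes it.
 */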
912
913/*
914 * Update the current task's runtime statistics. Skip current tasks that
915 * are not in our scheduling class.
916 */
917static void update_curr_rt(struct rq *rq)
918{
919 struct task_struct *curr = rq->curr;
920 struct sched_rt_entity *rt_se = &curr->rt;
921 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
922 u64 delta_exec;
923
924 if (curr->sched_class != &rt_sched_class)
925 return;
926
927 delta_exec = rq->clock_task - curr->se.exec_start;
928 if (unlikely((s64)delta_exec < 0))
929 delta_exec = 0;
930
931 schedstat_set(curr->se.statistics.exec_max,
932 max(curr->se.statistics.exec_max, delta_exec));
933
934 curr->se.sum_exec_runtime += delta_exec;
935 account_group_exec_runtime(curr, delta_exec);
936
937 curr->se.exec_start = rq->clock_task;
938 cpuacct_charge(curr, delta_exec);
939
940 sched_rt_avg_update(rq, delta_exec);
941
942 if (!rt_bandwidth_enabled())
943 return;
944
945 for_each_sched_rt_entity(rt_se) {
946 rt_rq = rt_rq_of_se(rt_se);
947
948 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
949 raw_spin_lock(&rt_rq->rt_runtime_lock);
950 rt_rq->rt_time += delta_exec;
951 if (sched_rt_runtime_exceeded(rt_rq))
952 resched_task(curr);
953 raw_spin_unlock(&rt_rq->rt_runtime_lock);
954 }
955 }
956}
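
/*
 * Editor's note -- illustrative sketch, not part of this file: the core
 * accounting step of update_curr_rt(), in isolation.  'now' stands for
 * rq->clock_task; the names below are hypothetical.
 */
static inline u64 charge_exec_time(u64 now, u64 *exec_start, u64 *sum_exec)
{
	s64 delta = now - *exec_start;

	if (delta < 0)		/* clock went backwards: charge nothing */
		delta = 0;
	*sum_exec += delta;	/* accumulate the runtime consumed */
	*exec_start = now;	/* the next slice starts here */
	return delta;
}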
957
958#if defined CONFIG_SMP
959
960static void
961inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
962{
963 struct rq *rq = rq_of_rt_rq(rt_rq);
964
965 if (rq->online && prio < prev_prio)
966 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
967}
968
969static void
970dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
971{
972 struct rq *rq = rq_of_rt_rq(rt_rq);
973
974 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
975 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
976}
977
978#else /* CONFIG_SMP */
979
980static inline
981void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
982static inline
983void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
984
985#endif /* CONFIG_SMP */
986
987#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
988static void
989inc_rt_prio(struct rt_rq *rt_rq, int prio)
990{
991 int prev_prio = rt_rq->highest_prio.curr;
992
993 if (prio < prev_prio)
994 rt_rq->highest_prio.curr = prio;
995
996 inc_rt_prio_smp(rt_rq, prio, prev_prio);
997}
998
999static void
1000dec_rt_prio(struct rt_rq *rt_rq, int prio)
1001{
1002 int prev_prio = rt_rq->highest_prio.curr;
1003
1004 if (rt_rq->rt_nr_running) {
1005
1006 WARN_ON(prio < prev_prio);
1007
1008 /*
1009 * This may have been our highest task, and therefore
1010 * we may have some recomputation to do
1011 */
1012 if (prio == prev_prio) {
1013 struct rt_prio_array *array = &rt_rq->active;
1014
1015 rt_rq->highest_prio.curr =
1016 sched_find_first_bit(array->bitmap);
1017 }
1018
1019 } else
1020 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1021
1022 dec_rt_prio_smp(rt_rq, prio, prev_prio);
1023}
1024
1025#else
1026
1027static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1028static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1029
1030#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1031
1032#ifdef CONFIG_RT_GROUP_SCHED
1033
1034static void
1035inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1036{
1037 if (rt_se_boosted(rt_se))
1038 rt_rq->rt_nr_boosted++;
1039
1040 if (rt_rq->tg)
1041 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1042}
1043
1044static void
1045dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1046{
1047 if (rt_se_boosted(rt_se))
1048 rt_rq->rt_nr_boosted--;
1049
1050 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1051}
1052
1053#else /* CONFIG_RT_GROUP_SCHED */
1054
1055static void
1056inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1057{
1058 start_rt_bandwidth(&def_rt_bandwidth);
1059}
1060
1061static inline
1062void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1063
1064#endif /* CONFIG_RT_GROUP_SCHED */
1065
1066static inline
1067void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1068{
1069 int prio = rt_se_prio(rt_se);
1070
1071 WARN_ON(!rt_prio(prio));
1072 rt_rq->rt_nr_running++;
1073
1074 inc_rt_prio(rt_rq, prio);
1075 inc_rt_migration(rt_se, rt_rq);
1076 inc_rt_group(rt_se, rt_rq);
1077}
1078
1079static inline
1080void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1081{
1082 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1083 WARN_ON(!rt_rq->rt_nr_running);
1084 rt_rq->rt_nr_running--;
1085
1086 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1087 dec_rt_migration(rt_se, rt_rq);
1088 dec_rt_group(rt_se, rt_rq);
1089}
1090
1091static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1092{
1093 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1094 struct rt_prio_array *array = &rt_rq->active;
1095 struct rt_rq *group_rq = group_rt_rq(rt_se);
1096 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1097
1098 /*
1099 * Don't enqueue the group if it's throttled, or when empty.
1100 * The latter is a consequence of the former when a child group
1101 * gets throttled and the current group doesn't have any other
1102 * active members.
1103 */
1104 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
1105 return;
1106
1107 if (!rt_rq->rt_nr_running)
1108 list_add_leaf_rt_rq(rt_rq);
1109
1110 if (head)
1111 list_add(&rt_se->run_list, queue);
1112 else
1113 list_add_tail(&rt_se->run_list, queue);
1114 __set_bit(rt_se_prio(rt_se), array->bitmap);
1115
1116 inc_rt_tasks(rt_se, rt_rq);
1117}
1118
1119static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
1120{
1121 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1122 struct rt_prio_array *array = &rt_rq->active;
1123
1124 list_del_init(&rt_se->run_list);
1125 if (list_empty(array->queue + rt_se_prio(rt_se)))
1126 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1127
1128 dec_rt_tasks(rt_se, rt_rq);
1129 if (!rt_rq->rt_nr_running)
1130 list_del_leaf_rt_rq(rt_rq);
1131}
1132
1133/*
1134 * Because the prio of an upper entry depends on the lower
1135 * entries, we must remove entries top - down.
1136 */
1137static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1138{
1139 struct sched_rt_entity *back = NULL;
1140
1141 for_each_sched_rt_entity(rt_se) {
1142 rt_se->back = back;
1143 back = rt_se;
1144 }
1145
1146 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1147 if (on_rt_rq(rt_se))
1148 __dequeue_rt_entity(rt_se);
1149 }
1150}
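
/*
 * Editor's note -- illustrative, not part of this file: for a task inside
 * a single group, the first loop above records task_se->back = NULL and
 * group_se->back = task_se, so the second loop dequeues group_se first
 * and task_se second -- i.e. strictly top-down, as the comment requires.
 */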
1151
1152static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1153{
1154 dequeue_rt_stack(rt_se);
1155 for_each_sched_rt_entity(rt_se)
1156 __enqueue_rt_entity(rt_se, head);
1157}
1158
1159static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1160{
1161 dequeue_rt_stack(rt_se);
1162
1163 for_each_sched_rt_entity(rt_se) {
1164 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1165
1166 if (rt_rq && rt_rq->rt_nr_running)
1167 __enqueue_rt_entity(rt_se, false);
1168 }
1169}
1170
1171/*
1172 * Adding/removing a task to/from a priority array:
1173 */
1174static void
1175enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1176{
1177 struct sched_rt_entity *rt_se = &p->rt;
1178
1179 if (flags & ENQUEUE_WAKEUP)
1180 rt_se->timeout = 0;
1181
1182 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1183
1184 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1185 enqueue_pushable_task(rq, p);
1186
1187 inc_nr_running(rq);
1188}
1189
1190static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1191{
1192 struct sched_rt_entity *rt_se = &p->rt;
1193
1194 update_curr_rt(rq);
1195 dequeue_rt_entity(rt_se);
1196
1197 dequeue_pushable_task(rq, p);
1198
1199 dec_nr_running(rq);
1200}
1201
1202/*
1203 * Put a task at the head or the tail of the run list without the overhead of
1204 * dequeue followed by enqueue.
1205 */
1206static void
1207requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1208{
1209 if (on_rt_rq(rt_se)) {
1210 struct rt_prio_array *array = &rt_rq->active;
1211 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1212
1213 if (head)
1214 list_move(&rt_se->run_list, queue);
1215 else
1216 list_move_tail(&rt_se->run_list, queue);
1217 }
1218}
1219
1220static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1221{
1222 struct sched_rt_entity *rt_se = &p->rt;
1223 struct rt_rq *rt_rq;
1224
1225 for_each_sched_rt_entity(rt_se) {
1226 rt_rq = rt_rq_of_se(rt_se);
1227 requeue_rt_entity(rt_rq, rt_se, head);
1228 }
1229}
1230
1231static void yield_task_rt(struct rq *rq)
1232{
1233 requeue_task_rt(rq, rq->curr, 0);
1234}
1235
1236#ifdef CONFIG_SMP
1237static int find_lowest_rq(struct task_struct *task);
1238
1239static int
1240select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1241{
1242 struct task_struct *curr;
1243 struct rq *rq;
1244 int cpu;
1245
1246 cpu = task_cpu(p);
1247
1248 if (p->nr_cpus_allowed == 1)
1249 goto out;
1250
1251 /* For anything but wake ups, just return the task_cpu */
1252 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1253 goto out;
1254
1255 rq = cpu_rq(cpu);
1256
1257 rcu_read_lock();
1258 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1259
1260 /*
1261 * If the current task on @p's runqueue is an RT task, then
1262 * try to see if we can wake this RT task up on another
1263 * runqueue. Otherwise simply start this RT task
1264 * on its current runqueue.
1265 *
1266 * We want to avoid overloading runqueues. If the woken
1267 * task is of higher priority, then it will stay on this CPU
1268 * and the lower prio task should be moved to another CPU.
1269 * Even though this will probably make the lower prio task
1270 * lose its cache, we do not want to bounce a higher task
1271 * around just because it gave up its CPU, perhaps for a
1272 * lock?
1273 *
1274 * For equal prio tasks, we just let the scheduler sort it out.
1275 *
1276 * Otherwise, just let it ride on the affined RQ and the
1277 * post-schedule router will push the preempted task away
1278 *
1279 * This test is optimistic, if we get it wrong the load-balancer
1280 * will have to sort it out.
1281 */
1282 if (curr && unlikely(rt_task(curr)) &&
1283 (curr->nr_cpus_allowed < 2 ||
1284 curr->prio <= p->prio) &&
1285 (p->nr_cpus_allowed > 1)) {
1286 int target = find_lowest_rq(p);
1287
1288 if (target != -1)
1289 cpu = target;
1290 }
1291 rcu_read_unlock();
1292
1293out:
1294 return cpu;
1295}
1296
1297static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1298{
1299 if (rq->curr->nr_cpus_allowed == 1)
1300 return;
1301
1302 if (p->nr_cpus_allowed != 1
1303 && cpupri_find(&rq->rd->cpupri, p, NULL))
1304 return;
1305
1306 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1307 return;
1308
1309 /*
1310 * There appear to be other CPUs that can accept
1311 * current and none to run 'p', so let's reschedule
1312 * to try and push current away:
1313 */
1314 requeue_task_rt(rq, p, 1);
1315 resched_task(rq->curr);
1316}
1317
1318#endif /* CONFIG_SMP */
1319
1320/*
1321 * Preempt the current task with a newly woken task if needed:
1322 */
1323static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1324{
1325 if (p->prio < rq->curr->prio) {
1326 resched_task(rq->curr);
1327 return;
1328 }
1329
1330#ifdef CONFIG_SMP
1331 /*
1332 * If:
1333 *
1334 * - the newly woken task is of equal priority to the current task
1335 * - the newly woken task is non-migratable while current is migratable
1336 * - current will be preempted on the next reschedule
1337 *
1338 * we should check to see if current can readily move to a different
1339 * cpu. If so, we will reschedule to allow the push logic to try
1340 * to move current somewhere else, making room for our non-migratable
1341 * task.
1342 */
1343 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1344 check_preempt_equal_prio(rq, p);
1345#endif
1346}
1347
1348static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1349 struct rt_rq *rt_rq)
1350{
1351 struct rt_prio_array *array = &rt_rq->active;
1352 struct sched_rt_entity *next = NULL;
1353 struct list_head *queue;
1354 int idx;
1355
1356 idx = sched_find_first_bit(array->bitmap);
1357 BUG_ON(idx >= MAX_RT_PRIO);
1358
1359 queue = array->queue + idx;
1360 next = list_entry(queue->next, struct sched_rt_entity, run_list);
1361
1362 return next;
1363}
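
/*
 * Editor's note -- illustrative, not part of this file: array->bitmap has
 * one bit per RT priority plus the delimiter bit set in init_rt_rq(), so
 * sched_find_first_bit() always terminates and returns the numerically
 * lowest -- i.e. highest-priority -- non-empty queue; the
 * BUG_ON(idx >= MAX_RT_PRIO) can only trigger if the bitmap is empty
 * even though rt_nr_running said otherwise.
 */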
1364
1365static struct task_struct *_pick_next_task_rt(struct rq *rq)
1366{
1367 struct sched_rt_entity *rt_se;
1368 struct task_struct *p;
1369 struct rt_rq *rt_rq;
1370
1371 rt_rq = &rq->rt;
1372
1373 if (!rt_rq->rt_nr_running)
1374 return NULL;
1375
1376 if (rt_rq_throttled(rt_rq))
1377 return NULL;
1378
1379 do {
1380 rt_se = pick_next_rt_entity(rq, rt_rq);
1381 BUG_ON(!rt_se);
1382 rt_rq = group_rt_rq(rt_se);
1383 } while (rt_rq);
1384
1385 p = rt_task_of(rt_se);
1386 p->se.exec_start = rq->clock_task;
1387
1388 return p;
1389}
1390
1391static struct task_struct *pick_next_task_rt(struct rq *rq)
1392{
1393 struct task_struct *p = _pick_next_task_rt(rq);
1394
1395 /* The running task is never eligible for pushing */
1396 if (p)
1397 dequeue_pushable_task(rq, p);
1398
1399#ifdef CONFIG_SMP
1400 /*
1401 * We detect this state here so that we can avoid taking the RQ
1402 * lock again later if there is no need to push
1403 */
1404 rq->post_schedule = has_pushable_tasks(rq);
1405#endif
1406
1407 return p;
1408}
1409
1410static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1411{
1412 update_curr_rt(rq);
1413
1414 /*
1415 * The previous task needs to be made eligible for pushing
1416 * if it is still active
1417 */
1418 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1419 enqueue_pushable_task(rq, p);
1420}
1421
1422#ifdef CONFIG_SMP
1423
1424/* Only try algorithms three times */
1425#define RT_MAX_TRIES 3
1426
1427static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1428{
1429 if (!task_running(rq, p) &&
1430 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1431 (p->nr_cpus_allowed > 1))
1432 return 1;
1433 return 0;
1434}
1435
1436/* Return the second highest RT task, NULL otherwise */
1437static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1438{
1439 struct task_struct *next = NULL;
1440 struct sched_rt_entity *rt_se;
1441 struct rt_prio_array *array;
1442 struct rt_rq *rt_rq;
1443 int idx;
1444
1445 for_each_leaf_rt_rq(rt_rq, rq) {
1446 array = &rt_rq->active;
1447 idx = sched_find_first_bit(array->bitmap);
1448next_idx:
1449 if (idx >= MAX_RT_PRIO)
1450 continue;
1451 if (next && next->prio <= idx)
1452 continue;
1453 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1454 struct task_struct *p;
1455
1456 if (!rt_entity_is_task(rt_se))
1457 continue;
1458
1459 p = rt_task_of(rt_se);
1460 if (pick_rt_task(rq, p, cpu)) {
1461 next = p;
1462 break;
1463 }
1464 }
1465 if (!next) {
1466 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1467 goto next_idx;
1468 }
1469 }
1470
1471 return next;
1472}
1473
1474static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1475
1476static int find_lowest_rq(struct task_struct *task)
1477{
1478 struct sched_domain *sd;
1479 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1480 int this_cpu = smp_processor_id();
1481 int cpu = task_cpu(task);
1482
1483 /* Make sure the mask is initialized first */
1484 if (unlikely(!lowest_mask))
1485 return -1;
1486
1487 if (task->nr_cpus_allowed == 1)
1488 return -1; /* No other targets possible */
1489
1490 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1491 return -1; /* No targets found */
1492
1493 /*
1494 * At this point we have built a mask of cpus representing the
1495 * lowest priority tasks in the system. Now we want to elect
1496 * the best one based on our affinity and topology.
1497 *
1498 * We prioritize the last cpu that the task executed on since
1499 * it is most likely cache-hot in that location.
1500 */
1501 if (cpumask_test_cpu(cpu, lowest_mask))
1502 return cpu;
1503
1504 /*
1505 * Otherwise, we consult the sched_domains span maps to figure
1506 * out which cpu is logically closest to our hot cache data.
1507 */
1508 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1509 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1510
1511 rcu_read_lock();
1512 for_each_domain(cpu, sd) {
1513 if (sd->flags & SD_WAKE_AFFINE) {
1514 int best_cpu;
1515
1516 /*
1517 * "this_cpu" is cheaper to preempt than a
1518 * remote processor.
1519 */
1520 if (this_cpu != -1 &&
1521 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1522 rcu_read_unlock();
1523 return this_cpu;
1524 }
1525
1526 best_cpu = cpumask_first_and(lowest_mask,
1527 sched_domain_span(sd));
1528 if (best_cpu < nr_cpu_ids) {
1529 rcu_read_unlock();
1530 return best_cpu;
1531 }
1532 }
1533 }
1534 rcu_read_unlock();
1535
1536 /*
1537 * And finally, if there were no matches within the domains
1538 * just give the caller *something* to work with from the compatible
1539 * locations.
1540 */
1541 if (this_cpu != -1)
1542 return this_cpu;
1543
1544 cpu = cpumask_any(lowest_mask);
1545 if (cpu < nr_cpu_ids)
1546 return cpu;
1547 return -1;
1548}
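
/*
 * Editor's note -- illustrative, not part of this file: the preference
 * order implemented above is (1) the task's last CPU if it is in
 * lowest_mask, (2) this_cpu when it is in the mask and shares an
 * SD_WAKE_AFFINE domain with the task's CPU, (3) the first mask CPU
 * inside such a domain, (4) this_cpu if it is in the mask, (5) any CPU
 * in lowest_mask, and finally (6) -1 when the mask is empty.
 */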
1549
1550/* Will lock the rq it finds */
1551static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1552{
1553 struct rq *lowest_rq = NULL;
1554 int tries;
1555 int cpu;
1556
1557 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1558 cpu = find_lowest_rq(task);
1559
1560 if ((cpu == -1) || (cpu == rq->cpu))
1561 break;
1562
1563 lowest_rq = cpu_rq(cpu);
1564
1565 /* if the prio of this runqueue changed, try again */
1566 if (double_lock_balance(rq, lowest_rq)) {
1567 /*
1568 * We had to unlock the run queue. In
1569 * the meantime, the task could have
1570 * migrated already or had its affinity changed.
1571 * Also make sure that it wasn't scheduled on its rq.
1572 */
1573 if (unlikely(task_rq(task) != rq ||
1574 !cpumask_test_cpu(lowest_rq->cpu,
1575 tsk_cpus_allowed(task)) ||
1576 task_running(rq, task) ||
1577 !task->on_rq)) {
1578
1579 double_unlock_balance(rq, lowest_rq);
1580 lowest_rq = NULL;
1581 break;
1582 }
1583 }
1584
1585 /* If this rq is still suitable use it. */
1586 if (lowest_rq->rt.highest_prio.curr > task->prio)
1587 break;
1588
1589 /* try again */
1590 double_unlock_balance(rq, lowest_rq);
1591 lowest_rq = NULL;
1592 }
1593
1594 return lowest_rq;
1595}
1596
1597static struct task_struct *pick_next_pushable_task(struct rq *rq)
1598{
1599 struct task_struct *p;
1600
1601 if (!has_pushable_tasks(rq))
1602 return NULL;
1603
1604 p = plist_first_entry(&rq->rt.pushable_tasks,
1605 struct task_struct, pushable_tasks);
1606
1607 BUG_ON(rq->cpu != task_cpu(p));
1608 BUG_ON(task_current(rq, p));
1609 BUG_ON(p->nr_cpus_allowed <= 1);
1610
1611 BUG_ON(!p->on_rq);
1612 BUG_ON(!rt_task(p));
1613
1614 return p;
1615}
1616
1617/*
1618 * If the current CPU has more than one RT task, see if the non
1619 * running task can migrate over to a CPU that is running a task
1620 * of lesser priority.
1621 */
1622static int push_rt_task(struct rq *rq)
1623{
1624 struct task_struct *next_task;
1625 struct rq *lowest_rq;
1626 int ret = 0;
1627
1628 if (!rq->rt.overloaded)
1629 return 0;
1630
1631 next_task = pick_next_pushable_task(rq);
1632 if (!next_task)
1633 return 0;
1634
1635retry:
1636 if (unlikely(next_task == rq->curr)) {
1637 WARN_ON(1);
1638 return 0;
1639 }
1640
1641 /*
1642 * It's possible that next_task slipped in with a
1643 * higher priority than current. If that's the case,
1644 * just reschedule current.
1645 */
1646 if (unlikely(next_task->prio < rq->curr->prio)) {
1647 resched_task(rq->curr);
1648 return 0;
1649 }
1650
1651 /* We might release rq lock */
1652 get_task_struct(next_task);
1653
1654 /* find_lock_lowest_rq locks the rq if found */
1655 lowest_rq = find_lock_lowest_rq(next_task, rq);
1656 if (!lowest_rq) {
1657 struct task_struct *task;
1658 /*
1659 * find_lock_lowest_rq releases rq->lock
1660 * so it is possible that next_task has migrated.
1661 *
1662 * We need to make sure that the task is still on the same
1663 * run-queue and is also still the next task eligible for
1664 * pushing.
1665 */
1666 task = pick_next_pushable_task(rq);
1667 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1668 /*
1669 * The task hasn't migrated, and is still the next
1670 * eligible task, but we failed to find a run-queue
1671 * to push it to. Do not retry in this case, since
1672 * other cpus will pull from us when ready.
1673 */
1674 goto out;
1675 }
1676
1677 if (!task)
1678 /* No more tasks, just exit */
1679 goto out;
1680
1681 /*
1682 * Something has shifted, try again.
1683 */
1684 put_task_struct(next_task);
1685 next_task = task;
1686 goto retry;
1687 }
1688
1689 deactivate_task(rq, next_task, 0);
1690 set_task_cpu(next_task, lowest_rq->cpu);
1691 activate_task(lowest_rq, next_task, 0);
1692 ret = 1;
1693
1694 resched_task(lowest_rq->curr);
1695
1696 double_unlock_balance(rq, lowest_rq);
1697
1698out:
1699 put_task_struct(next_task);
1700
1701 return ret;
1702}
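/*
 * Illustrative scenario for the push path above (a sketch with
 * hypothetical CPUs and priorities, not taken from the original file):
 * CPU0 runs an RT task at prio 10 and has a second RT task queued at
 * prio 20 (lower priority), while CPU1 only runs a CFS task. CPU0 is
 * therefore overloaded: pick_next_pushable_task() returns the prio-20
 * task, find_lock_lowest_rq() finds and locks CPU1's runqueue, and the
 * task is deactivated on CPU0, moved with set_task_cpu(), activated on
 * CPU1, and CPU1's current task is rescheduled.
 */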
1703
1704static void push_rt_tasks(struct rq *rq)
1705{
1706	/* push_rt_task() will return true if it moved an RT task */
1707 while (push_rt_task(rq))
1708 ;
1709}
1710
1711static int pull_rt_task(struct rq *this_rq)
1712{
1713 int this_cpu = this_rq->cpu, ret = 0, cpu;
1714 struct task_struct *p;
1715 struct rq *src_rq;
1716
1717 if (likely(!rt_overloaded(this_rq)))
1718 return 0;
1719
1720 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1721 if (this_cpu == cpu)
1722 continue;
1723
1724 src_rq = cpu_rq(cpu);
1725
1726 /*
1727 * Don't bother taking the src_rq->lock if the next highest
1728 * task is known to be lower-priority than our current task.
1729 * This may look racy, but if this value is about to go
1730 * logically higher, the src_rq will push this task away.
1731		 * And if it's going logically lower, we do not care.
1732 */
1733 if (src_rq->rt.highest_prio.next >=
1734 this_rq->rt.highest_prio.curr)
1735 continue;
1736
1737 /*
1738 * We can potentially drop this_rq's lock in
1739 * double_lock_balance, and another CPU could
1740 * alter this_rq
1741 */
1742 double_lock_balance(this_rq, src_rq);
1743
1744 /*
1745 * Are there still pullable RT tasks?
1746 */
1747 if (src_rq->rt.rt_nr_running <= 1)
1748 goto skip;
1749
1750 p = pick_next_highest_task_rt(src_rq, this_cpu);
1751
1752 /*
1753 * Do we have an RT task that preempts
1754 * the to-be-scheduled task?
1755 */
1756 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1757 WARN_ON(p == src_rq->curr);
1758 WARN_ON(!p->on_rq);
1759
1760 /*
1761 * There's a chance that p is higher in priority
1762 * than what's currently running on its cpu.
1763			 * This can happen when p is just waking up and hasn't
1764			 * had a chance to schedule yet. We only pull
1765			 * p if it is lower in priority than the
1766			 * current task on its run queue.
1767 */
1768 if (p->prio < src_rq->curr->prio)
1769 goto skip;
1770
1771 ret = 1;
1772
1773 deactivate_task(src_rq, p, 0);
1774 set_task_cpu(p, this_cpu);
1775 activate_task(this_rq, p, 0);
1776 /*
1777 * We continue with the search, just in
1778 * case there's an even higher prio task
1779 * in another runqueue. (low likelihood
1780 * but possible)
1781 */
1782 }
1783skip:
1784 double_unlock_balance(this_rq, src_rq);
1785 }
1786
1787 return ret;
1788}
1789
1790static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1791{
1792 /* Try to pull RT tasks here if we lower this rq's prio */
1793 if (rq->rt.highest_prio.curr > prev->prio)
1794 pull_rt_task(rq);
1795}
1796
1797static void post_schedule_rt(struct rq *rq)
1798{
1799 push_rt_tasks(rq);
1800}
1801
1802/*
1803 * If we are not running and we are not going to reschedule soon, we should
1804 * try to push tasks away now
1805 */
1806static void task_woken_rt(struct rq *rq, struct task_struct *p)
1807{
1808 if (!task_running(rq, p) &&
1809 !test_tsk_need_resched(rq->curr) &&
1810 has_pushable_tasks(rq) &&
1811 p->nr_cpus_allowed > 1 &&
1812 rt_task(rq->curr) &&
1813 (rq->curr->nr_cpus_allowed < 2 ||
1814 rq->curr->prio <= p->prio))
1815 push_rt_tasks(rq);
1816}
1817
1818static void set_cpus_allowed_rt(struct task_struct *p,
1819 const struct cpumask *new_mask)
1820{
1821 struct rq *rq;
1822 int weight;
1823
1824 BUG_ON(!rt_task(p));
1825
1826 if (!p->on_rq)
1827 return;
1828
1829 weight = cpumask_weight(new_mask);
1830
1831 /*
1832	 * Only update if the migratability of the process changes,
1833	 * i.e. whether it can run on more than one CPU.
1834 */
1835 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1836 return;
1837
1838 rq = task_rq(p);
1839
1840 /*
1841 * The process used to be able to migrate OR it can now migrate
1842 */
1843 if (weight <= 1) {
1844 if (!task_current(rq, p))
1845 dequeue_pushable_task(rq, p);
1846 BUG_ON(!rq->rt.rt_nr_migratory);
1847 rq->rt.rt_nr_migratory--;
1848 } else {
1849 if (!task_current(rq, p))
1850 enqueue_pushable_task(rq, p);
1851 rq->rt.rt_nr_migratory++;
1852 }
1853
1854 update_rt_migration(&rq->rt);
1855}
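/*
 * Worked example for the bookkeeping above (hypothetical masks): an RT
 * task affined to 4 CPUs that gets restricted to a single CPU goes from
 * "can migrate" (nr_cpus_allowed > 1) to "cannot migrate" (weight <= 1),
 * so rt_nr_migratory is decremented and, unless the task is currently
 * running, it is removed from the pushable list; widening the mask
 * again reverses both steps.
 */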
1856
1857/* Assumes rq->lock is held */
1858static void rq_online_rt(struct rq *rq)
1859{
1860 if (rq->rt.overloaded)
1861 rt_set_overload(rq);
1862
1863 __enable_runtime(rq);
1864
1865 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1866}
1867
1868/* Assumes rq->lock is held */
1869static void rq_offline_rt(struct rq *rq)
1870{
1871 if (rq->rt.overloaded)
1872 rt_clear_overload(rq);
1873
1874 __disable_runtime(rq);
1875
1876 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1877}
1878
1879/*
1880 * When switching away from the rt queue, we bring ourselves to a
1881 * position where we might want to pull RT tasks from other runqueues.
1882 */
1883static void switched_from_rt(struct rq *rq, struct task_struct *p)
1884{
1885 /*
1886 * If there are other RT tasks then we will reschedule
1887 * and the scheduling of the other RT tasks will handle
1888 * the balancing. But if we are the last RT task
1889 * we may need to handle the pulling of RT tasks
1890 * now.
1891 */
1892 if (p->on_rq && !rq->rt.rt_nr_running)
1893 pull_rt_task(rq);
1894}
1895
1896void init_sched_rt_class(void)
1897{
1898 unsigned int i;
1899
1900 for_each_possible_cpu(i) {
1901 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1902 GFP_KERNEL, cpu_to_node(i));
1903 }
1904}
1905#endif /* CONFIG_SMP */
1906
1907/*
1908 * When switching a task to RT, we may overload the runqueue
1909 * with RT tasks. In this case we try to push them off to
1910 * other runqueues.
1911 */
1912static void switched_to_rt(struct rq *rq, struct task_struct *p)
1913{
1914 int check_resched = 1;
1915
1916 /*
1917 * If we are already running, then there's nothing
1918 * that needs to be done. But if we are not running
1919 * we may need to preempt the current running task.
1920 * If that current running task is also an RT task
1921 * then see if we can move to another run queue.
1922 */
1923 if (p->on_rq && rq->curr != p) {
1924#ifdef CONFIG_SMP
1925 if (rq->rt.overloaded && push_rt_task(rq) &&
1926 /* Don't resched if we changed runqueues */
1927 rq != task_rq(p))
1928 check_resched = 0;
1929#endif /* CONFIG_SMP */
1930 if (check_resched && p->prio < rq->curr->prio)
1931 resched_task(rq->curr);
1932 }
1933}
1934
1935/*
1936 * Priority of the task has changed. This may cause
1937 * us to initiate a push or pull.
1938 */
1939static void
1940prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1941{
1942 if (!p->on_rq)
1943 return;
1944
1945 if (rq->curr == p) {
1946#ifdef CONFIG_SMP
1947 /*
1948 * If our priority decreases while running, we
1949 * may need to pull tasks to this runqueue.
1950 */
1951 if (oldprio < p->prio)
1952 pull_rt_task(rq);
1953 /*
1954 * If there's a higher priority task waiting to run
1955 * then reschedule. Note, the above pull_rt_task
1956 * can release the rq lock and p could migrate.
1957 * Only reschedule if p is still on the same runqueue.
1958 */
1959 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1960 resched_task(p);
1961#else
1962 /* For UP simply resched on drop of prio */
1963 if (oldprio < p->prio)
1964 resched_task(p);
1965#endif /* CONFIG_SMP */
1966 } else {
1967 /*
1968		 * This task is not running, but if its priority is
1969		 * higher than that of the currently running task,
1970		 * then reschedule.
1971 */
1972 if (p->prio < rq->curr->prio)
1973 resched_task(rq->curr);
1974 }
1975}
1976
1977static void watchdog(struct rq *rq, struct task_struct *p)
1978{
1979 unsigned long soft, hard;
1980
1981	/* max may change after cur was read; this will be fixed up next tick */
1982 soft = task_rlimit(p, RLIMIT_RTTIME);
1983 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1984
1985 if (soft != RLIM_INFINITY) {
1986 unsigned long next;
1987
1988 p->rt.timeout++;
1989 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1990 if (p->rt.timeout > next)
1991 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1992 }
1993}
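/*
 * Worked example of the tick conversion above (an illustrative sketch;
 * assumes HZ == 1000 and a hypothetical RLIMIT_RTTIME soft limit of
 * 950000 us with hard >= soft):
 *
 *	next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC / HZ)
 *	     = DIV_ROUND_UP(950000, 1000)
 *	     = 950 ticks
 *
 * so once p->rt.timeout exceeds 950 scheduler ticks, sched_exp is
 * armed and the posix-cpu-timer path enforces the limit.
 */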
1994
1995static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1996{
1997 struct sched_rt_entity *rt_se = &p->rt;
1998
1999 update_curr_rt(rq);
2000
2001 watchdog(rq, p);
2002
2003 /*
2004 * RR tasks need a special form of timeslice management.
2005 * FIFO tasks have no timeslices.
2006 */
2007 if (p->policy != SCHED_RR)
2008 return;
2009
2010 if (--p->rt.time_slice)
2011 return;
2012
2013 p->rt.time_slice = RR_TIMESLICE;
2014
2015 /*
2016	 * Requeue to the end of the queue if we (and all of our ancestors)
2017	 * are not the only element on the queue.
2018 */
2019 for_each_sched_rt_entity(rt_se) {
2020 if (rt_se->run_list.prev != rt_se->run_list.next) {
2021 requeue_task_rt(rq, p, 0);
2022 set_tsk_need_resched(p);
2023 return;
2024 }
2025 }
2026}
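/*
 * Illustrative timing for the round-robin handling above (a sketch;
 * assumes RR_TIMESLICE is the usual 100 ms worth of ticks): at HZ == 250
 * that is 25 ticks, so a SCHED_RR task that keeps running is moved to
 * the tail of its priority queue roughly every 100 ms (when it shares
 * that queue with other tasks), while SCHED_FIFO tasks return early
 * above and never expire.
 */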
2027
2028static void set_curr_task_rt(struct rq *rq)
2029{
2030 struct task_struct *p = rq->curr;
2031
2032 p->se.exec_start = rq->clock_task;
2033
2034 /* The running task is never eligible for pushing */
2035 dequeue_pushable_task(rq, p);
2036}
2037
2038static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2039{
2040 /*
2041 * Time slice is 0 for SCHED_FIFO tasks
2042 */
2043 if (task->policy == SCHED_RR)
2044 return RR_TIMESLICE;
2045 else
2046 return 0;
2047}
2048
2049const struct sched_class rt_sched_class = {
2050 .next = &fair_sched_class,
2051 .enqueue_task = enqueue_task_rt,
2052 .dequeue_task = dequeue_task_rt,
2053 .yield_task = yield_task_rt,
2054
2055 .check_preempt_curr = check_preempt_curr_rt,
2056
2057 .pick_next_task = pick_next_task_rt,
2058 .put_prev_task = put_prev_task_rt,
2059
2060#ifdef CONFIG_SMP
2061 .select_task_rq = select_task_rq_rt,
2062
2063 .set_cpus_allowed = set_cpus_allowed_rt,
2064 .rq_online = rq_online_rt,
2065 .rq_offline = rq_offline_rt,
2066 .pre_schedule = pre_schedule_rt,
2067 .post_schedule = post_schedule_rt,
2068 .task_woken = task_woken_rt,
2069 .switched_from = switched_from_rt,
2070#endif
2071
2072 .set_curr_task = set_curr_task_rt,
2073 .task_tick = task_tick_rt,
2074
2075 .get_rr_interval = get_rr_interval_rt,
2076
2077 .prio_changed = prio_changed_rt,
2078 .switched_to = switched_to_rt,
2079};
2080
2081#ifdef CONFIG_SCHED_DEBUG
2082extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2083
2084void print_rt_stats(struct seq_file *m, int cpu)
2085{
2086 rt_rq_iter_t iter;
2087 struct rt_rq *rt_rq;
2088
2089 rcu_read_lock();
2090 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2091 print_rt_rq(m, cpu, rt_rq);
2092 rcu_read_unlock();
2093}
2094#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
deleted file mode 100644
index fc886441436..00000000000
--- a/kernel/sched/sched.h
+++ /dev/null
@@ -1,1241 +0,0 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters;
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
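/*
 * Worked example of the conversions above (a sketch; assumes the usual
 * MAX_RT_PRIO of 100, giving MAX_PRIO == 140):
 *
 *	NICE_TO_PRIO(-20) == 100 + (-20) + 20 == 100
 *	NICE_TO_PRIO(  0) == 100 +    0  + 20 == 120
 *	NICE_TO_PRIO( 19) == 100 +   19  + 20 == 139
 *
 * and USER_PRIO(120) == 20, so a nice-0 task has user priority 20 out
 * of MAX_USER_PRIO == 40.
 */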
28
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 */
40
41/*
42 * Single value that denotes runtime == period, i.e. unlimited time.
43 */
44#define RUNTIME_INF ((u64)~0ULL)
45
46static inline int rt_policy(int policy)
47{
48 if (policy == SCHED_FIFO || policy == SCHED_RR)
49 return 1;
50 return 0;
51}
52
53static inline int task_has_rt_policy(struct task_struct *p)
54{
55 return rt_policy(p->policy);
56}
57
58/*
59 * This is the priority-queue data structure of the RT scheduling class:
60 */
61struct rt_prio_array {
62 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
63 struct list_head queue[MAX_RT_PRIO];
64};
65
66struct rt_bandwidth {
67 /* nests inside the rq lock: */
68 raw_spinlock_t rt_runtime_lock;
69 ktime_t rt_period;
70 u64 rt_runtime;
71 struct hrtimer rt_period_timer;
72};
73
74extern struct mutex sched_domains_mutex;
75
76#ifdef CONFIG_CGROUP_SCHED
77
78#include <linux/cgroup.h>
79
80struct cfs_rq;
81struct rt_rq;
82
83extern struct list_head task_groups;
84
85struct cfs_bandwidth {
86#ifdef CONFIG_CFS_BANDWIDTH
87 raw_spinlock_t lock;
88 ktime_t period;
89 u64 quota, runtime;
90 s64 hierarchal_quota;
91 u64 runtime_expires;
92
93 int idle, timer_active;
94 struct hrtimer period_timer, slack_timer;
95 struct list_head throttled_cfs_rq;
96
97 /* statistics */
98 int nr_periods, nr_throttled;
99 u64 throttled_time;
100#endif
101};
102
103/* task group related information */
104struct task_group {
105 struct cgroup_subsys_state css;
106
107#ifdef CONFIG_FAIR_GROUP_SCHED
108 /* schedulable entities of this group on each cpu */
109 struct sched_entity **se;
110 /* runqueue "owned" by this group on each cpu */
111 struct cfs_rq **cfs_rq;
112 unsigned long shares;
113
114 atomic_t load_weight;
115 atomic64_t load_avg;
116 atomic_t runnable_avg;
117#endif
118
119#ifdef CONFIG_RT_GROUP_SCHED
120 struct sched_rt_entity **rt_se;
121 struct rt_rq **rt_rq;
122
123 struct rt_bandwidth rt_bandwidth;
124#endif
125
126 struct rcu_head rcu;
127 struct list_head list;
128
129 struct task_group *parent;
130 struct list_head siblings;
131 struct list_head children;
132
133#ifdef CONFIG_SCHED_AUTOGROUP
134 struct autogroup *autogroup;
135#endif
136
137 struct cfs_bandwidth cfs_bandwidth;
138};
139
140#ifdef CONFIG_FAIR_GROUP_SCHED
141#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
142
143/*
144 * A weight of 0 or 1 can cause arithmetic problems.
145 * The weight of a cfs_rq is the sum of the weights of the entities
146 * queued on it, so the weight of an entity should not be too large,
147 * and neither should the shares value of a task group.
148 * (The default weight is 1024 - so there's no practical
149 * limitation from this.)
150 */
151#define MIN_SHARES (1UL << 1)
152#define MAX_SHARES (1UL << 18)
153#endif
154
155/* Default task group.
156 * Every task in the system belongs to this group at bootup.
157 */
158extern struct task_group root_task_group;
159
160typedef int (*tg_visitor)(struct task_group *, void *);
161
162extern int walk_tg_tree_from(struct task_group *from,
163 tg_visitor down, tg_visitor up, void *data);
164
165/*
166 * Iterate the full tree, calling @down when first entering a node and @up when
167 * leaving it for the final time.
168 *
169 * Caller must hold the RCU read lock or a sufficient equivalent.
170 */
171static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
172{
173 return walk_tg_tree_from(&root_task_group, down, up, data);
174}
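/*
 * Minimal usage sketch (illustrative only; my_down_cb is a hypothetical
 * callback, not part of this file): visit every task group top-down,
 * doing nothing on the way back up:
 *
 *	static int my_down_cb(struct task_group *tg, void *data)
 *	{
 *		...
 *		return 0;
 *	}
 *
 *	rcu_read_lock();
 *	walk_tg_tree(my_down_cb, tg_nop, NULL);
 *	rcu_read_unlock();
 */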
175
176extern int tg_nop(struct task_group *tg, void *data);
177
178extern void free_fair_sched_group(struct task_group *tg);
179extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
180extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
181extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
182 struct sched_entity *se, int cpu,
183 struct sched_entity *parent);
184extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
185extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
186
187extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
188extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
189extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
190
191extern void free_rt_sched_group(struct task_group *tg);
192extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
193extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
194 struct sched_rt_entity *rt_se, int cpu,
195 struct sched_rt_entity *parent);
196
197#else /* CONFIG_CGROUP_SCHED */
198
199struct cfs_bandwidth { };
200
201#endif /* CONFIG_CGROUP_SCHED */
202
203/* CFS-related fields in a runqueue */
204struct cfs_rq {
205 struct load_weight load;
206 unsigned int nr_running, h_nr_running;
207
208 u64 exec_clock;
209 u64 min_vruntime;
210#ifndef CONFIG_64BIT
211 u64 min_vruntime_copy;
212#endif
213
214 struct rb_root tasks_timeline;
215 struct rb_node *rb_leftmost;
216
217 /*
218	 * 'curr' points to the currently running entity on this cfs_rq.
219	 * It is set to NULL otherwise (i.e. when none is currently running).
220 */
221 struct sched_entity *curr, *next, *last, *skip;
222
223#ifdef CONFIG_SCHED_DEBUG
224 unsigned int nr_spread_over;
225#endif
226
227#ifdef CONFIG_SMP
228/*
229 * Load tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
230 * may be removed once load tracking is useful for applications beyond shares
231 * distribution (e.g. load balancing).
232 */
233#ifdef CONFIG_FAIR_GROUP_SCHED
234 /*
235 * CFS Load tracking
236 * Under CFS, load is tracked on a per-entity basis and aggregated up.
237 * This allows for the description of both thread and group usage (in
238 * the FAIR_GROUP_SCHED case).
239 */
240 u64 runnable_load_avg, blocked_load_avg;
241 atomic64_t decay_counter, removed_load;
242 u64 last_decay;
243#endif /* CONFIG_FAIR_GROUP_SCHED */
244/* These always depend on CONFIG_FAIR_GROUP_SCHED */
245#ifdef CONFIG_FAIR_GROUP_SCHED
246 u32 tg_runnable_contrib;
247 u64 tg_load_contrib;
248#endif /* CONFIG_FAIR_GROUP_SCHED */
249
250 /*
251 * h_load = weight * f(tg)
252 *
253 * Where f(tg) is the recursive weight fraction assigned to
254 * this group.
255 */
256 unsigned long h_load;
257#endif /* CONFIG_SMP */
258
259#ifdef CONFIG_FAIR_GROUP_SCHED
260 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
261
262 /*
263	 * leaf cfs_rqs are those that hold tasks (the lowest schedulable entities
264	 * in a hierarchy). Non-leaf cfs_rqs hold other, higher-level schedulable
265	 * entities (like users, containers, etc.).
266 *
267 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
268 * list is used during load balance.
269 */
270 int on_list;
271 struct list_head leaf_cfs_rq_list;
272 struct task_group *tg; /* group that "owns" this runqueue */
273
274#ifdef CONFIG_CFS_BANDWIDTH
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_clock, throttled_clock_task;
280 u64 throttled_clock_task_time;
281 int throttled, throttle_count;
282 struct list_head throttled_list;
283#endif /* CONFIG_CFS_BANDWIDTH */
284#endif /* CONFIG_FAIR_GROUP_SCHED */
285};
286
287static inline int rt_bandwidth_enabled(void)
288{
289 return sysctl_sched_rt_runtime >= 0;
290}
291
292/* Real-Time classes' related field in a runqueue: */
293struct rt_rq {
294 struct rt_prio_array active;
295 unsigned int rt_nr_running;
296#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
297 struct {
298 int curr; /* highest queued rt task prio */
299#ifdef CONFIG_SMP
300 int next; /* next highest */
301#endif
302 } highest_prio;
303#endif
304#ifdef CONFIG_SMP
305 unsigned long rt_nr_migratory;
306 unsigned long rt_nr_total;
307 int overloaded;
308 struct plist_head pushable_tasks;
309#endif
310 int rt_throttled;
311 u64 rt_time;
312 u64 rt_runtime;
313 /* Nests inside the rq lock: */
314 raw_spinlock_t rt_runtime_lock;
315
316#ifdef CONFIG_RT_GROUP_SCHED
317 unsigned long rt_nr_boosted;
318
319 struct rq *rq;
320 struct list_head leaf_rt_rq_list;
321 struct task_group *tg;
322#endif
323};
324
325#ifdef CONFIG_SMP
326
327/*
328 * We add the notion of a root-domain which will be used to define per-domain
329 * variables. Each exclusive cpuset essentially defines an island domain by
330 * fully partitioning the member cpus from any other cpuset. Whenever a new
331 * exclusive cpuset is created, we also create and attach a new root-domain
332 * object.
333 *
334 */
335struct root_domain {
336 atomic_t refcount;
337 atomic_t rto_count;
338 struct rcu_head rcu;
339 cpumask_var_t span;
340 cpumask_var_t online;
341
342 /*
343 * The "RT overload" flag: it gets set if a CPU has more than
344 * one runnable RT task.
345 */
346 cpumask_var_t rto_mask;
347 struct cpupri cpupri;
348};
349
350extern struct root_domain def_root_domain;
351
352#endif /* CONFIG_SMP */
353
354/*
355 * This is the main, per-CPU runqueue data structure.
356 *
357 * Locking rule: code that wants to lock multiple runqueues (such as
358 * the load balancing or the thread migration code) must acquire the
359 * locks in ascending &runqueue order.
360 */
361struct rq {
362 /* runqueue lock: */
363 raw_spinlock_t lock;
364
365 /*
366 * nr_running and cpu_load should be in the same cacheline because
367 * remote CPUs use both these fields when doing load calculation.
368 */
369 unsigned int nr_running;
370 #define CPU_LOAD_IDX_MAX 5
371 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
372 unsigned long last_load_update_tick;
373#ifdef CONFIG_NO_HZ
374 u64 nohz_stamp;
375 unsigned long nohz_flags;
376#endif
377 int skip_clock_update;
378
379 /* capture load from *all* tasks on this cpu: */
380 struct load_weight load;
381 unsigned long nr_load_updates;
382 u64 nr_switches;
383
384 struct cfs_rq cfs;
385 struct rt_rq rt;
386
387#ifdef CONFIG_FAIR_GROUP_SCHED
388 /* list of leaf cfs_rq on this cpu: */
389 struct list_head leaf_cfs_rq_list;
390#ifdef CONFIG_SMP
391 unsigned long h_load_throttle;
392#endif /* CONFIG_SMP */
393#endif /* CONFIG_FAIR_GROUP_SCHED */
394
395#ifdef CONFIG_RT_GROUP_SCHED
396 struct list_head leaf_rt_rq_list;
397#endif
398
399 /*
400 * This is part of a global counter where only the total sum
401 * over all CPUs matters. A task can increase this counter on
402 * one CPU and if it got migrated afterwards it may decrease
403 * it on another CPU. Always updated under the runqueue lock:
404 */
405 unsigned long nr_uninterruptible;
406
407 struct task_struct *curr, *idle, *stop;
408 unsigned long next_balance;
409 struct mm_struct *prev_mm;
410
411 u64 clock;
412 u64 clock_task;
413
414 atomic_t nr_iowait;
415
416#ifdef CONFIG_SMP
417 struct root_domain *rd;
418 struct sched_domain *sd;
419
420 unsigned long cpu_power;
421
422 unsigned char idle_balance;
423 /* For active balancing */
424 int post_schedule;
425 int active_balance;
426 int push_cpu;
427 struct cpu_stop_work active_balance_work;
428 /* cpu of this runqueue: */
429 int cpu;
430 int online;
431
432 struct list_head cfs_tasks;
433
434 u64 rt_avg;
435 u64 age_stamp;
436 u64 idle_stamp;
437 u64 avg_idle;
438#endif
439
440#ifdef CONFIG_IRQ_TIME_ACCOUNTING
441 u64 prev_irq_time;
442#endif
443#ifdef CONFIG_PARAVIRT
444 u64 prev_steal_time;
445#endif
446#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
447 u64 prev_steal_time_rq;
448#endif
449
450 /* calc_load related fields */
451 unsigned long calc_load_update;
452 long calc_load_active;
453
454#ifdef CONFIG_SCHED_HRTICK
455#ifdef CONFIG_SMP
456 int hrtick_csd_pending;
457 struct call_single_data hrtick_csd;
458#endif
459 struct hrtimer hrtick_timer;
460#endif
461
462#ifdef CONFIG_SCHEDSTATS
463 /* latency stats */
464 struct sched_info rq_sched_info;
465 unsigned long long rq_cpu_time;
466 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
467
468 /* sys_sched_yield() stats */
469 unsigned int yld_count;
470
471 /* schedule() stats */
472 unsigned int sched_count;
473 unsigned int sched_goidle;
474
475 /* try_to_wake_up() stats */
476 unsigned int ttwu_count;
477 unsigned int ttwu_local;
478#endif
479
480#ifdef CONFIG_SMP
481 struct llist_head wake_list;
482#endif
483
484 struct sched_avg avg;
485};
486
487static inline int cpu_of(struct rq *rq)
488{
489#ifdef CONFIG_SMP
490 return rq->cpu;
491#else
492 return 0;
493#endif
494}
495
496DECLARE_PER_CPU(struct rq, runqueues);
497
498#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
499#define this_rq() (&__get_cpu_var(runqueues))
500#define task_rq(p) cpu_rq(task_cpu(p))
501#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
502#define raw_rq() (&__raw_get_cpu_var(runqueues))
503
504#ifdef CONFIG_SMP
505
506#define rcu_dereference_check_sched_domain(p) \
507 rcu_dereference_check((p), \
508 lockdep_is_held(&sched_domains_mutex))
509
510/*
511 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
512 * See detach_destroy_domains: synchronize_sched for details.
513 *
514 * The domain tree of any CPU may only be accessed from within
515 * preempt-disabled sections.
516 */
517#define for_each_domain(cpu, __sd) \
518 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
519 __sd; __sd = __sd->parent)
520
521#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
522
523/**
524 * highest_flag_domain - Return highest sched_domain containing flag.
525 * @cpu: The cpu whose highest level of sched domain is to
526 * be returned.
527 * @flag: The flag to check for the highest sched_domain
528 * for the given cpu.
529 *
530 * Returns the highest sched_domain of a cpu which contains the given flag.
531 */
532static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
533{
534 struct sched_domain *sd, *hsd = NULL;
535
536 for_each_domain(cpu, sd) {
537 if (!(sd->flags & flag))
538 break;
539 hsd = sd;
540 }
541
542 return hsd;
543}
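/*
 * Usage sketch (illustrative): the last-level-cache domain of a CPU can
 * be derived as the highest domain that still shares package resources,
 * e.g.
 *
 *	struct sched_domain *sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 *
 * which is essentially how the per-cpu sd_llc pointer below gets set.
 */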
544
545DECLARE_PER_CPU(struct sched_domain *, sd_llc);
546DECLARE_PER_CPU(int, sd_llc_id);
547
548extern int group_balance_cpu(struct sched_group *sg);
549
550#endif /* CONFIG_SMP */
551
552#include "stats.h"
553#include "auto_group.h"
554
555#ifdef CONFIG_CGROUP_SCHED
556
557/*
558 * Return the group to which this task belongs.
559 *
560 * We cannot use task_subsys_state() and friends because the cgroup
561 * subsystem changes that value before the cgroup_subsys::attach() method
562 * is called, therefore we cannot pin it and might observe the wrong value.
563 *
564 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
565 * core changes this before calling sched_move_task().
566 *
567 * Instead we use a 'copy' which is updated from sched_move_task() while
568 * holding both task_struct::pi_lock and rq::lock.
569 */
570static inline struct task_group *task_group(struct task_struct *p)
571{
572 return p->sched_task_group;
573}
574
575/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
576static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
577{
578#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
579 struct task_group *tg = task_group(p);
580#endif
581
582#ifdef CONFIG_FAIR_GROUP_SCHED
583 p->se.cfs_rq = tg->cfs_rq[cpu];
584 p->se.parent = tg->se[cpu];
585#endif
586
587#ifdef CONFIG_RT_GROUP_SCHED
588 p->rt.rt_rq = tg->rt_rq[cpu];
589 p->rt.parent = tg->rt_se[cpu];
590#endif
591}
592
593#else /* CONFIG_CGROUP_SCHED */
594
595static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
596static inline struct task_group *task_group(struct task_struct *p)
597{
598 return NULL;
599}
600
601#endif /* CONFIG_CGROUP_SCHED */
602
603static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
604{
605 set_task_rq(p, cpu);
606#ifdef CONFIG_SMP
607 /*
608	 * After ->cpu is set to a new value, task_rq_lock(p, ...) can be
609	 * successfully executed on another CPU. We must ensure that updates of
610 * per-task data have been completed by this moment.
611 */
612 smp_wmb();
613 task_thread_info(p)->cpu = cpu;
614#endif
615}
616
617/*
618 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
619 */
620#ifdef CONFIG_SCHED_DEBUG
621# include <linux/static_key.h>
622# define const_debug __read_mostly
623#else
624# define const_debug const
625#endif
626
627extern const_debug unsigned int sysctl_sched_features;
628
629#define SCHED_FEAT(name, enabled) \
630 __SCHED_FEAT_##name ,
631
632enum {
633#include "features.h"
634 __SCHED_FEAT_NR,
635};
636
637#undef SCHED_FEAT
638
639#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
640static __always_inline bool static_branch__true(struct static_key *key)
641{
642 return static_key_true(key); /* Not out of line branch. */
643}
644
645static __always_inline bool static_branch__false(struct static_key *key)
646{
647 return static_key_false(key); /* Out of line branch. */
648}
649
650#define SCHED_FEAT(name, enabled) \
651static __always_inline bool static_branch_##name(struct static_key *key) \
652{ \
653 return static_branch__##enabled(key); \
654}
655
656#include "features.h"
657
658#undef SCHED_FEAT
659
660extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
661#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
662#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
663#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
664#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
665
666#ifdef CONFIG_NUMA_BALANCING
667#define sched_feat_numa(x) sched_feat(x)
668#ifdef CONFIG_SCHED_DEBUG
669#define numabalancing_enabled sched_feat_numa(NUMA)
670#else
671extern bool numabalancing_enabled;
672#endif /* CONFIG_SCHED_DEBUG */
673#else
674#define sched_feat_numa(x) (0)
675#define numabalancing_enabled (0)
676#endif /* CONFIG_NUMA_BALANCING */
677
678static inline u64 global_rt_period(void)
679{
680 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
681}
682
683static inline u64 global_rt_runtime(void)
684{
685 if (sysctl_sched_rt_runtime < 0)
686 return RUNTIME_INF;
687
688 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
689}
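/*
 * Worked example with the usual defaults (a sketch; assumes
 * sysctl_sched_rt_period == 1000000 us and
 * sysctl_sched_rt_runtime == 950000 us):
 *
 *	global_rt_period()  == 1000000 * NSEC_PER_USEC == 1 s
 *	global_rt_runtime() ==  950000 * NSEC_PER_USEC == 0.95 s
 *
 * i.e. RT tasks may consume at most 95% of every second, and writing -1
 * to sched_rt_runtime_us yields RUNTIME_INF (no throttling).
 */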
690
691
692
693static inline int task_current(struct rq *rq, struct task_struct *p)
694{
695 return rq->curr == p;
696}
697
698static inline int task_running(struct rq *rq, struct task_struct *p)
699{
700#ifdef CONFIG_SMP
701 return p->on_cpu;
702#else
703 return task_current(rq, p);
704#endif
705}
706
707
708#ifndef prepare_arch_switch
709# define prepare_arch_switch(next) do { } while (0)
710#endif
711#ifndef finish_arch_switch
712# define finish_arch_switch(prev) do { } while (0)
713#endif
714#ifndef finish_arch_post_lock_switch
715# define finish_arch_post_lock_switch() do { } while (0)
716#endif
717
718#ifndef __ARCH_WANT_UNLOCKED_CTXSW
719static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
720{
721#ifdef CONFIG_SMP
722 /*
723 * We can optimise this out completely for !SMP, because the
724 * SMP rebalancing from interrupt is the only thing that cares
725 * here.
726 */
727 next->on_cpu = 1;
728#endif
729}
730
731static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
732{
733#ifdef CONFIG_SMP
734 /*
735 * After ->on_cpu is cleared, the task can be moved to a different CPU.
736 * We must ensure this doesn't happen until the switch is completely
737 * finished.
738 */
739 smp_wmb();
740 prev->on_cpu = 0;
741#endif
742#ifdef CONFIG_DEBUG_SPINLOCK
743 /* this is a valid case when another task releases the spinlock */
744 rq->lock.owner = current;
745#endif
746 /*
747 * If we are tracking spinlock dependencies then we have to
748 * fix up the runqueue lock - which gets 'carried over' from
749 * prev into current:
750 */
751 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
752
753 raw_spin_unlock_irq(&rq->lock);
754}
755
756#else /* __ARCH_WANT_UNLOCKED_CTXSW */
757static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
758{
759#ifdef CONFIG_SMP
760 /*
761 * We can optimise this out completely for !SMP, because the
762 * SMP rebalancing from interrupt is the only thing that cares
763 * here.
764 */
765 next->on_cpu = 1;
766#endif
767 raw_spin_unlock(&rq->lock);
768}
769
770static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
771{
772#ifdef CONFIG_SMP
773 /*
774 * After ->on_cpu is cleared, the task can be moved to a different CPU.
775 * We must ensure this doesn't happen until the switch is completely
776 * finished.
777 */
778 smp_wmb();
779 prev->on_cpu = 0;
780#endif
781 local_irq_enable();
782}
783#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
784
785
786static inline void update_load_add(struct load_weight *lw, unsigned long inc)
787{
788 lw->weight += inc;
789 lw->inv_weight = 0;
790}
791
792static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
793{
794 lw->weight -= dec;
795 lw->inv_weight = 0;
796}
797
798static inline void update_load_set(struct load_weight *lw, unsigned long w)
799{
800 lw->weight = w;
801 lw->inv_weight = 0;
802}
803
804/*
805 * To aid in avoiding the subversion of "niceness" due to uneven distribution
806 * of tasks with abnormal "nice" values across CPUs, the contribution that
807 * each task makes to its run queue's load is weighted according to its
808 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
809 * scaled version of the new time slice allocation that they receive on time
810 * slice expiry etc.
811 */
812
813#define WEIGHT_IDLEPRIO 3
814#define WMULT_IDLEPRIO 1431655765
815
816/*
817 * Nice levels are multiplicative, with a gentle 10% change for every
818 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
819 * nice 1, it will get ~10% less CPU time than another CPU-bound task
820 * that remained on nice 0.
821 *
822 * The "10% effect" is relative and cumulative: from _any_ nice level,
823 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
824 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
825 * If a task goes up by ~10% and another task goes down by ~10% then
826 * the relative distance between them is ~25%.)
827 */
828static const int prio_to_weight[40] = {
829 /* -20 */ 88761, 71755, 56483, 46273, 36291,
830 /* -15 */ 29154, 23254, 18705, 14949, 11916,
831 /* -10 */ 9548, 7620, 6100, 4904, 3906,
832 /* -5 */ 3121, 2501, 1991, 1586, 1277,
833 /* 0 */ 1024, 820, 655, 526, 423,
834 /* 5 */ 335, 272, 215, 172, 137,
835 /* 10 */ 110, 87, 70, 56, 45,
836 /* 15 */ 36, 29, 23, 18, 15,
837};
838
839/*
840 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
841 *
842 * In cases where the weight does not change often, we can use the
843 * precalculated inverse to speed up arithmetic by turning divisions
844 * into multiplications:
845 */
846static const u32 prio_to_wmult[40] = {
847 /* -20 */ 48388, 59856, 76040, 92818, 118348,
848 /* -15 */ 147320, 184698, 229616, 287308, 360437,
849 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
850 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
851 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
852 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
853 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
854 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
855};
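/*
 * Worked example for the two tables above (illustrative): a nice-0 task
 * has weight 1024 and a nice-1 task has weight 820, so with both
 * runnable the nice-1 task gets about 820 / (1024 + 820) ~= 44.5% of
 * the CPU versus ~55.5%, i.e. the 1024/820 ~= 1.25 relative distance
 * described in the comment above. The inverse table turns the division
 * by 820 into a multiply and shift:
 *
 *	x / 820 ~= (x * prio_to_wmult[21]) >> 32 == (x * 5237765) >> 32
 *
 * since 5237765 ~= 2^32 / 820.
 */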
856
857/* Time spent by the tasks of the cpu accounting group executing in ... */
858enum cpuacct_stat_index {
859 CPUACCT_STAT_USER, /* ... user mode */
860 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
861
862 CPUACCT_STAT_NSTATS,
863};
864
865
866#define sched_class_highest (&stop_sched_class)
867#define for_each_class(class) \
868 for (class = sched_class_highest; class; class = class->next)
869
870extern const struct sched_class stop_sched_class;
871extern const struct sched_class rt_sched_class;
872extern const struct sched_class fair_sched_class;
873extern const struct sched_class idle_sched_class;
874
875
876#ifdef CONFIG_SMP
877
878extern void trigger_load_balance(struct rq *rq, int cpu);
879extern void idle_balance(int this_cpu, struct rq *this_rq);
880
881#else /* CONFIG_SMP */
882
883static inline void idle_balance(int cpu, struct rq *rq)
884{
885}
886
887#endif
888
889extern void sysrq_sched_debug_show(void);
890extern void sched_init_granularity(void);
891extern void update_max_interval(void);
892extern void update_group_power(struct sched_domain *sd, int cpu);
893extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
894extern void init_sched_rt_class(void);
895extern void init_sched_fair_class(void);
896
897extern void resched_task(struct task_struct *p);
898extern void resched_cpu(int cpu);
899
900extern struct rt_bandwidth def_rt_bandwidth;
901extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
902
903extern void update_idle_cpu_load(struct rq *this_rq);
904
905#ifdef CONFIG_CGROUP_CPUACCT
906#include <linux/cgroup.h>
907/* track cpu usage of a group of tasks and its child groups */
908struct cpuacct {
909 struct cgroup_subsys_state css;
910 /* cpuusage holds pointer to a u64-type object on every cpu */
911 u64 __percpu *cpuusage;
912 struct kernel_cpustat __percpu *cpustat;
913};
914
915extern struct cgroup_subsys cpuacct_subsys;
916extern struct cpuacct root_cpuacct;
917
918/* return cpu accounting group corresponding to this container */
919static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
920{
921 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
922 struct cpuacct, css);
923}
924
925/* return cpu accounting group to which this task belongs */
926static inline struct cpuacct *task_ca(struct task_struct *tsk)
927{
928 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
929 struct cpuacct, css);
930}
931
932static inline struct cpuacct *parent_ca(struct cpuacct *ca)
933{
934 if (!ca || !ca->css.cgroup->parent)
935 return NULL;
936 return cgroup_ca(ca->css.cgroup->parent);
937}
938
939extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
940#else
941static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
942#endif
943
944#ifdef CONFIG_PARAVIRT
945static inline u64 steal_ticks(u64 steal)
946{
947 if (unlikely(steal > NSEC_PER_SEC))
948 return div_u64(steal, TICK_NSEC);
949
950 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
951}
952#endif
953
954static inline void inc_nr_running(struct rq *rq)
955{
956 rq->nr_running++;
957}
958
959static inline void dec_nr_running(struct rq *rq)
960{
961 rq->nr_running--;
962}
963
964extern void update_rq_clock(struct rq *rq);
965
966extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
967extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
968
969extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
970
971extern const_debug unsigned int sysctl_sched_time_avg;
972extern const_debug unsigned int sysctl_sched_nr_migrate;
973extern const_debug unsigned int sysctl_sched_migration_cost;
974
975static inline u64 sched_avg_period(void)
976{
977 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
978}
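/*
 * Worked example (a sketch; assumes the default sysctl_sched_time_avg
 * of 1000 ms):
 *
 *	sched_avg_period() == 1000 * NSEC_PER_MSEC / 2 == 500000000 ns
 *
 * i.e. rt_avg is averaged over half-second periods.
 */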
979
980#ifdef CONFIG_SCHED_HRTICK
981
982/*
983 * Use hrtick when:
984 * - enabled by features
985 * - hrtimer is actually high res
986 */
987static inline int hrtick_enabled(struct rq *rq)
988{
989 if (!sched_feat(HRTICK))
990 return 0;
991 if (!cpu_active(cpu_of(rq)))
992 return 0;
993 return hrtimer_is_hres_active(&rq->hrtick_timer);
994}
995
996void hrtick_start(struct rq *rq, u64 delay);
997
998#else
999
1000static inline int hrtick_enabled(struct rq *rq)
1001{
1002 return 0;
1003}
1004
1005#endif /* CONFIG_SCHED_HRTICK */
1006
1007#ifdef CONFIG_SMP
1008extern void sched_avg_update(struct rq *rq);
1009static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1010{
1011 rq->rt_avg += rt_delta;
1012 sched_avg_update(rq);
1013}
1014#else
1015static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
1016static inline void sched_avg_update(struct rq *rq) { }
1017#endif
1018
1019extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
1020
1021#ifdef CONFIG_SMP
1022#ifdef CONFIG_PREEMPT
1023
1024static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
1025
1026/*
1027 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1028 * way at the expense of forcing extra atomic operations in all
1029 * invocations. This assures that the double_lock is acquired using the
1030 * same underlying policy as the spinlock_t on this architecture, which
1031 * reduces latency compared to the unfair variant below. However, it
1032 * also adds more overhead and therefore may reduce throughput.
1033 */
1034static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1035 __releases(this_rq->lock)
1036 __acquires(busiest->lock)
1037 __acquires(this_rq->lock)
1038{
1039 raw_spin_unlock(&this_rq->lock);
1040 double_rq_lock(this_rq, busiest);
1041
1042 return 1;
1043}
1044
1045#else
1046/*
1047 * Unfair double_lock_balance: Optimizes throughput at the expense of
1048 * latency by eliminating extra atomic operations when the locks are
1049 * already in proper order on entry. This favors lower cpu-ids and will
1050 * grant the double lock to lower cpus over higher ids under contention,
1051 * regardless of entry order into the function.
1052 */
1053static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1054 __releases(this_rq->lock)
1055 __acquires(busiest->lock)
1056 __acquires(this_rq->lock)
1057{
1058 int ret = 0;
1059
1060 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1061 if (busiest < this_rq) {
1062 raw_spin_unlock(&this_rq->lock);
1063 raw_spin_lock(&busiest->lock);
1064 raw_spin_lock_nested(&this_rq->lock,
1065 SINGLE_DEPTH_NESTING);
1066 ret = 1;
1067 } else
1068 raw_spin_lock_nested(&busiest->lock,
1069 SINGLE_DEPTH_NESTING);
1070 }
1071 return ret;
1072}
1073
1074#endif /* CONFIG_PREEMPT */
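/*
 * Note on the deadlock avoidance above (an illustrative sketch): both
 * variants end up taking the two rq locks in ascending address order
 * whenever the second lock cannot be taken immediately, e.g.
 *
 *	if (busiest < this_rq) { lock(busiest); lock(this_rq); }
 *	else                   { lock(this_rq); lock(busiest); }
 *
 * Since every path agrees on that global order, two CPUs can never each
 * hold one rq lock while waiting for the other's, so circular wait (and
 * therefore deadlock) is impossible.
 */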
1075
1076/*
1077 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1078 */
1079static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1080{
1081 if (unlikely(!irqs_disabled())) {
1082		/* printk() doesn't work well under rq->lock */
1083 raw_spin_unlock(&this_rq->lock);
1084 BUG_ON(1);
1085 }
1086
1087 return _double_lock_balance(this_rq, busiest);
1088}
1089
1090static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1091 __releases(busiest->lock)
1092{
1093 raw_spin_unlock(&busiest->lock);
1094 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1095}
1096
1097/*
1098 * double_rq_lock - safely lock two runqueues
1099 *
1100 * Note this does not disable interrupts like task_rq_lock,
1101 * you need to do so manually before calling.
1102 */
1103static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1104 __acquires(rq1->lock)
1105 __acquires(rq2->lock)
1106{
1107 BUG_ON(!irqs_disabled());
1108 if (rq1 == rq2) {
1109 raw_spin_lock(&rq1->lock);
1110 __acquire(rq2->lock); /* Fake it out ;) */
1111 } else {
1112 if (rq1 < rq2) {
1113 raw_spin_lock(&rq1->lock);
1114 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1115 } else {
1116 raw_spin_lock(&rq2->lock);
1117 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1118 }
1119 }
1120}
1121
1122/*
1123 * double_rq_unlock - safely unlock two runqueues
1124 *
1125 * Note this does not restore interrupts like task_rq_unlock,
1126 * you need to do so manually after calling.
1127 */
1128static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1129 __releases(rq1->lock)
1130 __releases(rq2->lock)
1131{
1132 raw_spin_unlock(&rq1->lock);
1133 if (rq1 != rq2)
1134 raw_spin_unlock(&rq2->lock);
1135 else
1136 __release(rq2->lock);
1137}
1138
1139#else /* CONFIG_SMP */
1140
1141/*
1142 * double_rq_lock - safely lock two runqueues
1143 *
1144 * Note this does not disable interrupts like task_rq_lock,
1145 * you need to do so manually before calling.
1146 */
1147static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1148 __acquires(rq1->lock)
1149 __acquires(rq2->lock)
1150{
1151 BUG_ON(!irqs_disabled());
1152 BUG_ON(rq1 != rq2);
1153 raw_spin_lock(&rq1->lock);
1154 __acquire(rq2->lock); /* Fake it out ;) */
1155}
1156
1157/*
1158 * double_rq_unlock - safely unlock two runqueues
1159 *
1160 * Note this does not restore interrupts like task_rq_unlock,
1161 * you need to do so manually after calling.
1162 */
1163static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1164 __releases(rq1->lock)
1165 __releases(rq2->lock)
1166{
1167 BUG_ON(rq1 != rq2);
1168 raw_spin_unlock(&rq1->lock);
1169 __release(rq2->lock);
1170}
1171
1172#endif
1173
1174extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1175extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1176extern void print_cfs_stats(struct seq_file *m, int cpu);
1177extern void print_rt_stats(struct seq_file *m, int cpu);
1178
1179extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1180extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1181
1182extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1183
1184#ifdef CONFIG_NO_HZ
1185enum rq_nohz_flag_bits {
1186 NOHZ_TICK_STOPPED,
1187 NOHZ_BALANCE_KICK,
1188 NOHZ_IDLE,
1189};
1190
1191#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1192#endif
1193
1194#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1195
1196DECLARE_PER_CPU(u64, cpu_hardirq_time);
1197DECLARE_PER_CPU(u64, cpu_softirq_time);
1198
1199#ifndef CONFIG_64BIT
1200DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1201
1202static inline void irq_time_write_begin(void)
1203{
1204 __this_cpu_inc(irq_time_seq.sequence);
1205 smp_wmb();
1206}
1207
1208static inline void irq_time_write_end(void)
1209{
1210 smp_wmb();
1211 __this_cpu_inc(irq_time_seq.sequence);
1212}
1213
1214static inline u64 irq_time_read(int cpu)
1215{
1216 u64 irq_time;
1217 unsigned seq;
1218
1219 do {
1220 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1221 irq_time = per_cpu(cpu_softirq_time, cpu) +
1222 per_cpu(cpu_hardirq_time, cpu);
1223 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1224
1225 return irq_time;
1226}
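/*
 * Sketch of why the sequence counter above suffices on 32-bit: the
 * writer makes irq_time_seq odd, updates the two u64 counters (which
 * cannot be read atomically on 32-bit), then makes it even again. A
 * reader that sees an odd count, or a count that changed across its
 * reads, retries, so it never returns a torn softirq + hardirq sum.
 */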
1227#else /* CONFIG_64BIT */
1228static inline void irq_time_write_begin(void)
1229{
1230}
1231
1232static inline void irq_time_write_end(void)
1233{
1234}
1235
1236static inline u64 irq_time_read(int cpu)
1237{
1238 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1239}
1240#endif /* CONFIG_64BIT */
1241#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
deleted file mode 100644
index 903ffa9e887..00000000000
--- a/kernel/sched/stats.c
+++ /dev/null
@@ -1,111 +0,0 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
9/*
10 * bump this up when changing the output format or the meaning of an existing
11 * format, so that tools can adapt (or abort)
12 */
13#define SCHEDSTAT_VERSION 15
14
15static int show_schedstat(struct seq_file *seq, void *v)
16{
17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
25 seq_printf(seq, "timestamp %lu\n", jiffies);
26 for_each_online_cpu(cpu) {
27 struct rq *rq = cpu_rq(cpu);
28#ifdef CONFIG_SMP
29 struct sched_domain *sd;
30 int dcount = 0;
31#endif
32
33 /* runqueue-specific stats */
34 seq_printf(seq,
35 "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count,
37 rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
41
42 seq_printf(seq, "\n");
43
44#ifdef CONFIG_SMP
45 /* domain-specific stats */
46 rcu_read_lock();
47 for_each_domain(cpu, sd) {
48 enum cpu_idle_type itype;
49
50 cpumask_scnprintf(mask_str, mask_len,
51 sched_domain_span(sd));
52 seq_printf(seq, "domain%d %s", dcount++, mask_str);
53 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
54 itype++) {
55 seq_printf(seq, " %u %u %u %u %u %u %u %u",
56 sd->lb_count[itype],
57 sd->lb_balanced[itype],
58 sd->lb_failed[itype],
59 sd->lb_imbalance[itype],
60 sd->lb_gained[itype],
61 sd->lb_hot_gained[itype],
62 sd->lb_nobusyq[itype],
63 sd->lb_nobusyg[itype]);
64 }
65 seq_printf(seq,
66 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
67 sd->alb_count, sd->alb_failed, sd->alb_pushed,
68 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
69 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
70 sd->ttwu_wake_remote, sd->ttwu_move_affine,
71 sd->ttwu_move_balance);
72 }
73 rcu_read_unlock();
74#endif
75 }
76 kfree(mask_str);
77 return 0;
78}
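/*
 * Example of the per-cpu line emitted above (illustrative; all values
 * are hypothetical):
 *
 *	cpu0 123 0 4567 890 2345 678 91011 121314 1516
 *
 * i.e. yld_count, a literal 0, sched_count, sched_goidle, ttwu_count,
 * ttwu_local, rq_cpu_time, run_delay and pcount, followed on SMP by one
 * "domainN <cpumask> ..." line per sched domain.
 */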
79
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
98
99static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open,
101 .read = seq_read,
102 .llseek = seq_lseek,
103 .release = single_release,
104};
105
106static int __init proc_schedstat_init(void)
107{
108 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
109 return 0;
110}
111module_init(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
deleted file mode 100644
index 2ef90a51ec5..00000000000
--- a/kernel/sched/stats.h
+++ /dev/null
@@ -1,231 +0,0 @@
1
2#ifdef CONFIG_SCHEDSTATS
3
4/*
5 * Expects runqueue lock to be held for atomicity of update
6 */
7static inline void
8rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
9{
10 if (rq) {
11 rq->rq_sched_info.run_delay += delta;
12 rq->rq_sched_info.pcount++;
13 }
14}
15
16/*
17 * Expects runqueue lock to be held for atomicity of update
18 */
19static inline void
20rq_sched_info_depart(struct rq *rq, unsigned long long delta)
21{
22 if (rq)
23 rq->rq_cpu_time += delta;
24}
25
26static inline void
27rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
28{
29 if (rq)
30 rq->rq_sched_info.run_delay += delta;
31}
32# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
33# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
34# define schedstat_set(var, val) do { var = (val); } while (0)
35#else /* !CONFIG_SCHEDSTATS */
36static inline void
37rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
38{}
39static inline void
40rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
41{}
42static inline void
43rq_sched_info_depart(struct rq *rq, unsigned long long delta)
44{}
45# define schedstat_inc(rq, field) do { } while (0)
46# define schedstat_add(rq, field, amt) do { } while (0)
47# define schedstat_set(var, val) do { } while (0)
48#endif
49
50#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
51static inline void sched_info_reset_dequeued(struct task_struct *t)
52{
53 t->sched_info.last_queued = 0;
54}
55
56/*
57 * We are interested in knowing how long it was from the *first* time a
58 * task was queued to the time that it finally hit a cpu. We call this routine
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew.
61 */
62static inline void sched_info_dequeued(struct task_struct *t)
63{
64 unsigned long long now = task_rq(t)->clock, delta = 0;
65
66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued)
68 delta = now - t->sched_info.last_queued;
69 sched_info_reset_dequeued(t);
70 t->sched_info.run_delay += delta;
71
72 rq_sched_info_dequeued(task_rq(t), delta);
73}
74
75/*
76 * Called when a task finally hits the cpu. We can now calculate how
77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is.
79 */
80static void sched_info_arrive(struct task_struct *t)
81{
82 unsigned long long now = task_rq(t)->clock, delta = 0;
83
84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued;
86 sched_info_reset_dequeued(t);
87 t->sched_info.run_delay += delta;
88 t->sched_info.last_arrival = now;
89 t->sched_info.pcount++;
90
91 rq_sched_info_arrive(task_rq(t), delta);
92}
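/*
 * Worked example (hypothetical clock values): a task queued at
 * rq->clock == 1000000 ns that first gets the CPU at 1750000 ns has
 * waited 750000 ns, so both t->sched_info.run_delay and the rq-wide
 * rq_sched_info.run_delay grow by 750000 and both pcount fields are
 * bumped by one.
 */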
93
94/*
95 * This function is only called from enqueue_task(), but also only updates
96 * the timestamp if it is not already set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */
99static inline void sched_info_queued(struct task_struct *t)
100{
101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = task_rq(t)->clock;
104}
105
106/*
107 * Called when a process ceases being the active-running process, either
108 * voluntarily or involuntarily. Now we can calculate how long we ran.
109 * Also, if the process is still in the TASK_RUNNING state, call
110 * sched_info_queued() to mark that it has now again started waiting on
111 * the runqueue.
112 */
113static inline void sched_info_depart(struct task_struct *t)
114{
115 unsigned long long delta = task_rq(t)->clock -
116 t->sched_info.last_arrival;
117
118 rq_sched_info_depart(task_rq(t), delta);
119
120 if (t->state == TASK_RUNNING)
121 sched_info_queued(t);
122}
123
124/*
125 * Called when tasks are switched involuntarily due, typically, to expiring
126 * their time slice. (This may also be called when switching to or from
127 * the idle task.) We are only called when prev != next.
128 */
129static inline void
130__sched_info_switch(struct task_struct *prev, struct task_struct *next)
131{
132 struct rq *rq = task_rq(prev);
133
134 /*
135 * prev now departs the cpu. It's not interesting to record
136 * stats about how efficient we were at scheduling the idle
137 * process, however.
138 */
139 if (prev != rq->idle)
140 sched_info_depart(prev);
141
142 if (next != rq->idle)
143 sched_info_arrive(next);
144}
145static inline void
146sched_info_switch(struct task_struct *prev, struct task_struct *next)
147{
148 if (unlikely(sched_info_on()))
149 __sched_info_switch(prev, next);
150}
151#else
152#define sched_info_queued(t) do { } while (0)
153#define sched_info_reset_dequeued(t) do { } while (0)
154#define sched_info_dequeued(t) do { } while (0)
155#define sched_info_switch(t, next) do { } while (0)
156#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
157
158/*
159 * The following are functions that support scheduler-internal time accounting.
160 * These functions are generally called at the timer tick. None of this depends
161 * on CONFIG_SCHEDSTATS.
162 */
163
164/**
165 * account_group_user_time - Maintain utime for a thread group.
166 *
167 * @tsk: Pointer to task structure.
168 * @cputime: Time value by which to increment the utime field of the
169 * thread_group_cputime structure.
170 *
171 * If thread group time is being maintained, get the structure for the
172 * running CPU and update the utime field there.
173 */
174static inline void account_group_user_time(struct task_struct *tsk,
175 cputime_t cputime)
176{
177 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
178
179 if (!cputimer->running)
180 return;
181
182 raw_spin_lock(&cputimer->lock);
183 cputimer->cputime.utime += cputime;
184 raw_spin_unlock(&cputimer->lock);
185}
186
187/**
188 * account_group_system_time - Maintain stime for a thread group.
189 *
190 * @tsk: Pointer to task structure.
191 * @cputime: Time value by which to increment the stime field of the
192 * thread_group_cputime structure.
193 *
194 * If thread group time is being maintained, get the structure for the
195 * running CPU and update the stime field there.
196 */
197static inline void account_group_system_time(struct task_struct *tsk,
198 cputime_t cputime)
199{
200 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
201
202 if (!cputimer->running)
203 return;
204
205 raw_spin_lock(&cputimer->lock);
206 cputimer->cputime.stime += cputime;
207 raw_spin_unlock(&cputimer->lock);
208}
209
210/**
211 * account_group_exec_runtime - Maintain exec runtime for a thread group.
212 *
213 * @tsk: Pointer to task structure.
214 * @ns: Time value by which to increment the sum_exec_runtime field
215 * of the thread_group_cputime structure.
216 *
217 * If thread group time is being maintained, get the structure for the
218 * running CPU and update the sum_exec_runtime field there.
219 */
220static inline void account_group_exec_runtime(struct task_struct *tsk,
221 unsigned long long ns)
222{
223 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
224
225 if (!cputimer->running)
226 return;
227
228 raw_spin_lock(&cputimer->lock);
229 cputimer->cputime.sum_exec_runtime += ns;
230 raw_spin_unlock(&cputimer->lock);
231}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
deleted file mode 100644
index da5eb5bed84..00000000000
--- a/kernel/sched/stop_task.c
+++ /dev/null
@@ -1,128 +0,0 @@
1#include "sched.h"
2
3/*
4 * stop-task scheduling class.
5 *
6 * The stop task is the highest-priority task in the system; it preempts
7 * everything and will be preempted by nothing.
8 *
9 * See kernel/stop_machine.c
10 */
11
12#ifdef CONFIG_SMP
13static int
14select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
15{
16 return task_cpu(p); /* stop tasks never migrate */
17}
18#endif /* CONFIG_SMP */
19
20static void
21check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
22{
23 /* we're never preempted */
24}
25
26static struct task_struct *pick_next_task_stop(struct rq *rq)
27{
28 struct task_struct *stop = rq->stop;
29
30 if (stop && stop->on_rq) {
31 stop->se.exec_start = rq->clock_task;
32 return stop;
33 }
34
35 return NULL;
36}
37
38static void
39enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
40{
41 inc_nr_running(rq);
42}
43
44static void
45dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
46{
47 dec_nr_running(rq);
48}
49
50static void yield_task_stop(struct rq *rq)
51{
52 BUG(); /* the stop task should never yield, it's pointless. */
53}
54
55static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
56{
57 struct task_struct *curr = rq->curr;
58 u64 delta_exec;
59
60 delta_exec = rq->clock_task - curr->se.exec_start;
61 if (unlikely((s64)delta_exec < 0))
62 delta_exec = 0;
63
64 schedstat_set(curr->se.statistics.exec_max,
65 max(curr->se.statistics.exec_max, delta_exec));
66
67 curr->se.sum_exec_runtime += delta_exec;
68 account_group_exec_runtime(curr, delta_exec);
69
70 curr->se.exec_start = rq->clock_task;
71 cpuacct_charge(curr, delta_exec);
72}
73
74static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
75{
76}
77
78static void set_curr_task_stop(struct rq *rq)
79{
80 struct task_struct *stop = rq->stop;
81
82 stop->se.exec_start = rq->clock_task;
83}
84
85static void switched_to_stop(struct rq *rq, struct task_struct *p)
86{
87 BUG(); /* it's impossible to change to this class */
88}
89
90static void
91prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
92{
93 BUG(); /* how!? what priority? */
94}
95
96static unsigned int
97get_rr_interval_stop(struct rq *rq, struct task_struct *task)
98{
99 return 0;
100}
101
102/*
103 * Simple, special scheduling class for the per-CPU stop tasks:
104 */
105const struct sched_class stop_sched_class = {
106 .next = &rt_sched_class,
107
108 .enqueue_task = enqueue_task_stop,
109 .dequeue_task = dequeue_task_stop,
110 .yield_task = yield_task_stop,
111
112 .check_preempt_curr = check_preempt_curr_stop,
113
114 .pick_next_task = pick_next_task_stop,
115 .put_prev_task = put_prev_task_stop,
116
117#ifdef CONFIG_SMP
118 .select_task_rq = select_task_rq_stop,
119#endif
120
121 .set_curr_task = set_curr_task_stop,
122 .task_tick = task_tick_stop,
123
124 .get_rr_interval = get_rr_interval_stop,
125
126 .prio_changed = prio_changed_stop,
127 .switched_to = switched_to_stop,
128};
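The class deleted above exists only to run the per-CPU stop tasks used by kernel/stop_machine.c. As a hedged illustration of how work normally reaches a stop task, here is a minimal module sketch; stop_one_cpu() and cpu_stop_fn_t come from <linux/stop_machine.h>, and the callback name is made up:

#include <linux/module.h>
#include <linux/smp.h>
#include <linux/stop_machine.h>

/* Runs on the chosen CPU in the context of its stop task, preempting everything else. */
static int demo_stop_fn(void *arg)
{
	pr_info("running from the stop task of CPU %d\n", smp_processor_id());
	return 0;
}

static int __init demo_init(void)
{
	/* Queue work for CPU 0's stop task and wait for it to complete. */
	return stop_one_cpu(0, demo_stop_fn, NULL);
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");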
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5af44b59377..57d4b13b631 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,357 +3,15 @@
3 * 3 *
4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> 4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
5 * 5 *
6 * Copyright (C) 2012 Google, Inc. 6 * This defines a simple but solid secure-computing mode.
7 * Will Drewry <wad@chromium.org>
8 *
9 * This defines a simple but solid secure-computing facility.
10 *
11 * Mode 1 uses a fixed list of allowed system calls.
12 * Mode 2 allows user-defined system call filters in the form
13 * of Berkeley Packet Filters/Linux Socket Filters.
14 */ 7 */
15 8
16#include <linux/atomic.h>
17#include <linux/audit.h>
18#include <linux/compat.h>
19#include <linux/sched.h>
20#include <linux/seccomp.h> 9#include <linux/seccomp.h>
10#include <linux/sched.h>
11#include <linux/compat.h>
21 12
22/* #define SECCOMP_DEBUG 1 */ 13/* #define SECCOMP_DEBUG 1 */
23 14#define NR_SECCOMP_MODES 1
24#ifdef CONFIG_SECCOMP_FILTER
25#include <asm/syscall.h>
26#include <linux/filter.h>
27#include <linux/ptrace.h>
28#include <linux/security.h>
29#include <linux/slab.h>
30#include <linux/tracehook.h>
31#include <linux/uaccess.h>
32
33/**
34 * struct seccomp_filter - container for seccomp BPF programs
35 *
36 * @usage: reference count to manage the object lifetime.
37 * get/put helpers should be used when accessing an instance
38 * outside of a lifetime-guarded section. In general, this
39 * is only needed for handling filters shared across tasks.
40 * @prev: points to a previously installed, or inherited, filter
41 * @len: the number of instructions in the program
42 * @insns: the BPF program instructions to evaluate
43 *
44 * seccomp_filter objects are organized in a tree linked via the @prev
45 * pointer. For any task, it appears to be a singly-linked list starting
46 * with current->seccomp.filter, the most recently attached or inherited filter.
47 * However, multiple filters may share a @prev node, by way of fork(), which
48 * results in a unidirectional tree existing in memory. This is similar to
49 * how namespaces work.
50 *
51 * seccomp_filter objects should never be modified after being attached
52 * to a task_struct (other than @usage).
53 */
54struct seccomp_filter {
55 atomic_t usage;
56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */
58 struct sock_filter insns[];
59};
60
61/* Limit any path through the tree to 256KB worth of instructions. */
62#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
63
64/**
65 * get_u32 - returns a u32 offset into data
66 * @data: an unsigned 64-bit value
67 * @index: 0 or 1 to return the first or second 32-bits
68 *
69 * This inline exists to hide the length of unsigned long. If a 32-bit
70 * unsigned long is passed in, it will be extended and the top 32-bits will be
71 * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
72 * properly returned.
73 *
74 * Endianness is explicitly ignored and left for BPF program authors to manage
75 * as per the specific architecture.
76 */
77static inline u32 get_u32(u64 data, int index)
78{
79 return ((u32 *)&data)[index];
80}
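A tiny userspace analogue of the split get_u32() performs, with memcpy() standing in for the kernel's direct cast (the result is endianness-dependent, exactly as the comment warns):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Userspace sketch of get_u32(): view a u64 as two raw u32 halves. */
static uint32_t get_u32(uint64_t data, int index)
{
	uint32_t half;

	memcpy(&half, (uint8_t *)&data + index * sizeof(half), sizeof(half));
	return half;
}

int main(void)
{
	uint64_t value = 0x1122334455667788ULL;

	/* On little-endian machines index 0 is the low half; on big-endian it is the high half. */
	printf("index 0: %#x, index 1: %#x\n", get_u32(value, 0), get_u32(value, 1));
	return 0;
}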
81
82/* Helper for bpf_load below. */
83#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
84/**
85 * bpf_load: checks and returns a pointer to the requested offset
86 * @off: offset into struct seccomp_data to load from
87 *
88 * Returns the requested 32-bits of data.
89 * seccomp_check_filter() should assure that @off is 32-bit aligned
90 * and not out of bounds. Failure to do so is a BUG.
91 */
92u32 seccomp_bpf_load(int off)
93{
94 struct pt_regs *regs = task_pt_regs(current);
95 if (off == BPF_DATA(nr))
96 return syscall_get_nr(current, regs);
97 if (off == BPF_DATA(arch))
98 return syscall_get_arch(current, regs);
99 if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
100 unsigned long value;
101 int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
102 int index = !!(off % sizeof(u64));
103 syscall_get_arguments(current, regs, arg, 1, &value);
104 return get_u32(value, index);
105 }
106 if (off == BPF_DATA(instruction_pointer))
107 return get_u32(KSTK_EIP(current), 0);
108 if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
109 return get_u32(KSTK_EIP(current), 1);
110 /* seccomp_check_filter should make this impossible. */
111 BUG();
112}
113
114/**
115 * seccomp_check_filter - verify seccomp filter code
116 * @filter: filter to verify
117 * @flen: length of filter
118 *
119 * Takes a previously checked filter (by sk_chk_filter) and
120 * redirects all filter code that loads struct sk_buff data
121 * and related data through seccomp_bpf_load. It also
122 * enforces length and alignment checking of those loads.
123 *
124 * Returns 0 if the rule set is legal or -EINVAL if not.
125 */
126static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
127{
128 int pc;
129 for (pc = 0; pc < flen; pc++) {
130 struct sock_filter *ftest = &filter[pc];
131 u16 code = ftest->code;
132 u32 k = ftest->k;
133
134 switch (code) {
135 case BPF_S_LD_W_ABS:
136 ftest->code = BPF_S_ANC_SECCOMP_LD_W;
137 /* 32-bit aligned and not out of bounds. */
138 if (k >= sizeof(struct seccomp_data) || k & 3)
139 return -EINVAL;
140 continue;
141 case BPF_S_LD_W_LEN:
142 ftest->code = BPF_S_LD_IMM;
143 ftest->k = sizeof(struct seccomp_data);
144 continue;
145 case BPF_S_LDX_W_LEN:
146 ftest->code = BPF_S_LDX_IMM;
147 ftest->k = sizeof(struct seccomp_data);
148 continue;
149 /* Explicitly include allowed calls. */
150 case BPF_S_RET_K:
151 case BPF_S_RET_A:
152 case BPF_S_ALU_ADD_K:
153 case BPF_S_ALU_ADD_X:
154 case BPF_S_ALU_SUB_K:
155 case BPF_S_ALU_SUB_X:
156 case BPF_S_ALU_MUL_K:
157 case BPF_S_ALU_MUL_X:
158 case BPF_S_ALU_DIV_X:
159 case BPF_S_ALU_AND_K:
160 case BPF_S_ALU_AND_X:
161 case BPF_S_ALU_OR_K:
162 case BPF_S_ALU_OR_X:
163 case BPF_S_ALU_LSH_K:
164 case BPF_S_ALU_LSH_X:
165 case BPF_S_ALU_RSH_K:
166 case BPF_S_ALU_RSH_X:
167 case BPF_S_ALU_NEG:
168 case BPF_S_LD_IMM:
169 case BPF_S_LDX_IMM:
170 case BPF_S_MISC_TAX:
171 case BPF_S_MISC_TXA:
172 case BPF_S_ALU_DIV_K:
173 case BPF_S_LD_MEM:
174 case BPF_S_LDX_MEM:
175 case BPF_S_ST:
176 case BPF_S_STX:
177 case BPF_S_JMP_JA:
178 case BPF_S_JMP_JEQ_K:
179 case BPF_S_JMP_JEQ_X:
180 case BPF_S_JMP_JGE_K:
181 case BPF_S_JMP_JGE_X:
182 case BPF_S_JMP_JGT_K:
183 case BPF_S_JMP_JGT_X:
184 case BPF_S_JMP_JSET_K:
185 case BPF_S_JMP_JSET_X:
186 continue;
187 default:
188 return -EINVAL;
189 }
190 }
191 return 0;
192}
193
194/**
195 * seccomp_run_filters - evaluates all seccomp filters against @syscall
196 * @syscall: number of the current system call
197 *
198 * Returns valid seccomp BPF response codes.
199 */
200static u32 seccomp_run_filters(int syscall)
201{
202 struct seccomp_filter *f;
203 u32 ret = SECCOMP_RET_ALLOW;
204
205 /* Ensure unexpected behavior doesn't result in failing open. */
206 if (WARN_ON(current->seccomp.filter == NULL))
207 return SECCOMP_RET_KILL;
208
209 /*
210 * All filters in the list are evaluated and the lowest BPF return
211 * value always takes priority (ignoring the DATA).
212 */
213 for (f = current->seccomp.filter; f; f = f->prev) {
214 u32 cur_ret = sk_run_filter(NULL, f->insns);
215 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
216 ret = cur_ret;
217 }
218 return ret;
219}
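The "lowest BPF return value wins" rule works because the SECCOMP_RET_* action codes are ordered from most to least restrictive. A small userspace sketch of the same reduction, assuming a <linux/seccomp.h> that exports the filter-mode constants:

#include <stdio.h>
#include <linux/seccomp.h>

int main(void)
{
	/* Pretend three stacked filters returned these values for one syscall. */
	unsigned int returned[] = {
		SECCOMP_RET_ALLOW,
		SECCOMP_RET_ERRNO | 5,	/* action ERRNO, data = 5 */
		SECCOMP_RET_TRACE,
	};
	unsigned int ret = SECCOMP_RET_ALLOW;
	unsigned int i;

	/* Same reduction as seccomp_run_filters(): keep the lowest action seen. */
	for (i = 0; i < sizeof(returned) / sizeof(returned[0]); i++)
		if ((returned[i] & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
			ret = returned[i];

	printf("action %#x wins, data %u\n",
	       ret & SECCOMP_RET_ACTION, ret & SECCOMP_RET_DATA);
	return 0;
}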
220
221/**
222 * seccomp_attach_filter: Attaches a seccomp filter to current.
223 * @fprog: BPF program to install
224 *
225 * Returns 0 on success or an errno on failure.
226 */
227static long seccomp_attach_filter(struct sock_fprog *fprog)
228{
229 struct seccomp_filter *filter;
230 unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
231 unsigned long total_insns = fprog->len;
232 long ret;
233
234 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
235 return -EINVAL;
236
237 for (filter = current->seccomp.filter; filter; filter = filter->prev)
238 total_insns += filter->len + 4; /* include a 4 instr penalty */
239 if (total_insns > MAX_INSNS_PER_PATH)
240 return -ENOMEM;
241
242 /*
243 * Installing a seccomp filter requires that the task have
244 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
245 * This avoids scenarios where unprivileged tasks can affect the
246 * behavior of privileged children.
247 */
248 if (!current->no_new_privs &&
249 security_capable_noaudit(current_cred(), current_user_ns(),
250 CAP_SYS_ADMIN) != 0)
251 return -EACCES;
252
253 /* Allocate a new seccomp_filter */
254 filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
255 GFP_KERNEL|__GFP_NOWARN);
256 if (!filter)
257 return -ENOMEM;
258 atomic_set(&filter->usage, 1);
259 filter->len = fprog->len;
260
261 /* Copy the instructions from fprog. */
262 ret = -EFAULT;
263 if (copy_from_user(filter->insns, fprog->filter, fp_size))
264 goto fail;
265
266 /* Check and rewrite the fprog via the skb checker */
267 ret = sk_chk_filter(filter->insns, filter->len);
268 if (ret)
269 goto fail;
270
271 /* Check and rewrite the fprog for seccomp use */
272 ret = seccomp_check_filter(filter->insns, filter->len);
273 if (ret)
274 goto fail;
275
276 /*
277 * If there is an existing filter, make it the prev and don't drop its
278 * task reference.
279 */
280 filter->prev = current->seccomp.filter;
281 current->seccomp.filter = filter;
282 return 0;
283fail:
284 kfree(filter);
285 return ret;
286}
287
288/**
289 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
290 * @user_filter: pointer to the user data containing a sock_fprog.
291 *
292 * Returns 0 on success and non-zero otherwise.
293 */
294long seccomp_attach_user_filter(char __user *user_filter)
295{
296 struct sock_fprog fprog;
297 long ret = -EFAULT;
298
299#ifdef CONFIG_COMPAT
300 if (is_compat_task()) {
301 struct compat_sock_fprog fprog32;
302 if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
303 goto out;
304 fprog.len = fprog32.len;
305 fprog.filter = compat_ptr(fprog32.filter);
306 } else /* falls through to the if below. */
307#endif
308 if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
309 goto out;
310 ret = seccomp_attach_filter(&fprog);
311out:
312 return ret;
313}
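The user-side counterpart of seccomp_attach_user_filter() is a prctl() call carrying a classic BPF program. A hedged, minimal example; the no_new_privs requirement mirrors the check in seccomp_attach_filter() above, PR_SET_NO_NEW_PRIVS needs 3.5+ headers, and kernels without CONFIG_SECCOMP_FILTER simply return EINVAL:

#include <stdio.h>
#include <stddef.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter insns[] = {
		/* Load the syscall number from struct seccomp_data. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
		/* Fail getpid() with a filter-chosen errno (38), allow everything else. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | 38),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};

	/* Without CAP_SYS_ADMIN the kernel insists on no_new_privs, as above. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		perror("PR_SET_NO_NEW_PRIVS");
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		perror("PR_SET_SECCOMP");

	/* Forces a real syscall; returns -1 with errno 38 under the filter. */
	printf("getpid() now returns %ld\n", (long)syscall(__NR_getpid));
	return 0;
}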
314
315/* get_seccomp_filter - increments the reference count of the filter on @tsk */
316void get_seccomp_filter(struct task_struct *tsk)
317{
318 struct seccomp_filter *orig = tsk->seccomp.filter;
319 if (!orig)
320 return;
321 /* Reference count is bounded by the number of total processes. */
322 atomic_inc(&orig->usage);
323}
324
325/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
326void put_seccomp_filter(struct task_struct *tsk)
327{
328 struct seccomp_filter *orig = tsk->seccomp.filter;
329 /* Clean up single-reference branches iteratively. */
330 while (orig && atomic_dec_and_test(&orig->usage)) {
331 struct seccomp_filter *freeme = orig;
332 orig = orig->prev;
333 kfree(freeme);
334 }
335}
336
337/**
338 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
339 * @syscall: syscall number to send to userland
340 * @reason: filter-supplied reason code to send to userland (via si_errno)
341 *
342 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
343 */
344static void seccomp_send_sigsys(int syscall, int reason)
345{
346 struct siginfo info;
347 memset(&info, 0, sizeof(info));
348 info.si_signo = SIGSYS;
349 info.si_code = SYS_SECCOMP;
350 info.si_call_addr = (void __user *)KSTK_EIP(current);
351 info.si_errno = reason;
352 info.si_arch = syscall_get_arch(current, task_pt_regs(current));
353 info.si_syscall = syscall;
354 force_sig_info(SIGSYS, &info, current);
355}
356#endif /* CONFIG_SECCOMP_FILTER */
357 15
358/* 16/*
359 * Secure computing mode 1 allows only read/write/exit/sigreturn. 17 * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -372,15 +30,13 @@ static int mode1_syscalls_32[] = {
372}; 30};
373#endif 31#endif
374 32
375int __secure_computing(int this_syscall) 33void __secure_computing(int this_syscall)
376{ 34{
377 int mode = current->seccomp.mode; 35 int mode = current->seccomp.mode;
378 int exit_sig = 0; 36 int * syscall;
379 int *syscall;
380 u32 ret;
381 37
382 switch (mode) { 38 switch (mode) {
383 case SECCOMP_MODE_STRICT: 39 case 1:
384 syscall = mode1_syscalls; 40 syscall = mode1_syscalls;
385#ifdef CONFIG_COMPAT 41#ifdef CONFIG_COMPAT
386 if (is_compat_task()) 42 if (is_compat_task())
@@ -388,61 +44,9 @@ int __secure_computing(int this_syscall)
388#endif 44#endif
389 do { 45 do {
390 if (*syscall == this_syscall) 46 if (*syscall == this_syscall)
391 return 0; 47 return;
392 } while (*++syscall); 48 } while (*++syscall);
393 exit_sig = SIGKILL;
394 ret = SECCOMP_RET_KILL;
395 break;
396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: {
398 int data;
399 struct pt_regs *regs = task_pt_regs(current);
400 ret = seccomp_run_filters(this_syscall);
401 data = ret & SECCOMP_RET_DATA;
402 ret &= SECCOMP_RET_ACTION;
403 switch (ret) {
404 case SECCOMP_RET_ERRNO:
405 /* Set the low-order 16 bits as an errno. */
406 syscall_set_return_value(current, regs,
407 -data, 0);
408 goto skip;
409 case SECCOMP_RET_TRAP:
410 /* Show the handler the original registers. */
411 syscall_rollback(current, regs);
412 /* Let the filter pass back 16 bits of data. */
413 seccomp_send_sigsys(this_syscall, data);
414 goto skip;
415 case SECCOMP_RET_TRACE:
416 /* Skip these calls if there is no tracer. */
417 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
418 syscall_set_return_value(current, regs,
419 -ENOSYS, 0);
420 goto skip;
421 }
422 /* Allow the BPF to provide the event message */
423 ptrace_event(PTRACE_EVENT_SECCOMP, data);
424 /*
425 * The delivery of a fatal signal during event
426 * notification may silently skip tracer notification.
427 * Terminating the task now avoids executing a system
428 * call that may not be intended.
429 */
430 if (fatal_signal_pending(current))
431 break;
432 if (syscall_get_nr(current, regs) < 0)
433 goto skip; /* Explicit request to skip. */
434
435 return 0;
436 case SECCOMP_RET_ALLOW:
437 return 0;
438 case SECCOMP_RET_KILL:
439 default:
440 break;
441 }
442 exit_sig = SIGSYS;
443 break; 49 break;
444 }
445#endif
446 default: 50 default:
447 BUG(); 51 BUG();
448 } 52 }
@@ -450,13 +54,7 @@ int __secure_computing(int this_syscall)
450#ifdef SECCOMP_DEBUG 54#ifdef SECCOMP_DEBUG
451 dump_stack(); 55 dump_stack();
452#endif 56#endif
453 audit_seccomp(this_syscall, exit_sig, ret); 57 do_exit(SIGKILL);
454 do_exit(exit_sig);
455#ifdef CONFIG_SECCOMP_FILTER
456skip:
457 audit_seccomp(this_syscall, exit_sig, ret);
458#endif
459 return -1;
460} 58}
461 59
462long prctl_get_seccomp(void) 60long prctl_get_seccomp(void)
@@ -464,48 +62,25 @@ long prctl_get_seccomp(void)
464 return current->seccomp.mode; 62 return current->seccomp.mode;
465} 63}
466 64
467/** 65long prctl_set_seccomp(unsigned long seccomp_mode)
468 * prctl_set_seccomp: configures current->seccomp.mode
469 * @seccomp_mode: requested mode to use
470 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
471 *
472 * This function may be called repeatedly with a @seccomp_mode of
473 * SECCOMP_MODE_FILTER to install additional filters. Every filter
474 * successfully installed will be evaluated (in reverse order) for each system
475 * call the task makes.
476 *
477 * Once current->seccomp.mode is non-zero, it may not be changed.
478 *
479 * Returns 0 on success or -EINVAL on failure.
480 */
481long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
482{ 66{
483 long ret = -EINVAL; 67 long ret;
484 68
485 if (current->seccomp.mode && 69 /* can set it only once to be even more secure */
486 current->seccomp.mode != seccomp_mode) 70 ret = -EPERM;
71 if (unlikely(current->seccomp.mode))
487 goto out; 72 goto out;
488 73
489 switch (seccomp_mode) { 74 ret = -EINVAL;
490 case SECCOMP_MODE_STRICT: 75 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
491 ret = 0; 76 current->seccomp.mode = seccomp_mode;
77 set_thread_flag(TIF_SECCOMP);
492#ifdef TIF_NOTSC 78#ifdef TIF_NOTSC
493 disable_TSC(); 79 disable_TSC();
494#endif 80#endif
495 break; 81 ret = 0;
496#ifdef CONFIG_SECCOMP_FILTER
497 case SECCOMP_MODE_FILTER:
498 ret = seccomp_attach_user_filter(filter);
499 if (ret)
500 goto out;
501 break;
502#endif
503 default:
504 goto out;
505 } 82 }
506 83
507 current->seccomp.mode = seccomp_mode; 84 out:
508 set_thread_flag(TIF_SECCOMP);
509out:
510 return ret; 85 return ret;
511} 86}
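The strict mode that survives this hunk is still reachable from userspace with a bare prctl(). A minimal sketch; since mode 1 leaves only read/write/exit/sigreturn usable, the exit has to be the raw exit syscall rather than glibc's exit_group() path:

#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/seccomp.h>

int main(void)
{
	const char msg[] = "still alive under SECCOMP_MODE_STRICT\n";

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0))
		return 1;	/* e.g. kernel built without CONFIG_SECCOMP */

	write(STDOUT_FILENO, msg, sizeof(msg) - 1);	/* write is on the whitelist */
	/* Anything else - including glibc's normal exit path - is killed with SIGKILL. */
	syscall(SYS_exit, 0);
	return 0;	/* not reached */
}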
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 4567fc020fe..94a62c0d4ad 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -27,7 +27,7 @@
27 27
28#include <linux/compiler.h> 28#include <linux/compiler.h>
29#include <linux/kernel.h> 29#include <linux/kernel.h>
30#include <linux/export.h> 30#include <linux/module.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/semaphore.h> 32#include <linux/semaphore.h>
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
@@ -54,12 +54,12 @@ void down(struct semaphore *sem)
54{ 54{
55 unsigned long flags; 55 unsigned long flags;
56 56
57 raw_spin_lock_irqsave(&sem->lock, flags); 57 spin_lock_irqsave(&sem->lock, flags);
58 if (likely(sem->count > 0)) 58 if (likely(sem->count > 0))
59 sem->count--; 59 sem->count--;
60 else 60 else
61 __down(sem); 61 __down(sem);
62 raw_spin_unlock_irqrestore(&sem->lock, flags); 62 spin_unlock_irqrestore(&sem->lock, flags);
63} 63}
64EXPORT_SYMBOL(down); 64EXPORT_SYMBOL(down);
65 65
@@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem)
77 unsigned long flags; 77 unsigned long flags;
78 int result = 0; 78 int result = 0;
79 79
80 raw_spin_lock_irqsave(&sem->lock, flags); 80 spin_lock_irqsave(&sem->lock, flags);
81 if (likely(sem->count > 0)) 81 if (likely(sem->count > 0))
82 sem->count--; 82 sem->count--;
83 else 83 else
84 result = __down_interruptible(sem); 84 result = __down_interruptible(sem);
85 raw_spin_unlock_irqrestore(&sem->lock, flags); 85 spin_unlock_irqrestore(&sem->lock, flags);
86 86
87 return result; 87 return result;
88} 88}
@@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem)
103 unsigned long flags; 103 unsigned long flags;
104 int result = 0; 104 int result = 0;
105 105
106 raw_spin_lock_irqsave(&sem->lock, flags); 106 spin_lock_irqsave(&sem->lock, flags);
107 if (likely(sem->count > 0)) 107 if (likely(sem->count > 0))
108 sem->count--; 108 sem->count--;
109 else 109 else
110 result = __down_killable(sem); 110 result = __down_killable(sem);
111 raw_spin_unlock_irqrestore(&sem->lock, flags); 111 spin_unlock_irqrestore(&sem->lock, flags);
112 112
113 return result; 113 return result;
114} 114}
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable);
118 * down_trylock - try to acquire the semaphore, without waiting 118 * down_trylock - try to acquire the semaphore, without waiting
119 * @sem: the semaphore to be acquired 119 * @sem: the semaphore to be acquired
120 * 120 *
121 * Try to acquire the semaphore atomically. Returns 0 if the semaphore has 121 * Try to acquire the semaphore atomically. Returns 0 if the mutex has
122 * been acquired successfully or 1 if it cannot be acquired. 122 * been acquired successfully or 1 if it cannot be acquired.
123 * 123 *
124 * NOTE: This return value is inverted from both spin_trylock and 124 * NOTE: This return value is inverted from both spin_trylock and
@@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem)
132 unsigned long flags; 132 unsigned long flags;
133 int count; 133 int count;
134 134
135 raw_spin_lock_irqsave(&sem->lock, flags); 135 spin_lock_irqsave(&sem->lock, flags);
136 count = sem->count - 1; 136 count = sem->count - 1;
137 if (likely(count >= 0)) 137 if (likely(count >= 0))
138 sem->count = count; 138 sem->count = count;
139 raw_spin_unlock_irqrestore(&sem->lock, flags); 139 spin_unlock_irqrestore(&sem->lock, flags);
140 140
141 return (count < 0); 141 return (count < 0);
142} 142}
@@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies)
157 unsigned long flags; 157 unsigned long flags;
158 int result = 0; 158 int result = 0;
159 159
160 raw_spin_lock_irqsave(&sem->lock, flags); 160 spin_lock_irqsave(&sem->lock, flags);
161 if (likely(sem->count > 0)) 161 if (likely(sem->count > 0))
162 sem->count--; 162 sem->count--;
163 else 163 else
164 result = __down_timeout(sem, jiffies); 164 result = __down_timeout(sem, jiffies);
165 raw_spin_unlock_irqrestore(&sem->lock, flags); 165 spin_unlock_irqrestore(&sem->lock, flags);
166 166
167 return result; 167 return result;
168} 168}
@@ -179,12 +179,12 @@ void up(struct semaphore *sem)
179{ 179{
180 unsigned long flags; 180 unsigned long flags;
181 181
182 raw_spin_lock_irqsave(&sem->lock, flags); 182 spin_lock_irqsave(&sem->lock, flags);
183 if (likely(list_empty(&sem->wait_list))) 183 if (likely(list_empty(&sem->wait_list)))
184 sem->count++; 184 sem->count++;
185 else 185 else
186 __up(sem); 186 __up(sem);
187 raw_spin_unlock_irqrestore(&sem->lock, flags); 187 spin_unlock_irqrestore(&sem->lock, flags);
188} 188}
189EXPORT_SYMBOL(up); 189EXPORT_SYMBOL(up);
190 190
@@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
217 if (timeout <= 0) 217 if (timeout <= 0)
218 goto timed_out; 218 goto timed_out;
219 __set_task_state(task, state); 219 __set_task_state(task, state);
220 raw_spin_unlock_irq(&sem->lock); 220 spin_unlock_irq(&sem->lock);
221 timeout = schedule_timeout(timeout); 221 timeout = schedule_timeout(timeout);
222 raw_spin_lock_irq(&sem->lock); 222 spin_lock_irq(&sem->lock);
223 if (waiter.up) 223 if (waiter.up)
224 return 0; 224 return 0;
225 } 225 }
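As a usage note for the API touched above, a hedged in-kernel sketch (names are illustrative); note the inverted down_trylock() return value called out in the kernel-doc:

#include <linux/module.h>
#include <linux/semaphore.h>

static struct semaphore demo_sem;

static int __init demo_init(void)
{
	sema_init(&demo_sem, 1);			/* binary semaphore */

	if (down_interruptible(&demo_sem))		/* sleeps; -EINTR if signalled */
		return -EINTR;
	/* ... critical section ... */
	up(&demo_sem);

	if (!down_trylock(&demo_sem)) {			/* 0 means acquired (inverted!) */
		/* ... got it without sleeping ... */
		up(&demo_sem);
	}
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");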
diff --git a/kernel/signal.c b/kernel/signal.c
index 372771e948c..195331c56ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -11,13 +11,12 @@
11 */ 11 */
12 12
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/export.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/tty.h> 18#include <linux/tty.h>
19#include <linux/binfmts.h> 19#include <linux/binfmts.h>
20#include <linux/coredump.h>
21#include <linux/security.h> 20#include <linux/security.h>
22#include <linux/syscalls.h> 21#include <linux/syscalls.h>
23#include <linux/ptrace.h> 22#include <linux/ptrace.h>
@@ -29,9 +28,6 @@
29#include <linux/freezer.h> 28#include <linux/freezer.h>
30#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
31#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
32#include <linux/user_namespace.h>
33#include <linux/uprobes.h>
34#include <linux/compat.h>
35#define CREATE_TRACE_POINTS 31#define CREATE_TRACE_POINTS
36#include <trace/events/signal.h> 32#include <trace/events/signal.h>
37 33
@@ -39,7 +35,6 @@
39#include <asm/uaccess.h> 35#include <asm/uaccess.h>
40#include <asm/unistd.h> 36#include <asm/unistd.h>
41#include <asm/siginfo.h> 37#include <asm/siginfo.h>
42#include <asm/cacheflush.h>
43#include "audit.h" /* audit_signal_info() */ 38#include "audit.h" /* audit_signal_info() */
44 39
45/* 40/*
@@ -62,20 +57,21 @@ static int sig_handler_ignored(void __user *handler, int sig)
62 (handler == SIG_DFL && sig_kernel_ignore(sig)); 57 (handler == SIG_DFL && sig_kernel_ignore(sig));
63} 58}
64 59
65static int sig_task_ignored(struct task_struct *t, int sig, bool force) 60static int sig_task_ignored(struct task_struct *t, int sig,
61 int from_ancestor_ns)
66{ 62{
67 void __user *handler; 63 void __user *handler;
68 64
69 handler = sig_handler(t, sig); 65 handler = sig_handler(t, sig);
70 66
71 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && 67 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
72 handler == SIG_DFL && !force) 68 handler == SIG_DFL && !from_ancestor_ns)
73 return 1; 69 return 1;
74 70
75 return sig_handler_ignored(handler, sig); 71 return sig_handler_ignored(handler, sig);
76} 72}
77 73
78static int sig_ignored(struct task_struct *t, int sig, bool force) 74static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
79{ 75{
80 /* 76 /*
81 * Blocked signals are never ignored, since the 77 * Blocked signals are never ignored, since the
@@ -85,7 +81,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
85 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 81 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
86 return 0; 82 return 0;
87 83
88 if (!sig_task_ignored(t, sig, force)) 84 if (!sig_task_ignored(t, sig, from_ancestor_ns))
89 return 0; 85 return 0;
90 86
91 /* 87 /*
@@ -163,7 +159,7 @@ void recalc_sigpending(void)
163 159
164#define SYNCHRONOUS_MASK \ 160#define SYNCHRONOUS_MASK \
165 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ 161 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
166 sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) 162 sigmask(SIGTRAP) | sigmask(SIGFPE))
167 163
168int next_signal(struct sigpending *pending, sigset_t *mask) 164int next_signal(struct sigpending *pending, sigset_t *mask)
169{ 165{
@@ -770,13 +766,14 @@ static int kill_ok_by_cred(struct task_struct *t)
770 const struct cred *cred = current_cred(); 766 const struct cred *cred = current_cred();
771 const struct cred *tcred = __task_cred(t); 767 const struct cred *tcred = __task_cred(t);
772 768
773 if (uid_eq(cred->euid, tcred->suid) || 769 if (cred->user->user_ns == tcred->user->user_ns &&
774 uid_eq(cred->euid, tcred->uid) || 770 (cred->euid == tcred->suid ||
775 uid_eq(cred->uid, tcred->suid) || 771 cred->euid == tcred->uid ||
776 uid_eq(cred->uid, tcred->uid)) 772 cred->uid == tcred->suid ||
773 cred->uid == tcred->uid))
777 return 1; 774 return 1;
778 775
779 if (ns_capable(tcred->user_ns, CAP_KILL)) 776 if (ns_capable(tcred->user->user_ns, CAP_KILL))
780 return 1; 777 return 1;
781 778
782 return 0; 779 return 0;
@@ -857,7 +854,7 @@ static void ptrace_trap_notify(struct task_struct *t)
857 * Returns true if the signal should be actually delivered, otherwise 854 * Returns true if the signal should be actually delivered, otherwise
858 * it should be dropped. 855 * it should be dropped.
859 */ 856 */
860static int prepare_signal(int sig, struct task_struct *p, bool force) 857static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
861{ 858{
862 struct signal_struct *signal = p->signal; 859 struct signal_struct *signal = p->signal;
863 struct task_struct *t; 860 struct task_struct *t;
@@ -917,7 +914,7 @@ static int prepare_signal(int sig, struct task_struct *p, bool force)
917 } 914 }
918 } 915 }
919 916
920 return !sig_ignored(p, sig, force); 917 return !sig_ignored(p, sig, from_ancestor_ns);
921} 918}
922 919
923/* 920/*
@@ -1022,41 +1019,19 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
1022 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1019 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1023} 1020}
1024 1021
1025#ifdef CONFIG_USER_NS
1026static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1027{
1028 if (current_user_ns() == task_cred_xxx(t, user_ns))
1029 return;
1030
1031 if (SI_FROMKERNEL(info))
1032 return;
1033
1034 rcu_read_lock();
1035 info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
1036 make_kuid(current_user_ns(), info->si_uid));
1037 rcu_read_unlock();
1038}
1039#else
1040static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1041{
1042 return;
1043}
1044#endif
1045
1046static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, 1022static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1047 int group, int from_ancestor_ns) 1023 int group, int from_ancestor_ns)
1048{ 1024{
1049 struct sigpending *pending; 1025 struct sigpending *pending;
1050 struct sigqueue *q; 1026 struct sigqueue *q;
1051 int override_rlimit; 1027 int override_rlimit;
1052 int ret = 0, result; 1028
1029 trace_signal_generate(sig, info, t);
1053 1030
1054 assert_spin_locked(&t->sighand->siglock); 1031 assert_spin_locked(&t->sighand->siglock);
1055 1032
1056 result = TRACE_SIGNAL_IGNORED; 1033 if (!prepare_signal(sig, t, from_ancestor_ns))
1057 if (!prepare_signal(sig, t, 1034 return 0;
1058 from_ancestor_ns || (info == SEND_SIG_FORCED)))
1059 goto ret;
1060 1035
1061 pending = group ? &t->signal->shared_pending : &t->pending; 1036 pending = group ? &t->signal->shared_pending : &t->pending;
1062 /* 1037 /*
@@ -1064,11 +1039,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1064 * exactly one non-rt signal, so that we can get more 1039 * exactly one non-rt signal, so that we can get more
1065 * detailed information about the cause of the signal. 1040 * detailed information about the cause of the signal.
1066 */ 1041 */
1067 result = TRACE_SIGNAL_ALREADY_PENDING;
1068 if (legacy_queue(pending, sig)) 1042 if (legacy_queue(pending, sig))
1069 goto ret; 1043 return 0;
1070
1071 result = TRACE_SIGNAL_DELIVERED;
1072 /* 1044 /*
1073 * fast-pathed signals for kernel-internal things like SIGSTOP 1045 * fast-pathed signals for kernel-internal things like SIGSTOP
1074 * or SIGKILL. 1046 * or SIGKILL.
@@ -1101,7 +1073,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1101 q->info.si_code = SI_USER; 1073 q->info.si_code = SI_USER;
1102 q->info.si_pid = task_tgid_nr_ns(current, 1074 q->info.si_pid = task_tgid_nr_ns(current,
1103 task_active_pid_ns(t)); 1075 task_active_pid_ns(t));
1104 q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 1076 q->info.si_uid = current_uid();
1105 break; 1077 break;
1106 case (unsigned long) SEND_SIG_PRIV: 1078 case (unsigned long) SEND_SIG_PRIV:
1107 q->info.si_signo = sig; 1079 q->info.si_signo = sig;
@@ -1116,9 +1088,6 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1116 q->info.si_pid = 0; 1088 q->info.si_pid = 0;
1117 break; 1089 break;
1118 } 1090 }
1119
1120 userns_fixup_signal_uid(&q->info, t);
1121
1122 } else if (!is_si_special(info)) { 1091 } else if (!is_si_special(info)) {
1123 if (sig >= SIGRTMIN && info->si_code != SI_USER) { 1092 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
1124 /* 1093 /*
@@ -1126,15 +1095,14 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1126 * signal was rt and sent by user using something 1095 * signal was rt and sent by user using something
1127 * other than kill(). 1096 * other than kill().
1128 */ 1097 */
1129 result = TRACE_SIGNAL_OVERFLOW_FAIL; 1098 trace_signal_overflow_fail(sig, group, info);
1130 ret = -EAGAIN; 1099 return -EAGAIN;
1131 goto ret;
1132 } else { 1100 } else {
1133 /* 1101 /*
1134 * This is a silent loss of information. We still 1102 * This is a silent loss of information. We still
1135 * send the signal, but the *info bits are lost. 1103 * send the signal, but the *info bits are lost.
1136 */ 1104 */
1137 result = TRACE_SIGNAL_LOSE_INFO; 1105 trace_signal_lose_info(sig, group, info);
1138 } 1106 }
1139 } 1107 }
1140 1108
@@ -1142,9 +1110,7 @@ out_set:
1142 signalfd_notify(t, sig); 1110 signalfd_notify(t, sig);
1143 sigaddset(&pending->signal, sig); 1111 sigaddset(&pending->signal, sig);
1144 complete_signal(sig, t, group); 1112 complete_signal(sig, t, group);
1145ret: 1113 return 0;
1146 trace_signal_generate(sig, info, t, group, result);
1147 return ret;
1148} 1114}
1149 1115
1150static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 1116static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
@@ -1160,9 +1126,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1160 return __send_signal(sig, info, t, group, from_ancestor_ns); 1126 return __send_signal(sig, info, t, group, from_ancestor_ns);
1161} 1127}
1162 1128
1163static void print_fatal_signal(int signr) 1129static void print_fatal_signal(struct pt_regs *regs, int signr)
1164{ 1130{
1165 struct pt_regs *regs = signal_pt_regs();
1166 printk("%s/%d: potentially unexpected fatal signal %d.\n", 1131 printk("%s/%d: potentially unexpected fatal signal %d.\n",
1167 current->comm, task_pid_nr(current), signr); 1132 current->comm, task_pid_nr(current), signr);
1168 1133
@@ -1379,22 +1344,13 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1379 return error; 1344 return error;
1380} 1345}
1381 1346
1382static int kill_as_cred_perm(const struct cred *cred,
1383 struct task_struct *target)
1384{
1385 const struct cred *pcred = __task_cred(target);
1386 if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
1387 !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid))
1388 return 0;
1389 return 1;
1390}
1391
1392/* like kill_pid_info(), but doesn't use uid/euid of "current" */ 1347/* like kill_pid_info(), but doesn't use uid/euid of "current" */
1393int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, 1348int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1394 const struct cred *cred, u32 secid) 1349 uid_t uid, uid_t euid, u32 secid)
1395{ 1350{
1396 int ret = -EINVAL; 1351 int ret = -EINVAL;
1397 struct task_struct *p; 1352 struct task_struct *p;
1353 const struct cred *pcred;
1398 unsigned long flags; 1354 unsigned long flags;
1399 1355
1400 if (!valid_signal(sig)) 1356 if (!valid_signal(sig))
@@ -1406,7 +1362,10 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
1406 ret = -ESRCH; 1362 ret = -ESRCH;
1407 goto out_unlock; 1363 goto out_unlock;
1408 } 1364 }
1409 if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { 1365 pcred = __task_cred(p);
1366 if (si_fromuser(info) &&
1367 euid != pcred->suid && euid != pcred->uid &&
1368 uid != pcred->suid && uid != pcred->uid) {
1410 ret = -EPERM; 1369 ret = -EPERM;
1411 goto out_unlock; 1370 goto out_unlock;
1412 } 1371 }
@@ -1425,7 +1384,7 @@ out_unlock:
1425 rcu_read_unlock(); 1384 rcu_read_unlock();
1426 return ret; 1385 return ret;
1427} 1386}
1428EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); 1387EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
1429 1388
1430/* 1389/*
1431 * kill_something_info() interprets pid in interesting ways just like kill(2). 1390 * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1586,7 +1545,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1586 int sig = q->info.si_signo; 1545 int sig = q->info.si_signo;
1587 struct sigpending *pending; 1546 struct sigpending *pending;
1588 unsigned long flags; 1547 unsigned long flags;
1589 int ret, result; 1548 int ret;
1590 1549
1591 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1550 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1592 1551
@@ -1595,8 +1554,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1595 goto ret; 1554 goto ret;
1596 1555
1597 ret = 1; /* the signal is ignored */ 1556 ret = 1; /* the signal is ignored */
1598 result = TRACE_SIGNAL_IGNORED; 1557 if (!prepare_signal(sig, t, 0))
1599 if (!prepare_signal(sig, t, false))
1600 goto out; 1558 goto out;
1601 1559
1602 ret = 0; 1560 ret = 0;
@@ -1607,7 +1565,6 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1607 */ 1565 */
1608 BUG_ON(q->info.si_code != SI_TIMER); 1566 BUG_ON(q->info.si_code != SI_TIMER);
1609 q->info.si_overrun++; 1567 q->info.si_overrun++;
1610 result = TRACE_SIGNAL_ALREADY_PENDING;
1611 goto out; 1568 goto out;
1612 } 1569 }
1613 q->info.si_overrun = 0; 1570 q->info.si_overrun = 0;
@@ -1617,9 +1574,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1617 list_add_tail(&q->list, &pending->list); 1574 list_add_tail(&q->list, &pending->list);
1618 sigaddset(&pending->signal, sig); 1575 sigaddset(&pending->signal, sig);
1619 complete_signal(sig, t, group); 1576 complete_signal(sig, t, group);
1620 result = TRACE_SIGNAL_DELIVERED;
1621out: 1577out:
1622 trace_signal_generate(sig, &q->info, t, group, result);
1623 unlock_task_sighand(t, &flags); 1578 unlock_task_sighand(t, &flags);
1624ret: 1579ret:
1625 return ret; 1580 return ret;
@@ -1647,36 +1602,29 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1647 BUG_ON(!tsk->ptrace && 1602 BUG_ON(!tsk->ptrace &&
1648 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1603 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1649 1604
1650 if (sig != SIGCHLD) {
1651 /*
1652 * This is only possible if parent == real_parent.
1653 * Check if it has changed security domain.
1654 */
1655 if (tsk->parent_exec_id != tsk->parent->self_exec_id)
1656 sig = SIGCHLD;
1657 }
1658
1659 info.si_signo = sig; 1605 info.si_signo = sig;
1660 info.si_errno = 0; 1606 info.si_errno = 0;
1661 /* 1607 /*
1662 * We are under tasklist_lock here so our parent is tied to 1608 * we are under tasklist_lock here so our parent is tied to
1663 * us and cannot change. 1609 * us and cannot exit and release its namespace.
1664 * 1610 *
1665 * task_active_pid_ns will always return the same pid namespace 1611 * the only thing it can do is switch its nsproxy with sys_unshare,
1666 * until a task passes through release_task. 1612 * but unsharing pid namespaces is not allowed, so we'll always
1613 * see the relevant namespace
1667 * 1614 *
1668 * write_lock() currently calls preempt_disable() which is the 1615 * write_lock() currently calls preempt_disable() which is the
1669 * same as rcu_read_lock(), but according to Oleg, this is not 1616 * same as rcu_read_lock(), but according to Oleg, this is not
1670 * correct to rely on this 1617 * correct to rely on this
1671 */ 1618 */
1672 rcu_read_lock(); 1619 rcu_read_lock();
1673 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); 1620 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1674 info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), 1621 info.si_uid = __task_cred(tsk)->uid;
1675 task_uid(tsk));
1676 rcu_read_unlock(); 1622 rcu_read_unlock();
1677 1623
1678 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); 1624 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
1679 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); 1625 tsk->signal->utime));
1626 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1627 tsk->signal->stime));
1680 1628
1681 info.si_status = tsk->exit_code & 0x7f; 1629 info.si_status = tsk->exit_code & 0x7f;
1682 if (tsk->exit_code & 0x80) 1630 if (tsk->exit_code & 0x80)
@@ -1754,8 +1702,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1754 * see comment in do_notify_parent() about the following 4 lines 1702 * see comment in do_notify_parent() about the following 4 lines
1755 */ 1703 */
1756 rcu_read_lock(); 1704 rcu_read_lock();
1757 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); 1705 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1758 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1706 info.si_uid = __task_cred(tsk)->uid;
1759 rcu_read_unlock(); 1707 rcu_read_unlock();
1760 1708
1761 info.si_utime = cputime_to_clock_t(tsk->utime); 1709 info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -1910,7 +1858,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1910 preempt_disable(); 1858 preempt_disable();
1911 read_unlock(&tasklist_lock); 1859 read_unlock(&tasklist_lock);
1912 preempt_enable_no_resched(); 1860 preempt_enable_no_resched();
1913 freezable_schedule(); 1861 schedule();
1914 } else { 1862 } else {
1915 /* 1863 /*
1916 * By the time we got the lock, our tracer went away. 1864 * By the time we got the lock, our tracer went away.
@@ -1932,6 +1880,13 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1932 } 1880 }
1933 1881
1934 /* 1882 /*
1883 * While in TASK_TRACED, we were considered "frozen enough".
1884 * Now that we woke up, it's crucial if we're supposed to be
1885 * frozen that we freeze now before running anything substantial.
1886 */
1887 try_to_freeze();
1888
1889 /*
1935 * We are back. Now reacquire the siglock before touching 1890 * We are back. Now reacquire the siglock before touching
1936 * last_siginfo, so that we are sure to have synchronized with 1891 * last_siginfo, so that we are sure to have synchronized with
1937 * any signal-sending on another CPU that wants to examine it. 1892 * any signal-sending on another CPU that wants to examine it.
@@ -1958,7 +1913,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1958 info.si_signo = signr; 1913 info.si_signo = signr;
1959 info.si_code = exit_code; 1914 info.si_code = exit_code;
1960 info.si_pid = task_pid_vnr(current); 1915 info.si_pid = task_pid_vnr(current);
1961 info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 1916 info.si_uid = current_uid();
1962 1917
1963 /* Let the debugger run. */ 1918 /* Let the debugger run. */
1964 ptrace_stop(exit_code, why, 1, &info); 1919 ptrace_stop(exit_code, why, 1, &info);
@@ -1967,8 +1922,6 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1967void ptrace_notify(int exit_code) 1922void ptrace_notify(int exit_code)
1968{ 1923{
1969 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); 1924 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1970 if (unlikely(current->task_works))
1971 task_work_run();
1972 1925
1973 spin_lock_irq(&current->sighand->siglock); 1926 spin_lock_irq(&current->sighand->siglock);
1974 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); 1927 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2087,7 +2040,7 @@ static bool do_signal_stop(int signr)
2087 } 2040 }
2088 2041
2089 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2042 /* Now we don't run again until woken by SIGCONT or SIGKILL */
2090 freezable_schedule(); 2043 schedule();
2091 return true; 2044 return true;
2092 } else { 2045 } else {
2093 /* 2046 /*
@@ -2133,9 +2086,10 @@ static void do_jobctl_trap(void)
2133 } 2086 }
2134} 2087}
2135 2088
2136static int ptrace_signal(int signr, siginfo_t *info) 2089static int ptrace_signal(int signr, siginfo_t *info,
2090 struct pt_regs *regs, void *cookie)
2137{ 2091{
2138 ptrace_signal_deliver(); 2092 ptrace_signal_deliver(regs, cookie);
2139 /* 2093 /*
2140 * We do not check sig_kernel_stop(signr) but set this marker 2094 * We do not check sig_kernel_stop(signr) but set this marker
2141 * unconditionally because we do not know whether debugger will 2095 * unconditionally because we do not know whether debugger will
@@ -2165,11 +2119,8 @@ static int ptrace_signal(int signr, siginfo_t *info)
2165 info->si_signo = signr; 2119 info->si_signo = signr;
2166 info->si_errno = 0; 2120 info->si_errno = 0;
2167 info->si_code = SI_USER; 2121 info->si_code = SI_USER;
2168 rcu_read_lock();
2169 info->si_pid = task_pid_vnr(current->parent); 2122 info->si_pid = task_pid_vnr(current->parent);
2170 info->si_uid = from_kuid_munged(current_user_ns(), 2123 info->si_uid = task_uid(current->parent);
2171 task_uid(current->parent));
2172 rcu_read_unlock();
2173 } 2124 }
2174 2125
2175 /* If the (new) signal is now blocked, requeue it. */ 2126 /* If the (new) signal is now blocked, requeue it. */
@@ -2188,20 +2139,15 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2188 struct signal_struct *signal = current->signal; 2139 struct signal_struct *signal = current->signal;
2189 int signr; 2140 int signr;
2190 2141
2191 if (unlikely(current->task_works)) 2142relock:
2192 task_work_run();
2193
2194 if (unlikely(uprobe_deny_signal()))
2195 return 0;
2196
2197 /* 2143 /*
2198 * Do this once, we can't return to user-mode if freezing() == T. 2144 * We'll jump back here after any time we were stopped in TASK_STOPPED.
2199 * do_signal_stop() and ptrace_stop() do freezable_schedule() and 2145 * While in TASK_STOPPED, we were considered "frozen enough".
2200 * thus do not need another check after return. 2146 * Now that we woke up, it's crucial if we're supposed to be
2147 * frozen that we freeze now before running anything substantial.
2201 */ 2148 */
2202 try_to_freeze(); 2149 try_to_freeze();
2203 2150
2204relock:
2205 spin_lock_irq(&sighand->siglock); 2151 spin_lock_irq(&sighand->siglock);
2206 /* 2152 /*
2207 * Every stopped thread goes here after wakeup. Check to see if 2153 * Every stopped thread goes here after wakeup. Check to see if
@@ -2258,7 +2204,8 @@ relock:
2258 break; /* will return 0 */ 2204 break; /* will return 0 */
2259 2205
2260 if (unlikely(current->ptrace) && signr != SIGKILL) { 2206 if (unlikely(current->ptrace) && signr != SIGKILL) {
2261 signr = ptrace_signal(signr, info); 2207 signr = ptrace_signal(signr, info,
2208 regs, cookie);
2262 if (!signr) 2209 if (!signr)
2263 continue; 2210 continue;
2264 } 2211 }
@@ -2343,7 +2290,7 @@ relock:
2343 2290
2344 if (sig_kernel_coredump(signr)) { 2291 if (sig_kernel_coredump(signr)) {
2345 if (print_fatal_signals) 2292 if (print_fatal_signals)
2346 print_fatal_signal(info->si_signo); 2293 print_fatal_signal(regs, info->si_signo);
2347 /* 2294 /*
2348 * If it was able to dump core, this kills all 2295 * If it was able to dump core, this kills all
2349 * other threads in the group and synchronizes with 2296 * other threads in the group and synchronizes with
@@ -2352,7 +2299,7 @@ relock:
2352 * first and our do_group_exit call below will use 2299 * first and our do_group_exit call below will use
2353 * that value and ignore the one we pass it. 2300 * that value and ignore the one we pass it.
2354 */ 2301 */
2355 do_coredump(info); 2302 do_coredump(info->si_signo, info->si_signo, regs);
2356 } 2303 }
2357 2304
2358 /* 2305 /*
@@ -2365,37 +2312,6 @@ relock:
2365 return signr; 2312 return signr;
2366} 2313}
2367 2314
2368/**
2369 * signal_delivered -
2370 * @sig: number of signal being delivered
2371 * @info: siginfo_t of signal being delivered
2372 * @ka: sigaction setting that chose the handler
2373 * @regs: user register state
2374 * @stepping: nonzero if debugger single-step or block-step in use
2375 *
2376 * This function should be called when a signal has successfully been
2377 * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
2378 * is always blocked, and the signal itself is blocked unless %SA_NODEFER
2379 * is set in @ka->sa.sa_flags. Tracing is notified.
2380 */
2381void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2382 struct pt_regs *regs, int stepping)
2383{
2384 sigset_t blocked;
2385
2386 /* A signal was successfully delivered, and the
2387 saved sigmask was stored on the signal frame,
2388 and will be restored by sigreturn. So we can
2389 simply clear the restore sigmask flag. */
2390 clear_restore_sigmask();
2391
2392 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
2393 if (!(ka->sa.sa_flags & SA_NODEFER))
2394 sigaddset(&blocked, sig);
2395 set_current_blocked(&blocked);
2396 tracehook_signal_handler(sig, info, ka, regs, stepping);
2397}
2398
2399/* 2315/*
2400 * It could be that complete_signal() picked us to notify about the 2316 * It could be that complete_signal() picked us to notify about the
2401 * group-wide signal. Other threads should be notified now to take 2317 * group-wide signal. Other threads should be notified now to take
@@ -2433,15 +2349,8 @@ void exit_signals(struct task_struct *tsk)
2433 int group_stop = 0; 2349 int group_stop = 0;
2434 sigset_t unblocked; 2350 sigset_t unblocked;
2435 2351
2436 /*
2437 * @tsk is about to have PF_EXITING set - lock out users which
2438 * expect stable threadgroup.
2439 */
2440 threadgroup_change_begin(tsk);
2441
2442 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2352 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2443 tsk->flags |= PF_EXITING; 2353 tsk->flags |= PF_EXITING;
2444 threadgroup_change_end(tsk);
2445 return; 2354 return;
2446 } 2355 }
2447 2356
@@ -2451,9 +2360,6 @@ void exit_signals(struct task_struct *tsk)
2451 * see wants_signal(), do_signal_stop(). 2360 * see wants_signal(), do_signal_stop().
2452 */ 2361 */
2453 tsk->flags |= PF_EXITING; 2362 tsk->flags |= PF_EXITING;
2454
2455 threadgroup_change_end(tsk);
2456
2457 if (!signal_pending(tsk)) 2363 if (!signal_pending(tsk))
2458 goto out; 2364 goto out;
2459 2365
@@ -2526,13 +2432,7 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2526 * It is wrong to change ->blocked directly, this helper should be used 2432 * It is wrong to change ->blocked directly, this helper should be used
2527 * to ensure the process can't miss a shared signal we are going to block. 2433 * to ensure the process can't miss a shared signal we are going to block.
2528 */ 2434 */
2529void set_current_blocked(sigset_t *newset) 2435void set_current_blocked(const sigset_t *newset)
2530{
2531 sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
2532 __set_current_blocked(newset);
2533}
2534
2535void __set_current_blocked(const sigset_t *newset)
2536{ 2436{
2537 struct task_struct *tsk = current; 2437 struct task_struct *tsk = current;
2538 2438
@@ -2572,7 +2472,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2572 return -EINVAL; 2472 return -EINVAL;
2573 } 2473 }
2574 2474
2575 __set_current_blocked(&newset); 2475 set_current_blocked(&newset);
2576 return 0; 2476 return 0;
2577} 2477}
2578 2478
@@ -2712,13 +2612,6 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2712 err |= __put_user(from->si_uid, &to->si_uid); 2612 err |= __put_user(from->si_uid, &to->si_uid);
2713 err |= __put_user(from->si_ptr, &to->si_ptr); 2613 err |= __put_user(from->si_ptr, &to->si_ptr);
2714 break; 2614 break;
2715#ifdef __ARCH_SIGSYS
2716 case __SI_SYS:
2717 err |= __put_user(from->si_call_addr, &to->si_call_addr);
2718 err |= __put_user(from->si_syscall, &to->si_syscall);
2719 err |= __put_user(from->si_arch, &to->si_arch);
2720 break;
2721#endif
2722 default: /* this is just in case for now ... */ 2615 default: /* this is just in case for now ... */
2723 err |= __put_user(from->si_pid, &to->si_pid); 2616 err |= __put_user(from->si_pid, &to->si_pid);
2724 err |= __put_user(from->si_uid, &to->si_uid); 2617 err |= __put_user(from->si_uid, &to->si_uid);
@@ -2841,7 +2734,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2841 info.si_errno = 0; 2734 info.si_errno = 0;
2842 info.si_code = SI_USER; 2735 info.si_code = SI_USER;
2843 info.si_pid = task_tgid_vnr(current); 2736 info.si_pid = task_tgid_vnr(current);
2844 info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 2737 info.si_uid = current_uid();
2845 2738
2846 return kill_something_info(sig, &info, pid); 2739 return kill_something_info(sig, &info, pid);
2847} 2740}
@@ -2884,7 +2777,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2884 info.si_errno = 0; 2777 info.si_errno = 0;
2885 info.si_code = SI_TKILL; 2778 info.si_code = SI_TKILL;
2886 info.si_pid = task_tgid_vnr(current); 2779 info.si_pid = task_tgid_vnr(current);
2887 info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 2780 info.si_uid = current_uid();
2888 2781
2889 return do_send_specific(tgid, pid, sig, &info); 2782 return do_send_specific(tgid, pid, sig, &info);
2890} 2783}
@@ -3092,79 +2985,6 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
3092out: 2985out:
3093 return error; 2986 return error;
3094} 2987}
3095#ifdef CONFIG_GENERIC_SIGALTSTACK
3096SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
3097{
3098 return do_sigaltstack(uss, uoss, current_user_stack_pointer());
3099}
3100#endif
3101
3102int restore_altstack(const stack_t __user *uss)
3103{
3104 int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
3105 /* squash all but EFAULT for now */
3106 return err == -EFAULT ? err : 0;
3107}
3108
3109int __save_altstack(stack_t __user *uss, unsigned long sp)
3110{
3111 struct task_struct *t = current;
3112 return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
3113 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3114 __put_user(t->sas_ss_size, &uss->ss_size);
3115}
3116
3117#ifdef CONFIG_COMPAT
3118#ifdef CONFIG_GENERIC_SIGALTSTACK
3119asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr,
3120 compat_stack_t __user *uoss_ptr)
3121{
3122 stack_t uss, uoss;
3123 int ret;
3124 mm_segment_t seg;
3125
3126 if (uss_ptr) {
3127 compat_stack_t uss32;
3128
3129 memset(&uss, 0, sizeof(stack_t));
3130 if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
3131 return -EFAULT;
3132 uss.ss_sp = compat_ptr(uss32.ss_sp);
3133 uss.ss_flags = uss32.ss_flags;
3134 uss.ss_size = uss32.ss_size;
3135 }
3136 seg = get_fs();
3137 set_fs(KERNEL_DS);
3138 ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
3139 (stack_t __force __user *) &uoss,
3140 compat_user_stack_pointer());
3141 set_fs(seg);
3142 if (ret >= 0 && uoss_ptr) {
3143 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
3144 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
3145 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
3146 __put_user(uoss.ss_size, &uoss_ptr->ss_size))
3147 ret = -EFAULT;
3148 }
3149 return ret;
3150}
3151
3152int compat_restore_altstack(const compat_stack_t __user *uss)
3153{
3154 int err = compat_sys_sigaltstack(uss, NULL);
3155 /* squash all but -EFAULT for now */
3156 return err == -EFAULT ? err : 0;
3157}
3158
3159int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3160{
3161 struct task_struct *t = current;
3162 return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
3163 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3164 __put_user(t->sas_ss_size, &uss->ss_size);
3165}
3166#endif
3167#endif
3168 2988
3169#ifdef __ARCH_WANT_SYS_SIGPENDING 2989#ifdef __ARCH_WANT_SYS_SIGPENDING
3170 2990
@@ -3201,6 +3021,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3201 if (nset) { 3021 if (nset) {
3202 if (copy_from_user(&new_set, nset, sizeof(*nset))) 3022 if (copy_from_user(&new_set, nset, sizeof(*nset)))
3203 return -EFAULT; 3023 return -EFAULT;
3024 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
3204 3025
3205 new_blocked = current->blocked; 3026 new_blocked = current->blocked;
3206 3027
@@ -3282,7 +3103,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
3282 int old = current->blocked.sig[0]; 3103 int old = current->blocked.sig[0];
3283 sigset_t newset; 3104 sigset_t newset;
3284 3105
3285 siginitset(&newset, newmask); 3106 siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
3286 set_current_blocked(&newset); 3107 set_current_blocked(&newset);
3287 3108
3288 return old; 3109 return old;
@@ -3321,17 +3142,6 @@ SYSCALL_DEFINE0(pause)
3321 3142
3322#endif 3143#endif
3323 3144
3324int sigsuspend(sigset_t *set)
3325{
3326 current->saved_sigmask = current->blocked;
3327 set_current_blocked(set);
3328
3329 current->state = TASK_INTERRUPTIBLE;
3330 schedule();
3331 set_restore_sigmask();
3332 return -ERESTARTNOHAND;
3333}
3334
3335#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 3145#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3336/** 3146/**
3337 * sys_rt_sigsuspend - replace the signal mask for a value with the 3147 * sys_rt_sigsuspend - replace the signal mask for a value with the
@@ -3349,7 +3159,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3349 3159
3350 if (copy_from_user(&newset, unewset, sizeof(newset))) 3160 if (copy_from_user(&newset, unewset, sizeof(newset)))
3351 return -EFAULT; 3161 return -EFAULT;
3352 return sigsuspend(&newset); 3162 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3163
3164 current->saved_sigmask = current->blocked;
3165 set_current_blocked(&newset);
3166
3167 current->state = TASK_INTERRUPTIBLE;
3168 schedule();
3169 set_restore_sigmask();
3170 return -ERESTARTNOHAND;
3353} 3171}
3354#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 3172#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
3355 3173
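
The right-hand column strips SIGKILL and SIGSTOP from the requested mask before blocking (the sigdelsetmask()/sigmask() lines above); the kernel never lets userspace block those two signals. A minimal userspace sketch (not part of this patch; plain POSIX calls only) illustrating that behaviour:

/*
 * Sketch only: blocking SIGKILL/SIGSTOP in the sigsuspend() mask is
 * silently ignored, so a caught signal such as SIGUSR1 still wakes us.
 */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void on_usr1(int sig) { (void)sig; }

int main(void)
{
    sigset_t wait_mask;

    signal(SIGUSR1, on_usr1);

    sigemptyset(&wait_mask);
    sigaddset(&wait_mask, SIGKILL);   /* ignored by the kernel */
    sigaddset(&wait_mask, SIGSTOP);   /* ignored by the kernel */

    printf("pid %d waiting for SIGUSR1\n", (int)getpid());
    sigsuspend(&wait_mask);           /* returns -1/EINTR once a caught signal arrives */
    puts("woken by a caught signal");
    return 0;
}
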
diff --git a/kernel/smp.c b/kernel/smp.c
index 29dd40a9f2f..fb67dfa8394 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -6,15 +6,13 @@
6#include <linux/rcupdate.h> 6#include <linux/rcupdate.h>
7#include <linux/rculist.h> 7#include <linux/rculist.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/export.h> 9#include <linux/module.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#include "smpboot.h"
17
18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19static struct { 17static struct {
20 struct list_head queue; 18 struct list_head queue;
@@ -581,6 +579,26 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
581 return 0; 579 return 0;
582} 580}
583EXPORT_SYMBOL(smp_call_function); 581EXPORT_SYMBOL(smp_call_function);
582
583void ipi_call_lock(void)
584{
585 raw_spin_lock(&call_function.lock);
586}
587
588void ipi_call_unlock(void)
589{
590 raw_spin_unlock(&call_function.lock);
591}
592
593void ipi_call_lock_irq(void)
594{
595 raw_spin_lock_irq(&call_function.lock);
596}
597
598void ipi_call_unlock_irq(void)
599{
600 raw_spin_unlock_irq(&call_function.lock);
601}
584#endif /* USE_GENERIC_SMP_HELPERS */ 602#endif /* USE_GENERIC_SMP_HELPERS */
585 603
586/* Setup configured maximum number of CPUs to activate */ 604/* Setup configured maximum number of CPUs to activate */
@@ -651,8 +669,6 @@ void __init smp_init(void)
651{ 669{
652 unsigned int cpu; 670 unsigned int cpu;
653 671
654 idle_threads_init();
655
656 /* FIXME: This should be done in userspace --RR */ 672 /* FIXME: This should be done in userspace --RR */
657 for_each_present_cpu(cpu) { 673 for_each_present_cpu(cpu) {
658 if (num_online_cpus() >= setup_max_cpus) 674 if (num_online_cpus() >= setup_max_cpus)
@@ -685,116 +701,3 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
685 return ret; 701 return ret;
686} 702}
687EXPORT_SYMBOL(on_each_cpu); 703EXPORT_SYMBOL(on_each_cpu);
688
689/**
690 * on_each_cpu_mask(): Run a function on processors specified by
691 * cpumask, which may include the local processor.
692 * @mask: The set of cpus to run on (only runs on online subset).
693 * @func: The function to run. This must be fast and non-blocking.
694 * @info: An arbitrary pointer to pass to the function.
695 * @wait: If true, wait (atomically) until function has completed
696 * on other CPUs.
697 *
698 * If @wait is true, then returns once @func has returned.
699 *
700 * You must not call this function with disabled interrupts or
701 * from a hardware interrupt handler or from a bottom half handler.
702 */
703void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
704 void *info, bool wait)
705{
706 int cpu = get_cpu();
707
708 smp_call_function_many(mask, func, info, wait);
709 if (cpumask_test_cpu(cpu, mask)) {
710 local_irq_disable();
711 func(info);
712 local_irq_enable();
713 }
714 put_cpu();
715}
716EXPORT_SYMBOL(on_each_cpu_mask);
717
718/*
719 * on_each_cpu_cond(): Call a function on each processor for which
720 * the supplied function cond_func returns true, optionally waiting
721 * for all the required CPUs to finish. This may include the local
722 * processor.
723 * @cond_func: A callback function that is passed a cpu id and
 724 * the info parameter. The function is called
 725 * with preemption disabled. The function should
 726 * return a boolean value indicating whether to IPI
727 * the specified CPU.
728 * @func: The function to run on all applicable CPUs.
729 * This must be fast and non-blocking.
730 * @info: An arbitrary pointer to pass to both functions.
731 * @wait: If true, wait (atomically) until function has
732 * completed on other CPUs.
733 * @gfp_flags: GFP flags to use when allocating the cpumask
734 * used internally by the function.
735 *
 736 * The function might sleep if the GFP flags indicate a non
737 * atomic allocation is allowed.
738 *
739 * Preemption is disabled to protect against CPUs going offline but not online.
740 * CPUs going online during the call will not be seen or sent an IPI.
741 *
742 * You must not call this function with disabled interrupts or
743 * from a hardware interrupt handler or from a bottom half handler.
744 */
745void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
746 smp_call_func_t func, void *info, bool wait,
747 gfp_t gfp_flags)
748{
749 cpumask_var_t cpus;
750 int cpu, ret;
751
752 might_sleep_if(gfp_flags & __GFP_WAIT);
753
754 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
755 preempt_disable();
756 for_each_online_cpu(cpu)
757 if (cond_func(cpu, info))
758 cpumask_set_cpu(cpu, cpus);
759 on_each_cpu_mask(cpus, func, info, wait);
760 preempt_enable();
761 free_cpumask_var(cpus);
762 } else {
763 /*
764 * No free cpumask, bother. No matter, we'll
765 * just have to IPI them one by one.
766 */
767 preempt_disable();
768 for_each_online_cpu(cpu)
769 if (cond_func(cpu, info)) {
770 ret = smp_call_function_single(cpu, func,
771 info, wait);
772 WARN_ON_ONCE(!ret);
773 }
774 preempt_enable();
775 }
776}
777EXPORT_SYMBOL(on_each_cpu_cond);
778
779static void do_nothing(void *unused)
780{
781}
782
783/**
784 * kick_all_cpus_sync - Force all cpus out of idle
785 *
786 * Used to synchronize the update of pm_idle function pointer. It's
787 * called after the pointer is updated and returns after the dummy
788 * callback function has been executed on all cpus. The execution of
789 * the function can only happen on the remote cpus after they have
790 * left the idle function which had been called via pm_idle function
791 * pointer. So it's guaranteed that nothing uses the previous pointer
792 * anymore.
793 */
794void kick_all_cpus_sync(void)
795{
796 /* Make sure the change is visible before we kick the cpus */
797 smp_mb();
798 smp_call_function(do_nothing, NULL, 1);
799}
800EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
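
The smp.c hunks above remove the on_each_cpu_mask(), on_each_cpu_cond() and kick_all_cpus_sync() helpers whose kernel-doc is quoted in the deleted lines. A minimal caller sketch of the removed on_each_cpu_mask() interface (not part of this patch; flush_local_cache() is a made-up callback name):

/*
 * Sketch only: run a fast, non-blocking callback on a set of CPUs and
 * wait for it to finish everywhere, using the interface removed above.
 */
#include <linux/cpumask.h>
#include <linux/smp.h>

static void flush_local_cache(void *info)
{
    /* runs on every selected CPU, in IPI context on the remote ones */
}

static void flush_caches_on(const struct cpumask *cpus)
{
    /* wait == true: return only after every targeted CPU ran the callback */
    on_each_cpu_mask(cpus, flush_local_cache, NULL, true);
}
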
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
deleted file mode 100644
index d6c5fc05424..00000000000
--- a/kernel/smpboot.c
+++ /dev/null
@@ -1,300 +0,0 @@
1/*
2 * Common SMP CPU bringup/teardown functions
3 */
4#include <linux/cpu.h>
5#include <linux/err.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/list.h>
9#include <linux/slab.h>
10#include <linux/sched.h>
11#include <linux/export.h>
12#include <linux/percpu.h>
13#include <linux/kthread.h>
14#include <linux/smpboot.h>
15
16#include "smpboot.h"
17
18#ifdef CONFIG_SMP
19
20#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
21/*
22 * For the hotplug case we keep the task structs around and reuse
23 * them.
24 */
25static DEFINE_PER_CPU(struct task_struct *, idle_threads);
26
27struct task_struct * __cpuinit idle_thread_get(unsigned int cpu)
28{
29 struct task_struct *tsk = per_cpu(idle_threads, cpu);
30
31 if (!tsk)
32 return ERR_PTR(-ENOMEM);
33 init_idle(tsk, cpu);
34 return tsk;
35}
36
37void __init idle_thread_set_boot_cpu(void)
38{
39 per_cpu(idle_threads, smp_processor_id()) = current;
40}
41
42/**
43 * idle_init - Initialize the idle thread for a cpu
44 * @cpu: The cpu for which the idle thread should be initialized
45 *
46 * Creates the thread if it does not exist.
47 */
48static inline void idle_init(unsigned int cpu)
49{
50 struct task_struct *tsk = per_cpu(idle_threads, cpu);
51
52 if (!tsk) {
53 tsk = fork_idle(cpu);
54 if (IS_ERR(tsk))
55 pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
56 else
57 per_cpu(idle_threads, cpu) = tsk;
58 }
59}
60
61/**
62 * idle_threads_init - Initialize idle threads for all cpus
63 */
64void __init idle_threads_init(void)
65{
66 unsigned int cpu, boot_cpu;
67
68 boot_cpu = smp_processor_id();
69
70 for_each_possible_cpu(cpu) {
71 if (cpu != boot_cpu)
72 idle_init(cpu);
73 }
74}
75#endif
76
77#endif /* #ifdef CONFIG_SMP */
78
79static LIST_HEAD(hotplug_threads);
80static DEFINE_MUTEX(smpboot_threads_lock);
81
82struct smpboot_thread_data {
83 unsigned int cpu;
84 unsigned int status;
85 struct smp_hotplug_thread *ht;
86};
87
88enum {
89 HP_THREAD_NONE = 0,
90 HP_THREAD_ACTIVE,
91 HP_THREAD_PARKED,
92};
93
94/**
95 * smpboot_thread_fn - percpu hotplug thread loop function
96 * @data: thread data pointer
97 *
98 * Checks for thread stop and park conditions. Calls the necessary
99 * setup, cleanup, park and unpark functions for the registered
100 * thread.
101 *
102 * Returns 1 when the thread should exit, 0 otherwise.
103 */
104static int smpboot_thread_fn(void *data)
105{
106 struct smpboot_thread_data *td = data;
107 struct smp_hotplug_thread *ht = td->ht;
108
109 while (1) {
110 set_current_state(TASK_INTERRUPTIBLE);
111 preempt_disable();
112 if (kthread_should_stop()) {
113 set_current_state(TASK_RUNNING);
114 preempt_enable();
115 if (ht->cleanup)
116 ht->cleanup(td->cpu, cpu_online(td->cpu));
117 kfree(td);
118 return 0;
119 }
120
121 if (kthread_should_park()) {
122 __set_current_state(TASK_RUNNING);
123 preempt_enable();
124 if (ht->park && td->status == HP_THREAD_ACTIVE) {
125 BUG_ON(td->cpu != smp_processor_id());
126 ht->park(td->cpu);
127 td->status = HP_THREAD_PARKED;
128 }
129 kthread_parkme();
130 /* We might have been woken for stop */
131 continue;
132 }
133
134 BUG_ON(td->cpu != smp_processor_id());
135
136 /* Check for state change setup */
137 switch (td->status) {
138 case HP_THREAD_NONE:
139 preempt_enable();
140 if (ht->setup)
141 ht->setup(td->cpu);
142 td->status = HP_THREAD_ACTIVE;
143 preempt_disable();
144 break;
145 case HP_THREAD_PARKED:
146 preempt_enable();
147 if (ht->unpark)
148 ht->unpark(td->cpu);
149 td->status = HP_THREAD_ACTIVE;
150 preempt_disable();
151 break;
152 }
153
154 if (!ht->thread_should_run(td->cpu)) {
155 preempt_enable();
156 schedule();
157 } else {
158 set_current_state(TASK_RUNNING);
159 preempt_enable();
160 ht->thread_fn(td->cpu);
161 }
162 }
163}
164
165static int
166__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
167{
168 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
169 struct smpboot_thread_data *td;
170
171 if (tsk)
172 return 0;
173
174 td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
175 if (!td)
176 return -ENOMEM;
177 td->cpu = cpu;
178 td->ht = ht;
179
180 tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
181 ht->thread_comm);
182 if (IS_ERR(tsk)) {
183 kfree(td);
184 return PTR_ERR(tsk);
185 }
186
187 get_task_struct(tsk);
188 *per_cpu_ptr(ht->store, cpu) = tsk;
189 return 0;
190}
191
192int smpboot_create_threads(unsigned int cpu)
193{
194 struct smp_hotplug_thread *cur;
195 int ret = 0;
196
197 mutex_lock(&smpboot_threads_lock);
198 list_for_each_entry(cur, &hotplug_threads, list) {
199 ret = __smpboot_create_thread(cur, cpu);
200 if (ret)
201 break;
202 }
203 mutex_unlock(&smpboot_threads_lock);
204 return ret;
205}
206
207static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
208{
209 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
210
211 kthread_unpark(tsk);
212}
213
214void smpboot_unpark_threads(unsigned int cpu)
215{
216 struct smp_hotplug_thread *cur;
217
218 mutex_lock(&smpboot_threads_lock);
219 list_for_each_entry(cur, &hotplug_threads, list)
220 smpboot_unpark_thread(cur, cpu);
221 mutex_unlock(&smpboot_threads_lock);
222}
223
224static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
225{
226 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
227
228 if (tsk)
229 kthread_park(tsk);
230}
231
232void smpboot_park_threads(unsigned int cpu)
233{
234 struct smp_hotplug_thread *cur;
235
236 mutex_lock(&smpboot_threads_lock);
237 list_for_each_entry_reverse(cur, &hotplug_threads, list)
238 smpboot_park_thread(cur, cpu);
239 mutex_unlock(&smpboot_threads_lock);
240}
241
242static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
243{
244 unsigned int cpu;
245
 246 /* We also need to destroy the parked threads of offline cpus */
247 for_each_possible_cpu(cpu) {
248 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
249
250 if (tsk) {
251 kthread_stop(tsk);
252 put_task_struct(tsk);
253 *per_cpu_ptr(ht->store, cpu) = NULL;
254 }
255 }
256}
257
258/**
259 * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
260 * @plug_thread: Hotplug thread descriptor
261 *
262 * Creates and starts the threads on all online cpus.
263 */
264int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
265{
266 unsigned int cpu;
267 int ret = 0;
268
269 mutex_lock(&smpboot_threads_lock);
270 for_each_online_cpu(cpu) {
271 ret = __smpboot_create_thread(plug_thread, cpu);
272 if (ret) {
273 smpboot_destroy_threads(plug_thread);
274 goto out;
275 }
276 smpboot_unpark_thread(plug_thread, cpu);
277 }
278 list_add(&plug_thread->list, &hotplug_threads);
279out:
280 mutex_unlock(&smpboot_threads_lock);
281 return ret;
282}
283EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
284
285/**
286 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
287 * @plug_thread: Hotplug thread descriptor
288 *
289 * Stops all threads on all possible cpus.
290 */
291void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
292{
293 get_online_cpus();
294 mutex_lock(&smpboot_threads_lock);
295 list_del(&plug_thread->list);
296 smpboot_destroy_threads(plug_thread);
297 mutex_unlock(&smpboot_threads_lock);
298 put_online_cpus();
299}
300EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
deleted file mode 100644
index 72415a0eb95..00000000000
--- a/kernel/smpboot.h
+++ /dev/null
@@ -1,20 +0,0 @@
1#ifndef SMPBOOT_H
2#define SMPBOOT_H
3
4struct task_struct;
5
6#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
7struct task_struct *idle_thread_get(unsigned int cpu);
8void idle_thread_set_boot_cpu(void);
9void idle_threads_init(void);
10#else
11static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
12static inline void idle_thread_set_boot_cpu(void) { }
13static inline void idle_threads_init(void) { }
14#endif
15
16int smpboot_create_threads(unsigned int cpu);
17void smpboot_park_threads(unsigned int cpu);
18void smpboot_unpark_threads(unsigned int cpu);
19
20#endif
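
The deleted kernel/smpboot.c and kernel/smpboot.h provided the smp_hotplug_thread registration API used by the softirq_threads conversion reverted further down. A minimal sketch of that registration pattern (not part of this patch; the field names follow the softirq_threads user below, and all demo_* names are made up):

/*
 * Sketch only: register a per-cpu hotplug thread with the API served by
 * the deleted kernel/smpboot.c.
 */
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, demo_thread);

static int demo_should_run(unsigned int cpu)
{
    return 0;    /* nonzero when there is per-cpu work pending */
}

static void demo_fn(unsigned int cpu)
{
    /* handle one batch of per-cpu work; preemption is enabled here */
}

static struct smp_hotplug_thread demo_threads = {
    .store             = &demo_thread,
    .thread_should_run = demo_should_run,
    .thread_fn         = demo_fn,
    .thread_comm       = "demo/%u",
};

static int __init demo_init(void)
{
    /* creates, parks and unparks the per-cpu thread as CPUs come and go */
    return smpboot_register_percpu_thread(&demo_threads);
}
early_initcall(demo_init);
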
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ed567babe78..fca82c32042 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -10,7 +10,7 @@
10 * Remote softirq infrastructure is by Jens Axboe. 10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 11 */
12 12
13#include <linux/export.h> 13#include <linux/module.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/init.h> 16#include <linux/init.h>
@@ -23,7 +23,6 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/smp.h> 25#include <linux/smp.h>
26#include <linux/smpboot.h>
27#include <linux/tick.h> 26#include <linux/tick.h>
28 27
29#define CREATE_TRACE_POINTS 28#define CREATE_TRACE_POINTS
@@ -211,17 +210,9 @@ asmlinkage void __do_softirq(void)
211 __u32 pending; 210 __u32 pending;
212 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
213 int cpu; 212 int cpu;
214 unsigned long old_flags = current->flags;
215
216 /*
 217 * Mask out PF_MEMALLOC, as the current task context is borrowed for the
 218 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
219 * again if the socket is related to swap
220 */
221 current->flags &= ~PF_MEMALLOC;
222 213
223 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
224 vtime_account_irq_enter(current); 215 account_system_vtime(current);
225 216
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 218 SOFTIRQ_OFFSET);
@@ -272,9 +263,8 @@ restart:
272 263
273 lockdep_softirq_exit(); 264 lockdep_softirq_exit();
274 265
275 vtime_account_irq_exit(current); 266 account_system_vtime(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 267 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 268}
279 269
280#ifndef __ARCH_HAS_DO_SOFTIRQ 270#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -307,7 +297,7 @@ void irq_enter(void)
307 int cpu = smp_processor_id(); 297 int cpu = smp_processor_id();
308 298
309 rcu_irq_enter(); 299 rcu_irq_enter();
310 if (is_idle_task(current) && !in_interrupt()) { 300 if (idle_cpu(cpu) && !in_interrupt()) {
311 /* 301 /*
312 * Prevent raise_softirq from needlessly waking up ksoftirqd 302 * Prevent raise_softirq from needlessly waking up ksoftirqd
313 * here, as softirq will be serviced on return from interrupt. 303 * here, as softirq will be serviced on return from interrupt.
@@ -320,40 +310,50 @@ void irq_enter(void)
320 __irq_enter(); 310 __irq_enter();
321} 311}
322 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
323static inline void invoke_softirq(void) 314static inline void invoke_softirq(void)
324{ 315{
325 if (!force_irqthreads) { 316 if (!force_irqthreads)
326#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
327 __do_softirq(); 317 __do_softirq();
318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET);
321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET);
323 }
324}
328#else 325#else
326static inline void invoke_softirq(void)
327{
328 if (!force_irqthreads)
329 do_softirq(); 329 do_softirq();
330#endif 330 else {
331 } else {
332 __local_bh_disable((unsigned long)__builtin_return_address(0), 331 __local_bh_disable((unsigned long)__builtin_return_address(0),
333 SOFTIRQ_OFFSET); 332 SOFTIRQ_OFFSET);
334 wakeup_softirqd(); 333 wakeup_softirqd();
335 __local_bh_enable(SOFTIRQ_OFFSET); 334 __local_bh_enable(SOFTIRQ_OFFSET);
336 } 335 }
337} 336}
337#endif
338 338
339/* 339/*
340 * Exit an interrupt context. Process softirqs if needed and possible: 340 * Exit an interrupt context. Process softirqs if needed and possible:
341 */ 341 */
342void irq_exit(void) 342void irq_exit(void)
343{ 343{
344 vtime_account_irq_exit(current); 344 account_system_vtime(current);
345 trace_hardirq_exit(); 345 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 346 sub_preempt_count(IRQ_EXIT_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 rcu_irq_exit();
350#ifdef CONFIG_NO_HZ 351#ifdef CONFIG_NO_HZ
351 /* Make sure that timer wheel updates are propagated */ 352 /* Make sure that timer wheel updates are propagated */
352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 353 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
353 tick_nohz_irq_exit(); 354 tick_nohz_stop_sched_tick(0);
354#endif 355#endif
355 rcu_irq_exit(); 356 preempt_enable_no_resched();
356 sched_preempt_enable_no_resched();
357} 357}
358 358
359/* 359/*
@@ -385,12 +385,6 @@ void raise_softirq(unsigned int nr)
385 local_irq_restore(flags); 385 local_irq_restore(flags);
386} 386}
387 387
388void __raise_softirq_irqoff(unsigned int nr)
389{
390 trace_softirq_raise(nr);
391 or_softirq_pending(1UL << nr);
392}
393
394void open_softirq(int nr, void (*action)(struct softirq_action *)) 388void open_softirq(int nr, void (*action)(struct softirq_action *))
395{ 389{
396 softirq_vec[nr].action = action; 390 softirq_vec[nr].action = action;
@@ -743,22 +737,51 @@ void __init softirq_init(void)
743 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 737 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
744} 738}
745 739
746static int ksoftirqd_should_run(unsigned int cpu) 740static int run_ksoftirqd(void * __bind_cpu)
747{ 741{
748 return local_softirq_pending(); 742 set_current_state(TASK_INTERRUPTIBLE);
749}
750 743
751static void run_ksoftirqd(unsigned int cpu) 744 while (!kthread_should_stop()) {
752{ 745 preempt_disable();
753 local_irq_disable(); 746 if (!local_softirq_pending()) {
754 if (local_softirq_pending()) { 747 preempt_enable_no_resched();
755 __do_softirq(); 748 schedule();
756 rcu_note_context_switch(cpu); 749 preempt_disable();
757 local_irq_enable(); 750 }
758 cond_resched(); 751
759 return; 752 __set_current_state(TASK_RUNNING);
753
754 while (local_softirq_pending()) {
755 /* Preempt disable stops cpu going offline.
756 If already offline, we'll be on wrong CPU:
757 don't process */
758 if (cpu_is_offline((long)__bind_cpu))
759 goto wait_to_die;
760 local_irq_disable();
761 if (local_softirq_pending())
762 __do_softirq();
763 local_irq_enable();
764 preempt_enable_no_resched();
765 cond_resched();
766 preempt_disable();
767 rcu_note_context_switch((long)__bind_cpu);
768 }
769 preempt_enable();
770 set_current_state(TASK_INTERRUPTIBLE);
760 } 771 }
761 local_irq_enable(); 772 __set_current_state(TASK_RUNNING);
773 return 0;
774
775wait_to_die:
776 preempt_enable();
777 /* Wait for kthread_stop */
778 set_current_state(TASK_INTERRUPTIBLE);
779 while (!kthread_should_stop()) {
780 schedule();
781 set_current_state(TASK_INTERRUPTIBLE);
782 }
783 __set_current_state(TASK_RUNNING);
784 return 0;
762} 785}
763 786
764#ifdef CONFIG_HOTPLUG_CPU 787#ifdef CONFIG_HOTPLUG_CPU
@@ -824,14 +847,50 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
824 unsigned long action, 847 unsigned long action,
825 void *hcpu) 848 void *hcpu)
826{ 849{
850 int hotcpu = (unsigned long)hcpu;
851 struct task_struct *p;
852
827 switch (action) { 853 switch (action) {
854 case CPU_UP_PREPARE:
855 case CPU_UP_PREPARE_FROZEN:
856 p = kthread_create_on_node(run_ksoftirqd,
857 hcpu,
858 cpu_to_node(hotcpu),
859 "ksoftirqd/%d", hotcpu);
860 if (IS_ERR(p)) {
861 printk("ksoftirqd for %i failed\n", hotcpu);
862 return notifier_from_errno(PTR_ERR(p));
863 }
864 kthread_bind(p, hotcpu);
865 per_cpu(ksoftirqd, hotcpu) = p;
866 break;
867 case CPU_ONLINE:
868 case CPU_ONLINE_FROZEN:
869 wake_up_process(per_cpu(ksoftirqd, hotcpu));
870 break;
828#ifdef CONFIG_HOTPLUG_CPU 871#ifdef CONFIG_HOTPLUG_CPU
872 case CPU_UP_CANCELED:
873 case CPU_UP_CANCELED_FROZEN:
874 if (!per_cpu(ksoftirqd, hotcpu))
875 break;
876 /* Unbind so it can run. Fall thru. */
877 kthread_bind(per_cpu(ksoftirqd, hotcpu),
878 cpumask_any(cpu_online_mask));
829 case CPU_DEAD: 879 case CPU_DEAD:
830 case CPU_DEAD_FROZEN: 880 case CPU_DEAD_FROZEN: {
831 takeover_tasklets((unsigned long)hcpu); 881 static const struct sched_param param = {
882 .sched_priority = MAX_RT_PRIO-1
883 };
884
885 p = per_cpu(ksoftirqd, hotcpu);
886 per_cpu(ksoftirqd, hotcpu) = NULL;
887 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
888 kthread_stop(p);
889 takeover_tasklets(hotcpu);
832 break; 890 break;
833#endif /* CONFIG_HOTPLUG_CPU */
834 } 891 }
892#endif /* CONFIG_HOTPLUG_CPU */
893 }
835 return NOTIFY_OK; 894 return NOTIFY_OK;
836} 895}
837 896
@@ -839,19 +898,14 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
839 .notifier_call = cpu_callback 898 .notifier_call = cpu_callback
840}; 899};
841 900
842static struct smp_hotplug_thread softirq_threads = {
843 .store = &ksoftirqd,
844 .thread_should_run = ksoftirqd_should_run,
845 .thread_fn = run_ksoftirqd,
846 .thread_comm = "ksoftirqd/%u",
847};
848
849static __init int spawn_ksoftirqd(void) 901static __init int spawn_ksoftirqd(void)
850{ 902{
851 register_cpu_notifier(&cpu_nfb); 903 void *cpu = (void *)(long)smp_processor_id();
852 904 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
853 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
854 905
906 BUG_ON(err != NOTIFY_OK);
907 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
908 register_cpu_notifier(&cpu_nfb);
855 return 0; 909 return 0;
856} 910}
857early_initcall(spawn_ksoftirqd); 911early_initcall(spawn_ksoftirqd);
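
With the smpboot conversion gone, the right-hand column takes ksoftirqd back to the classic per-CPU kthread plus CPU-hotplug-notifier scheme. A bare skeleton of that notifier pattern (not part of this patch; all demo_* names are made up):

/*
 * Sketch only: the pre-smpboot hotplug-notifier shape that the restored
 * cpu_callback()/spawn_ksoftirqd() code above follows.
 */
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int demo_cpu_callback(struct notifier_block *nfb,
                             unsigned long action, void *hcpu)
{
    switch (action & ~CPU_TASKS_FROZEN) {
    case CPU_UP_PREPARE:
        /* allocate or create per-cpu resources for cpu (unsigned long)hcpu */
        break;
    case CPU_ONLINE:
        /* start using them */
        break;
    case CPU_DEAD:
        /* tear them down again */
        break;
    }
    return NOTIFY_OK;
}

static struct notifier_block demo_cpu_nfb = {
    .notifier_call = demo_cpu_callback,
};

static int __init demo_hotplug_init(void)
{
    register_cpu_notifier(&demo_cpu_nfb);
    return 0;
}
early_initcall(demo_hotplug_init);
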
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5cdd8065a3c..be6517fb9c1 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -19,7 +19,7 @@
19#include <linux/spinlock.h> 19#include <linux/spinlock.h>
20#include <linux/interrupt.h> 20#include <linux/interrupt.h>
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/export.h> 22#include <linux/module.h>
23 23
24/* 24/*
25 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
@@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
163EXPORT_SYMBOL(_raw_spin_lock_bh); 163EXPORT_SYMBOL(_raw_spin_lock_bh);
164#endif 164#endif
165 165
166#ifdef CONFIG_UNINLINE_SPIN_UNLOCK 166#ifndef CONFIG_INLINE_SPIN_UNLOCK
167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
168{ 168{
169 __raw_spin_unlock(lock); 169 __raw_spin_unlock(lock);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2b859828cdc..73ce23feaea 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,17 +16,15 @@
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
20 * 19 *
21 * Author: Paul McKenney <paulmck@us.ibm.com> 20 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 * 21 *
24 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt 23 * Documentation/RCU/ *.txt
26 * 24 *
27 */ 25 */
28 26
29#include <linux/export.h> 27#include <linux/module.h>
30#include <linux/mutex.h> 28#include <linux/mutex.h>
31#include <linux/percpu.h> 29#include <linux/percpu.h>
32#include <linux/preempt.h> 30#include <linux/preempt.h>
@@ -36,78 +34,10 @@
36#include <linux/delay.h> 34#include <linux/delay.h>
37#include <linux/srcu.h> 35#include <linux/srcu.h>
38 36
39#include <trace/events/rcu.h>
40
41#include "rcu.h"
42
43/*
44 * Initialize an rcu_batch structure to empty.
45 */
46static inline void rcu_batch_init(struct rcu_batch *b)
47{
48 b->head = NULL;
49 b->tail = &b->head;
50}
51
52/*
53 * Enqueue a callback onto the tail of the specified rcu_batch structure.
54 */
55static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
56{
57 *b->tail = head;
58 b->tail = &head->next;
59}
60
61/*
62 * Is the specified rcu_batch structure empty?
63 */
64static inline bool rcu_batch_empty(struct rcu_batch *b)
65{
66 return b->tail == &b->head;
67}
68
69/*
70 * Remove the callback at the head of the specified rcu_batch structure
71 * and return a pointer to it, or return NULL if the structure is empty.
72 */
73static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
74{
75 struct rcu_head *head;
76
77 if (rcu_batch_empty(b))
78 return NULL;
79
80 head = b->head;
81 b->head = head->next;
82 if (b->tail == &head->next)
83 rcu_batch_init(b);
84
85 return head;
86}
87
88/*
89 * Move all callbacks from the rcu_batch structure specified by "from" to
90 * the structure specified by "to".
91 */
92static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
93{
94 if (!rcu_batch_empty(from)) {
95 *to->tail = from->head;
96 to->tail = from->tail;
97 rcu_batch_init(from);
98 }
99}
100
101static int init_srcu_struct_fields(struct srcu_struct *sp) 37static int init_srcu_struct_fields(struct srcu_struct *sp)
102{ 38{
103 sp->completed = 0; 39 sp->completed = 0;
104 spin_lock_init(&sp->queue_lock); 40 mutex_init(&sp->mutex);
105 sp->running = false;
106 rcu_batch_init(&sp->batch_queue);
107 rcu_batch_init(&sp->batch_check0);
108 rcu_batch_init(&sp->batch_check1);
109 rcu_batch_init(&sp->batch_done);
110 INIT_DELAYED_WORK(&sp->work, process_srcu);
111 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
112 return sp->per_cpu_ref ? 0 : -ENOMEM; 42 return sp->per_cpu_ref ? 0 : -ENOMEM;
113} 43}
@@ -143,116 +73,21 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
143#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 73#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
144 74
145/* 75/*
146 * Returns approximate total of the readers' ->seq[] values for the 76 * srcu_readers_active_idx -- returns approximate number of readers
147 * rank of per-CPU counters specified by idx. 77 * active on the specified rank of per-CPU counters.
148 */ 78 */
149static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
150{
151 int cpu;
152 unsigned long sum = 0;
153 unsigned long t;
154
155 for_each_possible_cpu(cpu) {
156 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
157 sum += t;
158 }
159 return sum;
160}
161 79
162/* 80static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
163 * Returns approximate number of readers active on the specified rank
164 * of the per-CPU ->c[] counters.
165 */
166static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
167{ 81{
168 int cpu; 82 int cpu;
169 unsigned long sum = 0; 83 int sum;
170 unsigned long t;
171 84
172 for_each_possible_cpu(cpu) { 85 sum = 0;
173 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); 86 for_each_possible_cpu(cpu)
174 sum += t; 87 sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
175 }
176 return sum; 88 return sum;
177} 89}
178 90
179/*
180 * Return true if the number of pre-existing readers is determined to
181 * be stably zero. An example unstable zero can occur if the call
182 * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
183 * but due to task migration, sees the corresponding __srcu_read_unlock()
184 * decrement. This can happen because srcu_readers_active_idx() takes
185 * time to sum the array, and might in fact be interrupted or preempted
186 * partway through the summation.
187 */
188static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
189{
190 unsigned long seq;
191
192 seq = srcu_readers_seq_idx(sp, idx);
193
194 /*
195 * The following smp_mb() A pairs with the smp_mb() B located in
196 * __srcu_read_lock(). This pairing ensures that if an
197 * __srcu_read_lock() increments its counter after the summation
198 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
199 * critical section will see any changes made prior to the start
200 * of the current SRCU grace period.
201 *
202 * Also, if the above call to srcu_readers_seq_idx() saw the
203 * increment of ->seq[], then the call to srcu_readers_active_idx()
204 * must see the increment of ->c[].
205 */
206 smp_mb(); /* A */
207
208 /*
209 * Note that srcu_readers_active_idx() can incorrectly return
210 * zero even though there is a pre-existing reader throughout.
211 * To see this, suppose that task A is in a very long SRCU
212 * read-side critical section that started on CPU 0, and that
213 * no other reader exists, so that the sum of the counters
214 * is equal to one. Then suppose that task B starts executing
215 * srcu_readers_active_idx(), summing up to CPU 1, and then that
216 * task C starts reading on CPU 0, so that its increment is not
217 * summed, but finishes reading on CPU 2, so that its decrement
218 * -is- summed. Then when task B completes its sum, it will
219 * incorrectly get zero, despite the fact that task A has been
220 * in its SRCU read-side critical section the whole time.
221 *
222 * We therefore do a validation step should srcu_readers_active_idx()
223 * return zero.
224 */
225 if (srcu_readers_active_idx(sp, idx) != 0)
226 return false;
227
228 /*
229 * The remainder of this function is the validation step.
230 * The following smp_mb() D pairs with the smp_mb() C in
231 * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
232 * by srcu_readers_active_idx() above, then any destructive
233 * operation performed after the grace period will happen after
234 * the corresponding SRCU read-side critical section.
235 *
236 * Note that there can be at most NR_CPUS worth of readers using
237 * the old index, which is not enough to overflow even a 32-bit
238 * integer. (Yes, this does mean that systems having more than
239 * a billion or so CPUs need to be 64-bit systems.) Therefore,
240 * the sum of the ->seq[] counters cannot possibly overflow.
241 * Therefore, the only way that the return values of the two
242 * calls to srcu_readers_seq_idx() can be equal is if there were
243 * no increments of the corresponding rank of ->seq[] counts
244 * in the interim. But the missed-increment scenario laid out
245 * above includes an increment of the ->seq[] counter by
246 * the corresponding __srcu_read_lock(). Therefore, if this
247 * scenario occurs, the return values from the two calls to
248 * srcu_readers_seq_idx() will differ, and thus the validation
249 * step below suffices.
250 */
251 smp_mb(); /* D */
252
253 return srcu_readers_seq_idx(sp, idx) == seq;
254}
255
256/** 91/**
257 * srcu_readers_active - returns approximate number of readers. 92 * srcu_readers_active - returns approximate number of readers.
258 * @sp: which srcu_struct to count active readers (holding srcu_read_lock). 93 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
@@ -263,14 +98,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
263 */ 98 */
264static int srcu_readers_active(struct srcu_struct *sp) 99static int srcu_readers_active(struct srcu_struct *sp)
265{ 100{
266 int cpu; 101 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
267 unsigned long sum = 0;
268
269 for_each_possible_cpu(cpu) {
270 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
271 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
272 }
273 return sum;
274} 102}
275 103
276/** 104/**
@@ -303,11 +131,10 @@ int __srcu_read_lock(struct srcu_struct *sp)
303 int idx; 131 int idx;
304 132
305 preempt_disable(); 133 preempt_disable();
306 idx = rcu_dereference_index_check(sp->completed, 134 idx = sp->completed & 0x1;
307 rcu_read_lock_sched_held()) & 0x1; 135 barrier(); /* ensure compiler looks -once- at sp->completed. */
308 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; 136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
309 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 137 srcu_barrier(); /* ensure compiler won't misorder critical section. */
310 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
311 preempt_enable(); 138 preempt_enable();
312 return idx; 139 return idx;
313} 140}
@@ -322,8 +149,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
322void __srcu_read_unlock(struct srcu_struct *sp, int idx) 149void __srcu_read_unlock(struct srcu_struct *sp, int idx)
323{ 150{
324 preempt_disable(); 151 preempt_disable();
325 smp_mb(); /* C */ /* Avoid leaking the critical section. */ 152 srcu_barrier(); /* ensure compiler won't misorder critical section. */
326 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; 153 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
327 preempt_enable(); 154 preempt_enable();
328} 155}
329EXPORT_SYMBOL_GPL(__srcu_read_unlock); 156EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -336,119 +163,100 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
336 * we repeatedly block for 1-millisecond time periods. This approach 163 * we repeatedly block for 1-millisecond time periods. This approach
337 * has done well in testing, so there is no need for a config parameter. 164 * has done well in testing, so there is no need for a config parameter.
338 */ 165 */
339#define SRCU_RETRY_CHECK_DELAY 5 166#define SYNCHRONIZE_SRCU_READER_DELAY 10
340#define SYNCHRONIZE_SRCU_TRYCOUNT 2
341#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
342 167
343/* 168/*
344 * @@@ Wait until all pre-existing readers complete. Such readers 169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
345 * will have used the index specified by "idx".
 346 * The caller should ensure that ->completed is not changed while checking,
 347 * and that idx = (->completed & 1) ^ 1.
348 */ 170 */
349static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) 171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
350{ 172{
351 for (;;) { 173 int idx;
352 if (srcu_readers_active_idx_check(sp, idx))
353 return true;
354 if (--trycount <= 0)
355 return false;
356 udelay(SRCU_RETRY_CHECK_DELAY);
357 }
358}
359 174
360/* 175 idx = sp->completed;
361 * Increment the ->completed counter so that future SRCU readers will 176 mutex_lock(&sp->mutex);
362 * use the other rank of the ->c[] and ->seq[] arrays. This allows
363 * us to wait for pre-existing readers in a starvation-free manner.
364 */
365static void srcu_flip(struct srcu_struct *sp)
366{
367 sp->completed++;
368}
369 177
370/* 178 /*
371 * Enqueue an SRCU callback on the specified srcu_struct structure, 179 * Check to see if someone else did the work for us while we were
372 * initiating grace-period processing if it is not already running. 180 * waiting to acquire the lock. We need -two- advances of
373 */ 181 * the counter, not just one. If there was but one, we might have
374void call_srcu(struct srcu_struct *sp, struct rcu_head *head, 182 * shown up -after- our helper's first synchronize_sched(), thus
375 void (*func)(struct rcu_head *head)) 183 * having failed to prevent CPU-reordering races with concurrent
376{ 184 * srcu_read_unlock()s on other CPUs (see comment below). So we
377 unsigned long flags; 185 * either (1) wait for two or (2) supply the second ourselves.
378 186 */
379 head->next = NULL; 187
380 head->func = func; 188 if ((sp->completed - idx) >= 2) {
381 spin_lock_irqsave(&sp->queue_lock, flags); 189 mutex_unlock(&sp->mutex);
382 rcu_batch_queue(&sp->batch_queue, head); 190 return;
383 if (!sp->running) {
384 sp->running = true;
385 schedule_delayed_work(&sp->work, 0);
386 } 191 }
387 spin_unlock_irqrestore(&sp->queue_lock, flags);
388}
389EXPORT_SYMBOL_GPL(call_srcu);
390 192
391struct rcu_synchronize { 193 sync_func(); /* Force memory barrier on all CPUs. */
392 struct rcu_head head;
393 struct completion completion;
394};
395 194
396/* 195 /*
397 * Awaken the corresponding synchronize_srcu() instance now that a 196 * The preceding synchronize_sched() ensures that any CPU that
398 * grace period has elapsed. 197 * sees the new value of sp->completed will also see any preceding
399 */ 198 * changes to data structures made by this CPU. This prevents
400static void wakeme_after_rcu(struct rcu_head *head) 199 * some other CPU from reordering the accesses in its SRCU
401{ 200 * read-side critical section to precede the corresponding
402 struct rcu_synchronize *rcu; 201 * srcu_read_lock() -- ensuring that such references will in
202 * fact be protected.
203 *
204 * So it is now safe to do the flip.
205 */
403 206
404 rcu = container_of(head, struct rcu_synchronize, head); 207 idx = sp->completed & 0x1;
405 complete(&rcu->completion); 208 sp->completed++;
406}
407 209
408static void srcu_advance_batches(struct srcu_struct *sp, int trycount); 210 sync_func(); /* Force memory barrier on all CPUs. */
409static void srcu_reschedule(struct srcu_struct *sp);
410 211
411/* 212 /*
412 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 213 * At this point, because of the preceding synchronize_sched(),
413 */ 214 * all srcu_read_lock() calls using the old counters have completed.
414static void __synchronize_srcu(struct srcu_struct *sp, int trycount) 215 * Their corresponding critical sections might well be still
415{ 216 * executing, but the srcu_read_lock() primitives themselves
416 struct rcu_synchronize rcu; 217 * will have finished executing. We initially give readers
417 struct rcu_head *head = &rcu.head; 218 * an arbitrarily chosen 10 microseconds to get out of their
418 bool done = false; 219 * SRCU read-side critical sections, then loop waiting 1/HZ
419 220 * seconds per iteration. The 10-microsecond value has done
420 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && 221 * very well in testing.
421 !lock_is_held(&rcu_bh_lock_map) && 222 */
422 !lock_is_held(&rcu_lock_map) && 223
423 !lock_is_held(&rcu_sched_lock_map), 224 if (srcu_readers_active_idx(sp, idx))
424 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 225 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
425 226 while (srcu_readers_active_idx(sp, idx))
426 init_completion(&rcu.completion); 227 schedule_timeout_interruptible(1);
427 228
428 head->next = NULL; 229 sync_func(); /* Force memory barrier on all CPUs. */
429 head->func = wakeme_after_rcu;
430 spin_lock_irq(&sp->queue_lock);
431 if (!sp->running) {
432 /* steal the processing owner */
433 sp->running = true;
434 rcu_batch_queue(&sp->batch_check0, head);
435 spin_unlock_irq(&sp->queue_lock);
436
437 srcu_advance_batches(sp, trycount);
438 if (!rcu_batch_empty(&sp->batch_done)) {
439 BUG_ON(sp->batch_done.head != head);
440 rcu_batch_dequeue(&sp->batch_done);
441 done = true;
442 }
443 /* give the processing owner to work_struct */
444 srcu_reschedule(sp);
445 } else {
446 rcu_batch_queue(&sp->batch_queue, head);
447 spin_unlock_irq(&sp->queue_lock);
448 }
449 230
450 if (!done) 231 /*
451 wait_for_completion(&rcu.completion); 232 * The preceding synchronize_sched() forces all srcu_read_unlock()
233 * primitives that were executing concurrently with the preceding
234 * for_each_possible_cpu() loop to have completed by this point.
235 * More importantly, it also forces the corresponding SRCU read-side
236 * critical sections to have also completed, and the corresponding
237 * references to SRCU-protected data items to be dropped.
238 *
239 * Note:
240 *
241 * Despite what you might think at first glance, the
242 * preceding synchronize_sched() -must- be within the
243 * critical section ended by the following mutex_unlock().
244 * Otherwise, a task taking the early exit can race
245 * with a srcu_read_unlock(), which might have executed
246 * just before the preceding srcu_readers_active() check,
247 * and whose CPU might have reordered the srcu_read_unlock()
248 * with the preceding critical section. In this case, there
249 * is nothing preventing the synchronize_sched() task that is
250 * taking the early exit from freeing a data structure that
251 * is still being referenced (out of order) by the task
252 * doing the srcu_read_unlock().
253 *
254 * Alternatively, the comparison with "2" on the early exit
255 * could be changed to "3", but this increases synchronize_srcu()
256 * latency for bulk loads. So the current code is preferred.
257 */
258
259 mutex_unlock(&sp->mutex);
452} 260}
453 261
454/** 262/**
@@ -467,190 +275,41 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
467 */ 275 */
468void synchronize_srcu(struct srcu_struct *sp) 276void synchronize_srcu(struct srcu_struct *sp)
469{ 277{
470 __synchronize_srcu(sp, rcu_expedited 278 __synchronize_srcu(sp, synchronize_sched);
471 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
472 : SYNCHRONIZE_SRCU_TRYCOUNT);
473} 279}
474EXPORT_SYMBOL_GPL(synchronize_srcu); 280EXPORT_SYMBOL_GPL(synchronize_srcu);
475 281
476/** 282/**
477 * synchronize_srcu_expedited - Brute-force SRCU grace period 283 * synchronize_srcu_expedited - like synchronize_srcu, but less patient
478 * @sp: srcu_struct with which to synchronize. 284 * @sp: srcu_struct with which to synchronize.
479 * 285 *
480 * Wait for an SRCU grace period to elapse, but be more aggressive about 286 * Flip the completed counter, and wait for the old count to drain to zero.
481 * spinning rather than blocking when waiting. 287 * As with classic RCU, the updater must use some separate means of
288 * synchronizing concurrent updates. Can block; must be called from
289 * process context.
482 * 290 *
483 * Note that it is illegal to call this function while holding any lock 291 * Note that it is illegal to call synchronize_srcu_expedited()
484 * that is acquired by a CPU-hotplug notifier. It is also illegal to call 292 * from the corresponding SRCU read-side critical section; doing so
485 * synchronize_srcu_expedited() from the corresponding SRCU read-side 293 * will result in deadlock. However, it is perfectly legal to call
486 * critical section; doing so will result in deadlock. However, it is 294 * synchronize_srcu_expedited() on one srcu_struct from some other
487 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 295 * srcu_struct's read-side critical section.
488 * from some other srcu_struct's read-side critical section, as long as
489 * the resulting graph of srcu_structs is acyclic.
490 */ 296 */
491void synchronize_srcu_expedited(struct srcu_struct *sp) 297void synchronize_srcu_expedited(struct srcu_struct *sp)
492{ 298{
493 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); 299 __synchronize_srcu(sp, synchronize_sched_expedited);
494} 300}
495EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); 301EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
496 302
497/** 303/**
498 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
499 */
500void srcu_barrier(struct srcu_struct *sp)
501{
502 synchronize_srcu(sp);
503}
504EXPORT_SYMBOL_GPL(srcu_barrier);
505
506/**
507 * srcu_batches_completed - return batches completed. 304 * srcu_batches_completed - return batches completed.
508 * @sp: srcu_struct on which to report batch completion. 305 * @sp: srcu_struct on which to report batch completion.
509 * 306 *
510 * Report the number of batches, correlated with, but not necessarily 307 * Report the number of batches, correlated with, but not necessarily
511 * precisely the same as, the number of grace periods that have elapsed. 308 * precisely the same as, the number of grace periods that have elapsed.
512 */ 309 */
310
513long srcu_batches_completed(struct srcu_struct *sp) 311long srcu_batches_completed(struct srcu_struct *sp)
514{ 312{
515 return sp->completed; 313 return sp->completed;
516} 314}
517EXPORT_SYMBOL_GPL(srcu_batches_completed); 315EXPORT_SYMBOL_GPL(srcu_batches_completed);
518
519#define SRCU_CALLBACK_BATCH 10
520#define SRCU_INTERVAL 1
521
522/*
523 * Move any new SRCU callbacks to the first stage of the SRCU grace
524 * period pipeline.
525 */
526static void srcu_collect_new(struct srcu_struct *sp)
527{
528 if (!rcu_batch_empty(&sp->batch_queue)) {
529 spin_lock_irq(&sp->queue_lock);
530 rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
531 spin_unlock_irq(&sp->queue_lock);
532 }
533}
534
535/*
536 * Core SRCU state machine. Advance callbacks from ->batch_check0 to
537 * ->batch_check1 and then to ->batch_done as readers drain.
538 */
539static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
540{
541 int idx = 1 ^ (sp->completed & 1);
542
543 /*
544 * Because readers might be delayed for an extended period after
545 * fetching ->completed for their index, at any point in time there
546 * might well be readers using both idx=0 and idx=1. We therefore
547 * need to wait for readers to clear from both index values before
548 * invoking a callback.
549 */
550
551 if (rcu_batch_empty(&sp->batch_check0) &&
552 rcu_batch_empty(&sp->batch_check1))
553 return; /* no callbacks need to be advanced */
554
555 if (!try_check_zero(sp, idx, trycount))
556 return; /* failed to advance, will try after SRCU_INTERVAL */
557
558 /*
 559 * The callbacks in ->batch_check1 already did their first zero
 560 * check and flip back when they were enqueued on
561 * ->batch_check0 in a previous invocation of srcu_advance_batches().
562 * (Presumably try_check_zero() returned false during that
563 * invocation, leaving the callbacks stranded on ->batch_check1.)
564 * They are therefore ready to invoke, so move them to ->batch_done.
565 */
566 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
567
568 if (rcu_batch_empty(&sp->batch_check0))
569 return; /* no callbacks need to be advanced */
570 srcu_flip(sp);
571
572 /*
573 * The callbacks in ->batch_check0 just finished their
 574 * first zero check and flip, so move them to ->batch_check1
575 * for future checking on the other idx.
576 */
577 rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
578
579 /*
580 * SRCU read-side critical sections are normally short, so check
581 * at least twice in quick succession after a flip.
582 */
583 trycount = trycount < 2 ? 2 : trycount;
584 if (!try_check_zero(sp, idx^1, trycount))
585 return; /* failed to advance, will try after SRCU_INTERVAL */
586
587 /*
588 * The callbacks in ->batch_check1 have now waited for all
589 * pre-existing readers using both idx values. They are therefore
590 * ready to invoke, so move them to ->batch_done.
591 */
592 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
593}
594
595/*
596 * Invoke a limited number of SRCU callbacks that have passed through
597 * their grace period. If there are more to do, SRCU will reschedule
598 * the workqueue.
599 */
600static void srcu_invoke_callbacks(struct srcu_struct *sp)
601{
602 int i;
603 struct rcu_head *head;
604
605 for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
606 head = rcu_batch_dequeue(&sp->batch_done);
607 if (!head)
608 break;
609 local_bh_disable();
610 head->func(head);
611 local_bh_enable();
612 }
613}
614
615/*
616 * Finished one round of SRCU grace period. Start another if there are
617 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
618 */
619static void srcu_reschedule(struct srcu_struct *sp)
620{
621 bool pending = true;
622
623 if (rcu_batch_empty(&sp->batch_done) &&
624 rcu_batch_empty(&sp->batch_check1) &&
625 rcu_batch_empty(&sp->batch_check0) &&
626 rcu_batch_empty(&sp->batch_queue)) {
627 spin_lock_irq(&sp->queue_lock);
628 if (rcu_batch_empty(&sp->batch_done) &&
629 rcu_batch_empty(&sp->batch_check1) &&
630 rcu_batch_empty(&sp->batch_check0) &&
631 rcu_batch_empty(&sp->batch_queue)) {
632 sp->running = false;
633 pending = false;
634 }
635 spin_unlock_irq(&sp->queue_lock);
636 }
637
638 if (pending)
639 schedule_delayed_work(&sp->work, SRCU_INTERVAL);
640}
641
642/*
643 * This is the work-queue function that handles SRCU grace periods.
644 */
645void process_srcu(struct work_struct *work)
646{
647 struct srcu_struct *sp;
648
649 sp = container_of(work, struct srcu_struct, work.work);
650
651 srcu_collect_new(sp);
652 srcu_advance_batches(sp, 1);
653 srcu_invoke_callbacks(sp);
654 srcu_reschedule(sp);
655}
656EXPORT_SYMBOL_GPL(process_srcu);
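
Both the removed call_srcu()-based implementation and the restored mutex-based one serve the same read-side primitives, __srcu_read_lock() and __srcu_read_unlock(), shown earlier in this file. A minimal reader/updater pairing as a caller would write it (not part of this patch; demo_* names are made up, and demo_srcu is assumed to have been initialized with init_srcu_struct()):

/*
 * Sketch only: basic SRCU usage. Readers are cheap and may sleep;
 * the updater waits for all pre-existing readers before freeing.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct demo_data {
    int value;
};

static struct srcu_struct demo_srcu;
static struct demo_data __rcu *demo_ptr;

static int demo_read(void)
{
    struct demo_data *p;
    int idx, val = -1;

    idx = srcu_read_lock(&demo_srcu);    /* returns the index to hand back */
    p = srcu_dereference(demo_ptr, &demo_srcu);
    if (p)
        val = p->value;
    srcu_read_unlock(&demo_srcu, idx);
    return val;
}

static void demo_update(struct demo_data *newp)
{
    struct demo_data *old = rcu_dereference_protected(demo_ptr, 1);

    rcu_assign_pointer(demo_ptr, newp);
    synchronize_srcu(&demo_srcu);        /* wait for all pre-existing readers */
    kfree(old);
}
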
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 00fe55cc5a8..d20c6983aad 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -7,7 +7,7 @@
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/export.h> 10#include <linux/module.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
13 13
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2f194e96571..ba5070ce576 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -12,7 +12,7 @@
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/export.h> 15#include <linux/module.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
@@ -41,7 +41,6 @@ struct cpu_stopper {
41}; 41};
42 42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44static bool stop_machine_initialized = false;
45 44
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
47{ 46{
@@ -387,8 +386,6 @@ static int __init cpu_stop_init(void)
387 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 386 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
388 register_cpu_notifier(&cpu_stop_cpu_notifier); 387 register_cpu_notifier(&cpu_stop_cpu_notifier);
389 388
390 stop_machine_initialized = true;
391
392 return 0; 389 return 0;
393} 390}
394early_initcall(cpu_stop_init); 391early_initcall(cpu_stop_init);
@@ -488,25 +485,6 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
488 .num_threads = num_online_cpus(), 485 .num_threads = num_online_cpus(),
489 .active_cpus = cpus }; 486 .active_cpus = cpus };
490 487
491 if (!stop_machine_initialized) {
492 /*
493 * Handle the case where stop_machine() is called
494 * early in boot before stop_machine() has been
495 * initialized.
496 */
497 unsigned long flags;
498 int ret;
499
500 WARN_ON_ONCE(smdata.num_threads != 1);
501
502 local_irq_save(flags);
503 hard_irq_disable();
504 ret = (*fn)(data);
505 local_irq_restore(flags);
506
507 return ret;
508 }
509
510 /* Set the initial state and stop all online cpus. */ 488 /* Set the initial state and stop all online cpus. */
511 set_state(&smdata, STOPMACHINE_PREPARE); 489 set_state(&smdata, STOPMACHINE_PREPARE);
512 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 490 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
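
The hunk above strips the early-boot fallback that let __stop_machine() run the callback directly before cpu_stop_init() has registered the stopper threads. For context, a typical caller looks roughly like this (not part of this patch; demo_* names are made up):

/*
 * Sketch only: a stop_machine() user that needs the whole system quiescent
 * while it mutates shared state.
 */
#include <linux/stop_machine.h>

static int demo_patch_step(void *data)
{
    /*
     * Runs while every other online CPU spins with interrupts disabled,
     * so nothing can observe the intermediate state.
     */
    return 0;
}

static int demo_apply_patch(void)
{
    /* NULL cpumask: the callback runs on one CPU, the rest just spin */
    return stop_machine(demo_patch_step, NULL, NULL);
}
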
diff --git a/kernel/sys.c b/kernel/sys.c
index 265b3769042..1dbbe695a5e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/export.h> 7#include <linux/module.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
@@ -12,7 +12,6 @@
12#include <linux/prctl.h> 12#include <linux/prctl.h>
13#include <linux/highuid.h> 13#include <linux/highuid.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/kmod.h>
16#include <linux/perf_event.h> 15#include <linux/perf_event.h>
17#include <linux/resource.h> 16#include <linux/resource.h>
18#include <linux/kernel.h> 17#include <linux/kernel.h>
@@ -36,8 +35,6 @@
36#include <linux/personality.h> 35#include <linux/personality.h>
37#include <linux/ptrace.h> 36#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 37#include <linux/fs_struct.h>
39#include <linux/file.h>
40#include <linux/mount.h>
41#include <linux/gfp.h> 38#include <linux/gfp.h>
42#include <linux/syscore_ops.h> 39#include <linux/syscore_ops.h>
43#include <linux/version.h> 40#include <linux/version.h>
@@ -95,8 +92,10 @@
95int overflowuid = DEFAULT_OVERFLOWUID; 92int overflowuid = DEFAULT_OVERFLOWUID;
96int overflowgid = DEFAULT_OVERFLOWGID; 93int overflowgid = DEFAULT_OVERFLOWGID;
97 94
95#ifdef CONFIG_UID16
98EXPORT_SYMBOL(overflowuid); 96EXPORT_SYMBOL(overflowuid);
99EXPORT_SYMBOL(overflowgid); 97EXPORT_SYMBOL(overflowgid);
98#endif
100 99
101/* 100/*
102 * the same as above, but for filesystems which can only store a 16-bit 101 * the same as above, but for filesystems which can only store a 16-bit
@@ -133,10 +132,11 @@ static bool set_one_prio_perm(struct task_struct *p)
133{ 132{
134 const struct cred *cred = current_cred(), *pcred = __task_cred(p); 133 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
135 134
136 if (uid_eq(pcred->uid, cred->euid) || 135 if (pcred->user->user_ns == cred->user->user_ns &&
137 uid_eq(pcred->euid, cred->euid)) 136 (pcred->uid == cred->euid ||
137 pcred->euid == cred->euid))
138 return true; 138 return true;
139 if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) 139 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
140 return true; 140 return true;
141 return false; 141 return false;
142} 142}
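
This hunk trades the namespace-aware kuid_t comparisons (uid_eq()) back for raw uid_t '==' checks plus an explicit user_ns comparison. A minimal sketch of the kuid_t idiom on the left-hand side, with same_effective_owner() as an illustrative helper:

#include <linux/types.h>
#include <linux/cred.h>
#include <linux/uidgid.h>	/* kuid_t, uid_eq() */

/* Illustrative helper: namespace-aware comparison of effective uids.
 * A kuid_t already identifies the owning user namespace, so a single
 * uid_eq() replaces the "same user_ns && same uid_t" pair of checks. */
static bool same_effective_owner(const struct cred *a, const struct cred *b)
{
	return uid_eq(a->euid, b->euid);
}
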
@@ -176,7 +176,6 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
176 const struct cred *cred = current_cred(); 176 const struct cred *cred = current_cred();
177 int error = -EINVAL; 177 int error = -EINVAL;
178 struct pid *pgrp; 178 struct pid *pgrp;
179 kuid_t uid;
180 179
181 if (which > PRIO_USER || which < PRIO_PROCESS) 180 if (which > PRIO_USER || which < PRIO_PROCESS)
182 goto out; 181 goto out;
@@ -209,19 +208,18 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
209 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 208 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
210 break; 209 break;
211 case PRIO_USER: 210 case PRIO_USER:
212 uid = make_kuid(cred->user_ns, who); 211 user = (struct user_struct *) cred->user;
213 user = cred->user;
214 if (!who) 212 if (!who)
215 uid = cred->uid; 213 who = cred->uid;
216 else if (!uid_eq(uid, cred->uid) && 214 else if ((who != cred->uid) &&
217 !(user = find_user(uid))) 215 !(user = find_user(who)))
218 goto out_unlock; /* No processes for this user */ 216 goto out_unlock; /* No processes for this user */
219 217
220 do_each_thread(g, p) { 218 do_each_thread(g, p) {
221 if (uid_eq(task_uid(p), uid)) 219 if (__task_cred(p)->uid == who)
222 error = set_one_prio(p, niceval, error); 220 error = set_one_prio(p, niceval, error);
223 } while_each_thread(g, p); 221 } while_each_thread(g, p);
224 if (!uid_eq(uid, cred->uid)) 222 if (who != cred->uid)
225 free_uid(user); /* For find_user() */ 223 free_uid(user); /* For find_user() */
226 break; 224 break;
227 } 225 }
@@ -245,7 +243,6 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
245 const struct cred *cred = current_cred(); 243 const struct cred *cred = current_cred();
246 long niceval, retval = -ESRCH; 244 long niceval, retval = -ESRCH;
247 struct pid *pgrp; 245 struct pid *pgrp;
248 kuid_t uid;
249 246
250 if (which > PRIO_USER || which < PRIO_PROCESS) 247 if (which > PRIO_USER || which < PRIO_PROCESS)
251 return -EINVAL; 248 return -EINVAL;
@@ -276,22 +273,21 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
276 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 273 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
277 break; 274 break;
278 case PRIO_USER: 275 case PRIO_USER:
279 uid = make_kuid(cred->user_ns, who); 276 user = (struct user_struct *) cred->user;
280 user = cred->user;
281 if (!who) 277 if (!who)
282 uid = cred->uid; 278 who = cred->uid;
283 else if (!uid_eq(uid, cred->uid) && 279 else if ((who != cred->uid) &&
284 !(user = find_user(uid))) 280 !(user = find_user(who)))
285 goto out_unlock; /* No processes for this user */ 281 goto out_unlock; /* No processes for this user */
286 282
287 do_each_thread(g, p) { 283 do_each_thread(g, p) {
288 if (uid_eq(task_uid(p), uid)) { 284 if (__task_cred(p)->uid == who) {
289 niceval = 20 - task_nice(p); 285 niceval = 20 - task_nice(p);
290 if (niceval > retval) 286 if (niceval > retval)
291 retval = niceval; 287 retval = niceval;
292 } 288 }
293 } while_each_thread(g, p); 289 } while_each_thread(g, p);
294 if (!uid_eq(uid, cred->uid)) 290 if (who != cred->uid)
295 free_uid(user); /* for find_user() */ 291 free_uid(user); /* for find_user() */
296 break; 292 break;
297 } 293 }
@@ -368,7 +364,6 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
368void kernel_restart(char *cmd) 364void kernel_restart(char *cmd)
369{ 365{
370 kernel_restart_prepare(cmd); 366 kernel_restart_prepare(cmd);
371 disable_nonboot_cpus();
372 if (!cmd) 367 if (!cmd)
373 printk(KERN_EMERG "Restarting system.\n"); 368 printk(KERN_EMERG "Restarting system.\n");
374 else 369 else
@@ -448,15 +443,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
448 magic2 != LINUX_REBOOT_MAGIC2C)) 443 magic2 != LINUX_REBOOT_MAGIC2C))
449 return -EINVAL; 444 return -EINVAL;
450 445
451 /*
452 * If pid namespaces are enabled and the current task is in a child
453 * pid_namespace, the command is handled by reboot_pid_ns() which will
454 * call do_exit().
455 */
456 ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
457 if (ret)
458 return ret;
459
460 /* Instead of trying to make the power_off code look like 446 /* Instead of trying to make the power_off code look like
461 * halt when pm_power_off is not set do it the easy way. 447 * halt when pm_power_off is not set do it the easy way.
462 */ 448 */
@@ -557,19 +543,9 @@ void ctrl_alt_del(void)
557 */ 543 */
558SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) 544SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
559{ 545{
560 struct user_namespace *ns = current_user_ns();
561 const struct cred *old; 546 const struct cred *old;
562 struct cred *new; 547 struct cred *new;
563 int retval; 548 int retval;
564 kgid_t krgid, kegid;
565
566 krgid = make_kgid(ns, rgid);
567 kegid = make_kgid(ns, egid);
568
569 if ((rgid != (gid_t) -1) && !gid_valid(krgid))
570 return -EINVAL;
571 if ((egid != (gid_t) -1) && !gid_valid(kegid))
572 return -EINVAL;
573 549
574 new = prepare_creds(); 550 new = prepare_creds();
575 if (!new) 551 if (!new)
@@ -578,25 +554,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
578 554
579 retval = -EPERM; 555 retval = -EPERM;
580 if (rgid != (gid_t) -1) { 556 if (rgid != (gid_t) -1) {
581 if (gid_eq(old->gid, krgid) || 557 if (old->gid == rgid ||
582 gid_eq(old->egid, krgid) || 558 old->egid == rgid ||
583 nsown_capable(CAP_SETGID)) 559 nsown_capable(CAP_SETGID))
584 new->gid = krgid; 560 new->gid = rgid;
585 else 561 else
586 goto error; 562 goto error;
587 } 563 }
588 if (egid != (gid_t) -1) { 564 if (egid != (gid_t) -1) {
589 if (gid_eq(old->gid, kegid) || 565 if (old->gid == egid ||
590 gid_eq(old->egid, kegid) || 566 old->egid == egid ||
591 gid_eq(old->sgid, kegid) || 567 old->sgid == egid ||
592 nsown_capable(CAP_SETGID)) 568 nsown_capable(CAP_SETGID))
593 new->egid = kegid; 569 new->egid = egid;
594 else 570 else
595 goto error; 571 goto error;
596 } 572 }
597 573
598 if (rgid != (gid_t) -1 || 574 if (rgid != (gid_t) -1 ||
599 (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) 575 (egid != (gid_t) -1 && egid != old->gid))
600 new->sgid = new->egid; 576 new->sgid = new->egid;
601 new->fsgid = new->egid; 577 new->fsgid = new->egid;
602 578
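
The lines removed from setregid() are the up-front translation and validation of the userspace gid values. Roughly, the pattern being dropped looks like this (check_gid_arg() is an illustrative helper, not a kernel function):

#include <linux/errno.h>
#include <linux/cred.h>		/* current_user_ns() */
#include <linux/uidgid.h>	/* make_kgid(), gid_valid() */

/* Map the userspace gid into the caller's namespace and reject ids that
 * do not map, before any credentials are modified. */
static int check_gid_arg(gid_t gid, kgid_t *out)
{
	kgid_t kgid = make_kgid(current_user_ns(), gid);

	if (gid != (gid_t) -1 && !gid_valid(kgid))
		return -EINVAL;
	*out = kgid;
	return 0;
}
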
@@ -614,15 +590,9 @@ error:
614 */ 590 */
615SYSCALL_DEFINE1(setgid, gid_t, gid) 591SYSCALL_DEFINE1(setgid, gid_t, gid)
616{ 592{
617 struct user_namespace *ns = current_user_ns();
618 const struct cred *old; 593 const struct cred *old;
619 struct cred *new; 594 struct cred *new;
620 int retval; 595 int retval;
621 kgid_t kgid;
622
623 kgid = make_kgid(ns, gid);
624 if (!gid_valid(kgid))
625 return -EINVAL;
626 596
627 new = prepare_creds(); 597 new = prepare_creds();
628 if (!new) 598 if (!new)
@@ -631,9 +601,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
631 601
632 retval = -EPERM; 602 retval = -EPERM;
633 if (nsown_capable(CAP_SETGID)) 603 if (nsown_capable(CAP_SETGID))
634 new->gid = new->egid = new->sgid = new->fsgid = kgid; 604 new->gid = new->egid = new->sgid = new->fsgid = gid;
635 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) 605 else if (gid == old->gid || gid == old->sgid)
636 new->egid = new->fsgid = kgid; 606 new->egid = new->fsgid = gid;
637 else 607 else
638 goto error; 608 goto error;
639 609
@@ -651,7 +621,7 @@ static int set_user(struct cred *new)
651{ 621{
652 struct user_struct *new_user; 622 struct user_struct *new_user;
653 623
654 new_user = alloc_uid(new->uid); 624 new_user = alloc_uid(current_user_ns(), new->uid);
655 if (!new_user) 625 if (!new_user)
656 return -EAGAIN; 626 return -EAGAIN;
657 627
@@ -690,19 +660,9 @@ static int set_user(struct cred *new)
690 */ 660 */
691SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 661SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
692{ 662{
693 struct user_namespace *ns = current_user_ns();
694 const struct cred *old; 663 const struct cred *old;
695 struct cred *new; 664 struct cred *new;
696 int retval; 665 int retval;
697 kuid_t kruid, keuid;
698
699 kruid = make_kuid(ns, ruid);
700 keuid = make_kuid(ns, euid);
701
702 if ((ruid != (uid_t) -1) && !uid_valid(kruid))
703 return -EINVAL;
704 if ((euid != (uid_t) -1) && !uid_valid(keuid))
705 return -EINVAL;
706 666
707 new = prepare_creds(); 667 new = prepare_creds();
708 if (!new) 668 if (!new)
@@ -711,29 +671,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
711 671
712 retval = -EPERM; 672 retval = -EPERM;
713 if (ruid != (uid_t) -1) { 673 if (ruid != (uid_t) -1) {
714 new->uid = kruid; 674 new->uid = ruid;
715 if (!uid_eq(old->uid, kruid) && 675 if (old->uid != ruid &&
716 !uid_eq(old->euid, kruid) && 676 old->euid != ruid &&
717 !nsown_capable(CAP_SETUID)) 677 !nsown_capable(CAP_SETUID))
718 goto error; 678 goto error;
719 } 679 }
720 680
721 if (euid != (uid_t) -1) { 681 if (euid != (uid_t) -1) {
722 new->euid = keuid; 682 new->euid = euid;
723 if (!uid_eq(old->uid, keuid) && 683 if (old->uid != euid &&
724 !uid_eq(old->euid, keuid) && 684 old->euid != euid &&
725 !uid_eq(old->suid, keuid) && 685 old->suid != euid &&
726 !nsown_capable(CAP_SETUID)) 686 !nsown_capable(CAP_SETUID))
727 goto error; 687 goto error;
728 } 688 }
729 689
730 if (!uid_eq(new->uid, old->uid)) { 690 if (new->uid != old->uid) {
731 retval = set_user(new); 691 retval = set_user(new);
732 if (retval < 0) 692 if (retval < 0)
733 goto error; 693 goto error;
734 } 694 }
735 if (ruid != (uid_t) -1 || 695 if (ruid != (uid_t) -1 ||
736 (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) 696 (euid != (uid_t) -1 && euid != old->uid))
737 new->suid = new->euid; 697 new->suid = new->euid;
738 new->fsuid = new->euid; 698 new->fsuid = new->euid;
739 699
@@ -761,15 +721,9 @@ error:
761 */ 721 */
762SYSCALL_DEFINE1(setuid, uid_t, uid) 722SYSCALL_DEFINE1(setuid, uid_t, uid)
763{ 723{
764 struct user_namespace *ns = current_user_ns();
765 const struct cred *old; 724 const struct cred *old;
766 struct cred *new; 725 struct cred *new;
767 int retval; 726 int retval;
768 kuid_t kuid;
769
770 kuid = make_kuid(ns, uid);
771 if (!uid_valid(kuid))
772 return -EINVAL;
773 727
774 new = prepare_creds(); 728 new = prepare_creds();
775 if (!new) 729 if (!new)
@@ -778,17 +732,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
778 732
779 retval = -EPERM; 733 retval = -EPERM;
780 if (nsown_capable(CAP_SETUID)) { 734 if (nsown_capable(CAP_SETUID)) {
781 new->suid = new->uid = kuid; 735 new->suid = new->uid = uid;
782 if (!uid_eq(kuid, old->uid)) { 736 if (uid != old->uid) {
783 retval = set_user(new); 737 retval = set_user(new);
784 if (retval < 0) 738 if (retval < 0)
785 goto error; 739 goto error;
786 } 740 }
787 } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { 741 } else if (uid != old->uid && uid != new->suid) {
788 goto error; 742 goto error;
789 } 743 }
790 744
791 new->fsuid = new->euid = kuid; 745 new->fsuid = new->euid = uid;
792 746
793 retval = security_task_fix_setuid(new, old, LSM_SETID_ID); 747 retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
794 if (retval < 0) 748 if (retval < 0)
@@ -808,24 +762,9 @@ error:
808 */ 762 */
809SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) 763SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
810{ 764{
811 struct user_namespace *ns = current_user_ns();
812 const struct cred *old; 765 const struct cred *old;
813 struct cred *new; 766 struct cred *new;
814 int retval; 767 int retval;
815 kuid_t kruid, keuid, ksuid;
816
817 kruid = make_kuid(ns, ruid);
818 keuid = make_kuid(ns, euid);
819 ksuid = make_kuid(ns, suid);
820
821 if ((ruid != (uid_t) -1) && !uid_valid(kruid))
822 return -EINVAL;
823
824 if ((euid != (uid_t) -1) && !uid_valid(keuid))
825 return -EINVAL;
826
827 if ((suid != (uid_t) -1) && !uid_valid(ksuid))
828 return -EINVAL;
829 768
830 new = prepare_creds(); 769 new = prepare_creds();
831 if (!new) 770 if (!new)
@@ -835,29 +774,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
835 774
836 retval = -EPERM; 775 retval = -EPERM;
837 if (!nsown_capable(CAP_SETUID)) { 776 if (!nsown_capable(CAP_SETUID)) {
838 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && 777 if (ruid != (uid_t) -1 && ruid != old->uid &&
839 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) 778 ruid != old->euid && ruid != old->suid)
840 goto error; 779 goto error;
841 if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && 780 if (euid != (uid_t) -1 && euid != old->uid &&
842 !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) 781 euid != old->euid && euid != old->suid)
843 goto error; 782 goto error;
844 if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && 783 if (suid != (uid_t) -1 && suid != old->uid &&
845 !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) 784 suid != old->euid && suid != old->suid)
846 goto error; 785 goto error;
847 } 786 }
848 787
849 if (ruid != (uid_t) -1) { 788 if (ruid != (uid_t) -1) {
850 new->uid = kruid; 789 new->uid = ruid;
851 if (!uid_eq(kruid, old->uid)) { 790 if (ruid != old->uid) {
852 retval = set_user(new); 791 retval = set_user(new);
853 if (retval < 0) 792 if (retval < 0)
854 goto error; 793 goto error;
855 } 794 }
856 } 795 }
857 if (euid != (uid_t) -1) 796 if (euid != (uid_t) -1)
858 new->euid = keuid; 797 new->euid = euid;
859 if (suid != (uid_t) -1) 798 if (suid != (uid_t) -1)
860 new->suid = ksuid; 799 new->suid = suid;
861 new->fsuid = new->euid; 800 new->fsuid = new->euid;
862 801
863 retval = security_task_fix_setuid(new, old, LSM_SETID_RES); 802 retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
@@ -871,19 +810,14 @@ error:
871 return retval; 810 return retval;
872} 811}
873 812
874SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) 813SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid)
875{ 814{
876 const struct cred *cred = current_cred(); 815 const struct cred *cred = current_cred();
877 int retval; 816 int retval;
878 uid_t ruid, euid, suid;
879
880 ruid = from_kuid_munged(cred->user_ns, cred->uid);
881 euid = from_kuid_munged(cred->user_ns, cred->euid);
882 suid = from_kuid_munged(cred->user_ns, cred->suid);
883 817
884 if (!(retval = put_user(ruid, ruidp)) && 818 if (!(retval = put_user(cred->uid, ruid)) &&
885 !(retval = put_user(euid, euidp))) 819 !(retval = put_user(cred->euid, euid)))
886 retval = put_user(suid, suidp); 820 retval = put_user(cred->suid, suid);
887 821
888 return retval; 822 return retval;
889} 823}
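
getresuid() loses the from_kuid_munged() step and goes back to copying the credential fields out directly. The conversion idiom on the left-hand side looks roughly like this (put_current_uid() is an illustrative helper):

#include <linux/cred.h>
#include <linux/uidgid.h>	/* from_kuid_munged() */
#include <linux/uaccess.h>	/* put_user() */

/* Convert a kuid_t back into the caller's user namespace before handing
 * it to userspace; ids with no mapping degrade to the overflow uid
 * instead of leaking a raw kernel-internal value. */
static int put_current_uid(uid_t __user *p)
{
	const struct cred *cred = current_cred();

	return put_user(from_kuid_munged(cred->user_ns, cred->uid), p);
}
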
@@ -893,22 +827,9 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
893 */ 827 */
894SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) 828SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
895{ 829{
896 struct user_namespace *ns = current_user_ns();
897 const struct cred *old; 830 const struct cred *old;
898 struct cred *new; 831 struct cred *new;
899 int retval; 832 int retval;
900 kgid_t krgid, kegid, ksgid;
901
902 krgid = make_kgid(ns, rgid);
903 kegid = make_kgid(ns, egid);
904 ksgid = make_kgid(ns, sgid);
905
906 if ((rgid != (gid_t) -1) && !gid_valid(krgid))
907 return -EINVAL;
908 if ((egid != (gid_t) -1) && !gid_valid(kegid))
909 return -EINVAL;
910 if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
911 return -EINVAL;
912 833
913 new = prepare_creds(); 834 new = prepare_creds();
914 if (!new) 835 if (!new)
@@ -917,23 +838,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
917 838
918 retval = -EPERM; 839 retval = -EPERM;
919 if (!nsown_capable(CAP_SETGID)) { 840 if (!nsown_capable(CAP_SETGID)) {
920 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && 841 if (rgid != (gid_t) -1 && rgid != old->gid &&
921 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) 842 rgid != old->egid && rgid != old->sgid)
922 goto error; 843 goto error;
923 if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && 844 if (egid != (gid_t) -1 && egid != old->gid &&
924 !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) 845 egid != old->egid && egid != old->sgid)
925 goto error; 846 goto error;
926 if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && 847 if (sgid != (gid_t) -1 && sgid != old->gid &&
927 !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) 848 sgid != old->egid && sgid != old->sgid)
928 goto error; 849 goto error;
929 } 850 }
930 851
931 if (rgid != (gid_t) -1) 852 if (rgid != (gid_t) -1)
932 new->gid = krgid; 853 new->gid = rgid;
933 if (egid != (gid_t) -1) 854 if (egid != (gid_t) -1)
934 new->egid = kegid; 855 new->egid = egid;
935 if (sgid != (gid_t) -1) 856 if (sgid != (gid_t) -1)
936 new->sgid = ksgid; 857 new->sgid = sgid;
937 new->fsgid = new->egid; 858 new->fsgid = new->egid;
938 859
939 return commit_creds(new); 860 return commit_creds(new);
@@ -943,19 +864,14 @@ error:
943 return retval; 864 return retval;
944} 865}
945 866
946SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) 867SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid)
947{ 868{
948 const struct cred *cred = current_cred(); 869 const struct cred *cred = current_cred();
949 int retval; 870 int retval;
950 gid_t rgid, egid, sgid;
951
952 rgid = from_kgid_munged(cred->user_ns, cred->gid);
953 egid = from_kgid_munged(cred->user_ns, cred->egid);
954 sgid = from_kgid_munged(cred->user_ns, cred->sgid);
955 871
956 if (!(retval = put_user(rgid, rgidp)) && 872 if (!(retval = put_user(cred->gid, rgid)) &&
957 !(retval = put_user(egid, egidp))) 873 !(retval = put_user(cred->egid, egid)))
958 retval = put_user(sgid, sgidp); 874 retval = put_user(cred->sgid, sgid);
959 875
960 return retval; 876 return retval;
961} 877}
@@ -972,24 +888,18 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
972 const struct cred *old; 888 const struct cred *old;
973 struct cred *new; 889 struct cred *new;
974 uid_t old_fsuid; 890 uid_t old_fsuid;
975 kuid_t kuid;
976
977 old = current_cred();
978 old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
979
980 kuid = make_kuid(old->user_ns, uid);
981 if (!uid_valid(kuid))
982 return old_fsuid;
983 891
984 new = prepare_creds(); 892 new = prepare_creds();
985 if (!new) 893 if (!new)
986 return old_fsuid; 894 return current_fsuid();
895 old = current_cred();
896 old_fsuid = old->fsuid;
987 897
988 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || 898 if (uid == old->uid || uid == old->euid ||
989 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || 899 uid == old->suid || uid == old->fsuid ||
990 nsown_capable(CAP_SETUID)) { 900 nsown_capable(CAP_SETUID)) {
991 if (!uid_eq(kuid, old->fsuid)) { 901 if (uid != old_fsuid) {
992 new->fsuid = kuid; 902 new->fsuid = uid;
993 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 903 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
994 goto change_okay; 904 goto change_okay;
995 } 905 }
@@ -1011,24 +921,18 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
1011 const struct cred *old; 921 const struct cred *old;
1012 struct cred *new; 922 struct cred *new;
1013 gid_t old_fsgid; 923 gid_t old_fsgid;
1014 kgid_t kgid;
1015
1016 old = current_cred();
1017 old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
1018
1019 kgid = make_kgid(old->user_ns, gid);
1020 if (!gid_valid(kgid))
1021 return old_fsgid;
1022 924
1023 new = prepare_creds(); 925 new = prepare_creds();
1024 if (!new) 926 if (!new)
1025 return old_fsgid; 927 return current_fsgid();
928 old = current_cred();
929 old_fsgid = old->fsgid;
1026 930
1027 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || 931 if (gid == old->gid || gid == old->egid ||
1028 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || 932 gid == old->sgid || gid == old->fsgid ||
1029 nsown_capable(CAP_SETGID)) { 933 nsown_capable(CAP_SETGID)) {
1030 if (!gid_eq(kgid, old->fsgid)) { 934 if (gid != old_fsgid) {
1031 new->fsgid = kgid; 935 new->fsgid = gid;
1032 goto change_okay; 936 goto change_okay;
1033 } 937 }
1034 } 938 }
@@ -1046,7 +950,7 @@ void do_sys_times(struct tms *tms)
1046 cputime_t tgutime, tgstime, cutime, cstime; 950 cputime_t tgutime, tgstime, cutime, cstime;
1047 951
1048 spin_lock_irq(&current->sighand->siglock); 952 spin_lock_irq(&current->sighand->siglock);
1049 thread_group_cputime_adjusted(current, &tgutime, &tgstime); 953 thread_group_times(current, &tgutime, &tgstime);
1050 cutime = current->signal->cutime; 954 cutime = current->signal->cutime;
1051 cstime = current->signal->cstime; 955 cstime = current->signal->cstime;
1052 spin_unlock_irq(&current->sighand->siglock); 956 spin_unlock_irq(&current->sighand->siglock);
@@ -1265,16 +1169,15 @@ DECLARE_RWSEM(uts_sem);
1265 * Work around broken programs that cannot handle "Linux 3.0". 1169 * Work around broken programs that cannot handle "Linux 3.0".
1266 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 1170 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1267 */ 1171 */
1268static int override_release(char __user *release, size_t len) 1172static int override_release(char __user *release, int len)
1269{ 1173{
1270 int ret = 0; 1174 int ret = 0;
1175 char buf[65];
1271 1176
1272 if (current->personality & UNAME26) { 1177 if (current->personality & UNAME26) {
1273 const char *rest = UTS_RELEASE; 1178 char *rest = UTS_RELEASE;
1274 char buf[65] = { 0 };
1275 int ndots = 0; 1179 int ndots = 0;
1276 unsigned v; 1180 unsigned v;
1277 size_t copy;
1278 1181
1279 while (*rest) { 1182 while (*rest) {
1280 if (*rest == '.' && ++ndots >= 3) 1183 if (*rest == '.' && ++ndots >= 3)
@@ -1284,9 +1187,8 @@ static int override_release(char __user *release, size_t len)
1284 rest++; 1187 rest++;
1285 } 1188 }
1286 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; 1189 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
1287 copy = clamp_t(size_t, len, 1, sizeof(buf)); 1190 snprintf(buf, len, "2.6.%u%s", v, rest);
1288 copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); 1191 ret = copy_to_user(release, buf, len);
1289 ret = copy_to_user(release, buf, copy + 1);
1290 } 1192 }
1291 return ret; 1193 return ret;
1292} 1194}
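
The left-hand side of this hunk bounds the copy with clamp_t() and scnprintf() instead of handing the caller-supplied length straight to snprintf() and copy_to_user(). A condensed sketch of that bounded-copy idiom (copy_fake_release() is an illustrative name):

#include <linux/errno.h>
#include <linux/kernel.h>	/* clamp_t(), scnprintf() */
#include <linux/uaccess.h>	/* copy_to_user() */

/* Clamp the caller-supplied length to the local buffer, let scnprintf()
 * report how many characters were actually stored, and copy only that
 * many bytes plus the terminating NUL. */
static int copy_fake_release(char __user *dst, size_t len)
{
	char buf[65] = { 0 };
	size_t copy;

	copy = clamp_t(size_t, len, 1, sizeof(buf));
	copy = scnprintf(buf, copy, "2.6.%u", 40u);
	return copy_to_user(dst, buf, copy + 1) ? -EFAULT : 0;
}
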
@@ -1383,7 +1285,6 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1383 memcpy(u->nodename, tmp, len); 1285 memcpy(u->nodename, tmp, len);
1384 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1286 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1385 errno = 0; 1287 errno = 0;
1386 uts_proc_notify(UTS_PROC_HOSTNAME);
1387 } 1288 }
1388 up_write(&uts_sem); 1289 up_write(&uts_sem);
1389 return errno; 1290 return errno;
@@ -1434,7 +1335,6 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1434 memcpy(u->domainname, tmp, len); 1335 memcpy(u->domainname, tmp, len);
1435 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1336 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1436 errno = 0; 1337 errno = 0;
1437 uts_proc_notify(UTS_PROC_DOMAINNAME);
1438 } 1338 }
1439 up_write(&uts_sem); 1339 up_write(&uts_sem);
1440 return errno; 1340 return errno;
@@ -1586,14 +1486,15 @@ static int check_prlimit_permission(struct task_struct *task)
1586 return 0; 1486 return 0;
1587 1487
1588 tcred = __task_cred(task); 1488 tcred = __task_cred(task);
1589 if (uid_eq(cred->uid, tcred->euid) && 1489 if (cred->user->user_ns == tcred->user->user_ns &&
1590 uid_eq(cred->uid, tcred->suid) && 1490 (cred->uid == tcred->euid &&
1591 uid_eq(cred->uid, tcred->uid) && 1491 cred->uid == tcred->suid &&
1592 gid_eq(cred->gid, tcred->egid) && 1492 cred->uid == tcred->uid &&
1593 gid_eq(cred->gid, tcred->sgid) && 1493 cred->gid == tcred->egid &&
1594 gid_eq(cred->gid, tcred->gid)) 1494 cred->gid == tcred->sgid &&
1495 cred->gid == tcred->gid))
1595 return 0; 1496 return 0;
1596 if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) 1497 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
1597 return 0; 1498 return 0;
1598 1499
1599 return -EPERM; 1500 return -EPERM;
@@ -1701,10 +1602,10 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1701 unsigned long maxrss = 0; 1602 unsigned long maxrss = 0;
1702 1603
1703 memset((char *) r, 0, sizeof *r); 1604 memset((char *) r, 0, sizeof *r);
1704 utime = stime = 0; 1605 utime = stime = cputime_zero;
1705 1606
1706 if (who == RUSAGE_THREAD) { 1607 if (who == RUSAGE_THREAD) {
1707 task_cputime_adjusted(current, &utime, &stime); 1608 task_times(current, &utime, &stime);
1708 accumulate_thread_rusage(p, r); 1609 accumulate_thread_rusage(p, r);
1709 maxrss = p->signal->maxrss; 1610 maxrss = p->signal->maxrss;
1710 goto out; 1611 goto out;
@@ -1730,9 +1631,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1730 break; 1631 break;
1731 1632
1732 case RUSAGE_SELF: 1633 case RUSAGE_SELF:
1733 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1634 thread_group_times(p, &tgutime, &tgstime);
1734 utime += tgutime; 1635 utime = cputime_add(utime, tgutime);
1735 stime += tgstime; 1636 stime = cputime_add(stime, tgstime);
1736 r->ru_nvcsw += p->signal->nvcsw; 1637 r->ru_nvcsw += p->signal->nvcsw;
1737 r->ru_nivcsw += p->signal->nivcsw; 1638 r->ru_nivcsw += p->signal->nivcsw;
1738 r->ru_minflt += p->signal->min_flt; 1639 r->ru_minflt += p->signal->min_flt;
@@ -1788,217 +1689,6 @@ SYSCALL_DEFINE1(umask, int, mask)
1788 return mask; 1689 return mask;
1789} 1690}
1790 1691
1791#ifdef CONFIG_CHECKPOINT_RESTORE
1792static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1793{
1794 struct fd exe;
1795 struct dentry *dentry;
1796 int err;
1797
1798 exe = fdget(fd);
1799 if (!exe.file)
1800 return -EBADF;
1801
1802 dentry = exe.file->f_path.dentry;
1803
1804 /*
1805 * Because the original mm->exe_file points to executable file, make
1806 * sure that this one is executable as well, to avoid breaking an
1807 * overall picture.
1808 */
1809 err = -EACCES;
1810 if (!S_ISREG(dentry->d_inode->i_mode) ||
1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1812 goto exit;
1813
1814 err = inode_permission(dentry->d_inode, MAY_EXEC);
1815 if (err)
1816 goto exit;
1817
1818 down_write(&mm->mmap_sem);
1819
1820 /*
1821 * Forbid mm->exe_file change if old file still mapped.
1822 */
1823 err = -EBUSY;
1824 if (mm->exe_file) {
1825 struct vm_area_struct *vma;
1826
1827 for (vma = mm->mmap; vma; vma = vma->vm_next)
1828 if (vma->vm_file &&
1829 path_equal(&vma->vm_file->f_path,
1830 &mm->exe_file->f_path))
1831 goto exit_unlock;
1832 }
1833
1834 /*
1835 * The symlink can be changed only once, just to disallow arbitrary
1836 * transitions malicious software might bring in. This means one
1837 * could make a snapshot over all processes running and monitor
1838 * /proc/pid/exe changes to notice unusual activity if needed.
1839 */
1840 err = -EPERM;
1841 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1842 goto exit_unlock;
1843
1844 err = 0;
1845 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
1846exit_unlock:
1847 up_write(&mm->mmap_sem);
1848
1849exit:
1850 fdput(exe);
1851 return err;
1852}
1853
1854static int prctl_set_mm(int opt, unsigned long addr,
1855 unsigned long arg4, unsigned long arg5)
1856{
1857 unsigned long rlim = rlimit(RLIMIT_DATA);
1858 struct mm_struct *mm = current->mm;
1859 struct vm_area_struct *vma;
1860 int error;
1861
1862 if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
1863 return -EINVAL;
1864
1865 if (!capable(CAP_SYS_RESOURCE))
1866 return -EPERM;
1867
1868 if (opt == PR_SET_MM_EXE_FILE)
1869 return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1870
1871 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1872 return -EINVAL;
1873
1874 error = -EINVAL;
1875
1876 down_read(&mm->mmap_sem);
1877 vma = find_vma(mm, addr);
1878
1879 switch (opt) {
1880 case PR_SET_MM_START_CODE:
1881 mm->start_code = addr;
1882 break;
1883 case PR_SET_MM_END_CODE:
1884 mm->end_code = addr;
1885 break;
1886 case PR_SET_MM_START_DATA:
1887 mm->start_data = addr;
1888 break;
1889 case PR_SET_MM_END_DATA:
1890 mm->end_data = addr;
1891 break;
1892
1893 case PR_SET_MM_START_BRK:
1894 if (addr <= mm->end_data)
1895 goto out;
1896
1897 if (rlim < RLIM_INFINITY &&
1898 (mm->brk - addr) +
1899 (mm->end_data - mm->start_data) > rlim)
1900 goto out;
1901
1902 mm->start_brk = addr;
1903 break;
1904
1905 case PR_SET_MM_BRK:
1906 if (addr <= mm->end_data)
1907 goto out;
1908
1909 if (rlim < RLIM_INFINITY &&
1910 (addr - mm->start_brk) +
1911 (mm->end_data - mm->start_data) > rlim)
1912 goto out;
1913
1914 mm->brk = addr;
1915 break;
1916
1917 /*
1918 * If command line arguments and environment
1919 * are placed somewhere else on stack, we can
1920 * set them up here, ARG_START/END to setup
 1921 * command line arguments and ENV_START/END
1922 * for environment.
1923 */
1924 case PR_SET_MM_START_STACK:
1925 case PR_SET_MM_ARG_START:
1926 case PR_SET_MM_ARG_END:
1927 case PR_SET_MM_ENV_START:
1928 case PR_SET_MM_ENV_END:
1929 if (!vma) {
1930 error = -EFAULT;
1931 goto out;
1932 }
1933 if (opt == PR_SET_MM_START_STACK)
1934 mm->start_stack = addr;
1935 else if (opt == PR_SET_MM_ARG_START)
1936 mm->arg_start = addr;
1937 else if (opt == PR_SET_MM_ARG_END)
1938 mm->arg_end = addr;
1939 else if (opt == PR_SET_MM_ENV_START)
1940 mm->env_start = addr;
1941 else if (opt == PR_SET_MM_ENV_END)
1942 mm->env_end = addr;
1943 break;
1944
1945 /*
1946 * This doesn't move auxiliary vector itself
1947 * since it's pinned to mm_struct, but allow
1948 * to fill vector with new values. It's up
1949 * to a caller to provide sane values here
1950 * otherwise user space tools which use this
1951 * vector might be unhappy.
1952 */
1953 case PR_SET_MM_AUXV: {
1954 unsigned long user_auxv[AT_VECTOR_SIZE];
1955
1956 if (arg4 > sizeof(user_auxv))
1957 goto out;
1958 up_read(&mm->mmap_sem);
1959
1960 if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
1961 return -EFAULT;
1962
1963 /* Make sure the last entry is always AT_NULL */
1964 user_auxv[AT_VECTOR_SIZE - 2] = 0;
1965 user_auxv[AT_VECTOR_SIZE - 1] = 0;
1966
1967 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1968
1969 task_lock(current);
1970 memcpy(mm->saved_auxv, user_auxv, arg4);
1971 task_unlock(current);
1972
1973 return 0;
1974 }
1975 default:
1976 goto out;
1977 }
1978
1979 error = 0;
1980out:
1981 up_read(&mm->mmap_sem);
1982 return error;
1983}
1984
1985static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1986{
1987 return put_user(me->clear_child_tid, tid_addr);
1988}
1989
1990#else /* CONFIG_CHECKPOINT_RESTORE */
1991static int prctl_set_mm(int opt, unsigned long addr,
1992 unsigned long arg4, unsigned long arg5)
1993{
1994 return -EINVAL;
1995}
1996static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1997{
1998 return -EINVAL;
1999}
2000#endif
2001
2002SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1692SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2003 unsigned long, arg4, unsigned long, arg5) 1693 unsigned long, arg4, unsigned long, arg5)
2004{ 1694{
@@ -2018,6 +1708,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2018 break; 1708 break;
2019 } 1709 }
2020 me->pdeath_signal = arg2; 1710 me->pdeath_signal = arg2;
1711 error = 0;
2021 break; 1712 break;
2022 case PR_GET_PDEATHSIG: 1713 case PR_GET_PDEATHSIG:
2023 error = put_user(me->pdeath_signal, (int __user *)arg2); 1714 error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2031,6 +1722,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2031 break; 1722 break;
2032 } 1723 }
2033 set_dumpable(me->mm, arg2); 1724 set_dumpable(me->mm, arg2);
1725 error = 0;
2034 break; 1726 break;
2035 1727
2036 case PR_SET_UNALIGN: 1728 case PR_SET_UNALIGN:
@@ -2057,32 +1749,35 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2057 case PR_SET_TIMING: 1749 case PR_SET_TIMING:
2058 if (arg2 != PR_TIMING_STATISTICAL) 1750 if (arg2 != PR_TIMING_STATISTICAL)
2059 error = -EINVAL; 1751 error = -EINVAL;
1752 else
1753 error = 0;
2060 break; 1754 break;
1755
2061 case PR_SET_NAME: 1756 case PR_SET_NAME:
2062 comm[sizeof(me->comm)-1] = 0; 1757 comm[sizeof(me->comm)-1] = 0;
2063 if (strncpy_from_user(comm, (char __user *)arg2, 1758 if (strncpy_from_user(comm, (char __user *)arg2,
2064 sizeof(me->comm) - 1) < 0) 1759 sizeof(me->comm) - 1) < 0)
2065 return -EFAULT; 1760 return -EFAULT;
2066 set_task_comm(me, comm); 1761 set_task_comm(me, comm);
2067 proc_comm_connector(me); 1762 return 0;
2068 break;
2069 case PR_GET_NAME: 1763 case PR_GET_NAME:
2070 get_task_comm(comm, me); 1764 get_task_comm(comm, me);
2071 if (copy_to_user((char __user *)arg2, comm, 1765 if (copy_to_user((char __user *)arg2, comm,
2072 sizeof(comm))) 1766 sizeof(comm)))
2073 return -EFAULT; 1767 return -EFAULT;
2074 break; 1768 return 0;
2075 case PR_GET_ENDIAN: 1769 case PR_GET_ENDIAN:
2076 error = GET_ENDIAN(me, arg2); 1770 error = GET_ENDIAN(me, arg2);
2077 break; 1771 break;
2078 case PR_SET_ENDIAN: 1772 case PR_SET_ENDIAN:
2079 error = SET_ENDIAN(me, arg2); 1773 error = SET_ENDIAN(me, arg2);
2080 break; 1774 break;
1775
2081 case PR_GET_SECCOMP: 1776 case PR_GET_SECCOMP:
2082 error = prctl_get_seccomp(); 1777 error = prctl_get_seccomp();
2083 break; 1778 break;
2084 case PR_SET_SECCOMP: 1779 case PR_SET_SECCOMP:
2085 error = prctl_set_seccomp(arg2, (char __user *)arg3); 1780 error = prctl_set_seccomp(arg2);
2086 break; 1781 break;
2087 case PR_GET_TSC: 1782 case PR_GET_TSC:
2088 error = GET_TSC_CTL(arg2); 1783 error = GET_TSC_CTL(arg2);
@@ -2105,6 +1800,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2105 current->default_timer_slack_ns; 1800 current->default_timer_slack_ns;
2106 else 1801 else
2107 current->timer_slack_ns = arg2; 1802 current->timer_slack_ns = arg2;
1803 error = 0;
2108 break; 1804 break;
2109 case PR_MCE_KILL: 1805 case PR_MCE_KILL:
2110 if (arg4 | arg5) 1806 if (arg4 | arg5)
@@ -2130,6 +1826,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2130 default: 1826 default:
2131 return -EINVAL; 1827 return -EINVAL;
2132 } 1828 }
1829 error = 0;
2133 break; 1830 break;
2134 case PR_MCE_KILL_GET: 1831 case PR_MCE_KILL_GET:
2135 if (arg2 | arg3 | arg4 | arg5) 1832 if (arg2 | arg3 | arg4 | arg5)
@@ -2140,29 +1837,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2140 else 1837 else
2141 error = PR_MCE_KILL_DEFAULT; 1838 error = PR_MCE_KILL_DEFAULT;
2142 break; 1839 break;
2143 case PR_SET_MM:
2144 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2145 break;
2146 case PR_GET_TID_ADDRESS:
2147 error = prctl_get_tid_address(me, (int __user **)arg2);
2148 break;
2149 case PR_SET_CHILD_SUBREAPER:
2150 me->signal->is_child_subreaper = !!arg2;
2151 break;
2152 case PR_GET_CHILD_SUBREAPER:
2153 error = put_user(me->signal->is_child_subreaper,
2154 (int __user *) arg2);
2155 break;
2156 case PR_SET_NO_NEW_PRIVS:
2157 if (arg2 != 1 || arg3 || arg4 || arg5)
2158 return -EINVAL;
2159
2160 current->no_new_privs = 1;
2161 break;
2162 case PR_GET_NO_NEW_PRIVS:
2163 if (arg2 || arg3 || arg4 || arg5)
2164 return -EINVAL;
2165 return current->no_new_privs ? 1 : 0;
2166 default: 1840 default:
2167 error = -EINVAL; 1841 error = -EINVAL;
2168 break; 1842 break;
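
The PR_SET_NAME / PR_GET_NAME cases touched above are exercised from userspace with plain prctl() calls; a small standalone example (the task name is arbitrary):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	char comm[16] = "";	/* task names are at most 15 chars plus NUL */

	prctl(PR_SET_NAME, "demo-task", 0, 0, 0);
	prctl(PR_GET_NAME, comm, 0, 0, 0);
	printf("comm=%s\n", comm);
	return 0;
}
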
@@ -2189,52 +1863,49 @@ static void argv_cleanup(struct subprocess_info *info)
2189 argv_free(info->argv); 1863 argv_free(info->argv);
2190} 1864}
2191 1865
2192static int __orderly_poweroff(void) 1866/**
1867 * orderly_poweroff - Trigger an orderly system poweroff
1868 * @force: force poweroff if command execution fails
1869 *
1870 * This may be called from any context to trigger a system shutdown.
1871 * If the orderly shutdown fails, it will force an immediate shutdown.
1872 */
1873int orderly_poweroff(bool force)
2193{ 1874{
2194 int argc; 1875 int argc;
2195 char **argv; 1876 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2196 static char *envp[] = { 1877 static char *envp[] = {
2197 "HOME=/", 1878 "HOME=/",
2198 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", 1879 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2199 NULL 1880 NULL
2200 }; 1881 };
2201 int ret; 1882 int ret = -ENOMEM;
1883 struct subprocess_info *info;
2202 1884
2203 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2204 if (argv == NULL) { 1885 if (argv == NULL) {
2205 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 1886 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2206 __func__, poweroff_cmd); 1887 __func__, poweroff_cmd);
2207 return -ENOMEM; 1888 goto out;
2208 } 1889 }
2209 1890
2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, 1891 info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
2211 NULL, argv_cleanup, NULL); 1892 if (info == NULL) {
2212 if (ret == -ENOMEM)
2213 argv_free(argv); 1893 argv_free(argv);
1894 goto out;
1895 }
2214 1896
2215 return ret; 1897 call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
2216}
2217 1898
2218/** 1899 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
2219 * orderly_poweroff - Trigger an orderly system poweroff
2220 * @force: force poweroff if command execution fails
2221 *
2222 * This may be called from any context to trigger a system shutdown.
2223 * If the orderly shutdown fails, it will force an immediate shutdown.
2224 */
2225int orderly_poweroff(bool force)
2226{
2227 int ret = __orderly_poweroff();
2228 1900
1901 out:
2229 if (ret && force) { 1902 if (ret && force) {
2230 printk(KERN_WARNING "Failed to start orderly shutdown: " 1903 printk(KERN_WARNING "Failed to start orderly shutdown: "
2231 "forcing the issue\n"); 1904 "forcing the issue\n");
2232 1905
2233 /* 1906 /* I guess this should try to kick off some daemon to
2234 * I guess this should try to kick off some daemon to sync and 1907 sync and poweroff asap. Or not even bother syncing
2235 * poweroff asap. Or not even bother syncing if we're doing an 1908 if we're doing an emergency shutdown? */
2236 * emergency shutdown?
2237 */
2238 emergency_sync(); 1909 emergency_sync();
2239 kernel_power_off(); 1910 kernel_power_off();
2240 } 1911 }
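
The right-hand side of this hunk restores the three-step usermode-helper API: set up the request, attach any cleanup callbacks, then execute without waiting. A condensed sketch using the same calls, with run_poweroff_cmd() and the argv/envp contents as illustrative values:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/kmod.h>

static int run_poweroff_cmd(void)
{
	static char *argv[] = { "/sbin/poweroff", NULL };
	static char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
	if (!info)
		return -ENOMEM;

	/* No init/cleanup callbacks needed for static argv/envp. */
	call_usermodehelper_setfns(info, NULL, NULL, NULL);
	return call_usermodehelper_exec(info, UMH_NO_WAIT);
}
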
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 395084d4ce1..a9a5de07c4f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,7 +25,6 @@ cond_syscall(sys_swapoff);
25cond_syscall(sys_kexec_load); 25cond_syscall(sys_kexec_load);
26cond_syscall(compat_sys_kexec_load); 26cond_syscall(compat_sys_kexec_load);
27cond_syscall(sys_init_module); 27cond_syscall(sys_init_module);
28cond_syscall(sys_finit_module);
29cond_syscall(sys_delete_module); 28cond_syscall(sys_delete_module);
30cond_syscall(sys_socketpair); 29cond_syscall(sys_socketpair);
31cond_syscall(sys_bind); 30cond_syscall(sys_bind);
@@ -146,10 +145,6 @@ cond_syscall(sys_io_submit);
146cond_syscall(sys_io_cancel); 145cond_syscall(sys_io_cancel);
147cond_syscall(sys_io_getevents); 146cond_syscall(sys_io_getevents);
148cond_syscall(sys_syslog); 147cond_syscall(sys_syslog);
149cond_syscall(sys_process_vm_readv);
150cond_syscall(sys_process_vm_writev);
151cond_syscall(compat_sys_process_vm_readv);
152cond_syscall(compat_sys_process_vm_writev);
153 148
154/* arch-specific weak syscall entries */ 149/* arch-specific weak syscall entries */
155cond_syscall(sys_pciconfig_read); 150cond_syscall(sys_pciconfig_read);
@@ -204,6 +199,3 @@ cond_syscall(sys_fanotify_mark);
204cond_syscall(sys_name_to_handle_at); 199cond_syscall(sys_name_to_handle_at);
205cond_syscall(sys_open_by_handle_at); 200cond_syscall(sys_open_by_handle_at);
206cond_syscall(compat_sys_open_by_handle_at); 201cond_syscall(compat_sys_open_by_handle_at);
207
208/* compare kernel pointers */
209cond_syscall(sys_kcmp);
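
cond_syscall() marks a syscall slot as optional: if the real implementation is configured out, the symbol still resolves, to sys_ni_syscall(), which simply returns -ENOSYS. A sketch, assuming the conventional weak-alias definition from <linux/linkage.h> of this era (sys_example_call is hypothetical):

/* Assumed definition, shown only to make the mechanism concrete. */
#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")

cond_syscall(sys_example_call);		/* resolves to sys_ni_syscall if undefined */
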
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c88878db491..fd15163f360 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,14 +23,12 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/bitmap.h>
27#include <linux/signal.h> 26#include <linux/signal.h>
28#include <linux/printk.h> 27#include <linux/printk.h>
29#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
30#include <linux/security.h> 29#include <linux/security.h>
31#include <linux/ctype.h> 30#include <linux/ctype.h>
32#include <linux/kmemcheck.h> 31#include <linux/kmemcheck.h>
33#include <linux/kmemleak.h>
34#include <linux/fs.h> 32#include <linux/fs.h>
35#include <linux/init.h> 33#include <linux/init.h>
36#include <linux/kernel.h> 34#include <linux/kernel.h>
@@ -59,8 +57,6 @@
59#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
60#include <linux/oom.h> 58#include <linux/oom.h>
61#include <linux/kmod.h> 59#include <linux/kmod.h>
62#include <linux/capability.h>
63#include <linux/binfmts.h>
64 60
65#include <asm/uaccess.h> 61#include <asm/uaccess.h>
66#include <asm/processor.h> 62#include <asm/processor.h>
@@ -70,9 +66,6 @@
70#include <asm/stacktrace.h> 66#include <asm/stacktrace.h>
71#include <asm/io.h> 67#include <asm/io.h>
72#endif 68#endif
73#ifdef CONFIG_SPARC
74#include <asm/setup.h>
75#endif
76#ifdef CONFIG_BSD_PROCESS_ACCT 69#ifdef CONFIG_BSD_PROCESS_ACCT
77#include <linux/acct.h> 70#include <linux/acct.h>
78#endif 71#endif
@@ -97,14 +90,13 @@
97extern int sysctl_overcommit_memory; 90extern int sysctl_overcommit_memory;
98extern int sysctl_overcommit_ratio; 91extern int sysctl_overcommit_ratio;
99extern int max_threads; 92extern int max_threads;
100extern int suid_dumpable;
101#ifdef CONFIG_COREDUMP
102extern int core_uses_pid; 93extern int core_uses_pid;
94extern int suid_dumpable;
103extern char core_pattern[]; 95extern char core_pattern[];
104extern unsigned int core_pipe_limit; 96extern unsigned int core_pipe_limit;
105#endif
106extern int pid_max; 97extern int pid_max;
107extern int min_free_kbytes; 98extern int min_free_kbytes;
99extern int min_free_order_shift;
108extern int pid_max_min, pid_max_max; 100extern int pid_max_min, pid_max_max;
109extern int sysctl_drop_caches; 101extern int sysctl_drop_caches;
110extern int percpu_pagelist_fraction; 102extern int percpu_pagelist_fraction;
@@ -143,12 +135,12 @@ static int minolduid;
143static int min_percpu_pagelist_fract = 8; 135static int min_percpu_pagelist_fract = 8;
144 136
145static int ngroups_max = NGROUPS_MAX; 137static int ngroups_max = NGROUPS_MAX;
146static const int cap_last_cap = CAP_LAST_CAP;
147 138
148#ifdef CONFIG_INOTIFY_USER 139#ifdef CONFIG_INOTIFY_USER
149#include <linux/inotify.h> 140#include <linux/inotify.h>
150#endif 141#endif
151#ifdef CONFIG_SPARC 142#ifdef CONFIG_SPARC
143#include <asm/system.h>
152#endif 144#endif
153 145
154#ifdef CONFIG_SPARC64 146#ifdef CONFIG_SPARC64
@@ -160,6 +152,14 @@ extern int pwrsw_enabled;
160extern int unaligned_enabled; 152extern int unaligned_enabled;
161#endif 153#endif
162 154
155#ifdef CONFIG_S390
156#ifdef CONFIG_MATHEMU
157extern int sysctl_ieee_emulation_warnings;
158#endif
159extern int sysctl_userprocess_debug;
160extern int spin_retry;
161#endif
162
163#ifdef CONFIG_IA64 163#ifdef CONFIG_IA64
164extern int no_unaligned_warning; 164extern int no_unaligned_warning;
165extern int unaligned_dump_stack; 165extern int unaligned_dump_stack;
@@ -173,17 +173,10 @@ static int proc_taint(struct ctl_table *table, int write,
173#endif 173#endif
174 174
175#ifdef CONFIG_PRINTK 175#ifdef CONFIG_PRINTK
176static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, 176static int proc_dmesg_restrict(struct ctl_table *table, int write,
177 void __user *buffer, size_t *lenp, loff_t *ppos); 177 void __user *buffer, size_t *lenp, loff_t *ppos);
178#endif 178#endif
179 179
180static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
181 void __user *buffer, size_t *lenp, loff_t *ppos);
182#ifdef CONFIG_COREDUMP
183static int proc_dostring_coredump(struct ctl_table *table, int write,
184 void __user *buffer, size_t *lenp, loff_t *ppos);
185#endif
186
187#ifdef CONFIG_MAGIC_SYSRQ 180#ifdef CONFIG_MAGIC_SYSRQ
188/* Note: sysrq code uses it's own private copy */ 181/* Note: sysrq code uses it's own private copy */
189static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 182static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -206,6 +199,20 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
206 199
207#endif 200#endif
208 201
202static struct ctl_table root_table[];
203static struct ctl_table_root sysctl_table_root;
204static struct ctl_table_header root_table_header = {
205 {{.count = 1,
206 .ctl_table = root_table,
207 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
208 .root = &sysctl_table_root,
209 .set = &sysctl_table_root.default_set,
210};
211static struct ctl_table_root sysctl_table_root = {
212 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
213 .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
214};
215
209static struct ctl_table kern_table[]; 216static struct ctl_table kern_table[];
210static struct ctl_table vm_table[]; 217static struct ctl_table vm_table[];
211static struct ctl_table fs_table[]; 218static struct ctl_table fs_table[];
@@ -222,7 +229,7 @@ int sysctl_legacy_va_layout;
222 229
223/* The default sysctl tables: */ 230/* The default sysctl tables: */
224 231
225static struct ctl_table sysctl_base_table[] = { 232static struct ctl_table root_table[] = {
226 { 233 {
227 .procname = "kernel", 234 .procname = "kernel",
228 .mode = 0555, 235 .mode = 0555,
@@ -256,11 +263,9 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
256static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 263static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
257static int min_wakeup_granularity_ns; /* 0 usecs */ 264static int min_wakeup_granularity_ns; /* 0 usecs */
258static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 265static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
259#ifdef CONFIG_SMP
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 266static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 267static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262#endif /* CONFIG_SMP */ 268#endif
263#endif /* CONFIG_SCHED_DEBUG */
264 269
265#ifdef CONFIG_COMPACTION 270#ifdef CONFIG_COMPACTION
266static int min_extfrag_threshold; 271static int min_extfrag_threshold;
@@ -303,7 +308,6 @@ static struct ctl_table kern_table[] = {
303 .extra1 = &min_wakeup_granularity_ns, 308 .extra1 = &min_wakeup_granularity_ns,
304 .extra2 = &max_wakeup_granularity_ns, 309 .extra2 = &max_wakeup_granularity_ns,
305 }, 310 },
306#ifdef CONFIG_SMP
307 { 311 {
308 .procname = "sched_tunable_scaling", 312 .procname = "sched_tunable_scaling",
309 .data = &sysctl_sched_tunable_scaling, 313 .data = &sysctl_sched_tunable_scaling,
@@ -314,7 +318,7 @@ static struct ctl_table kern_table[] = {
314 .extra2 = &max_sched_tunable_scaling, 318 .extra2 = &max_sched_tunable_scaling,
315 }, 319 },
316 { 320 {
317 .procname = "sched_migration_cost_ns", 321 .procname = "sched_migration_cost",
318 .data = &sysctl_sched_migration_cost, 322 .data = &sysctl_sched_migration_cost,
319 .maxlen = sizeof(unsigned int), 323 .maxlen = sizeof(unsigned int),
320 .mode = 0644, 324 .mode = 0644,
@@ -328,14 +332,14 @@ static struct ctl_table kern_table[] = {
328 .proc_handler = proc_dointvec, 332 .proc_handler = proc_dointvec,
329 }, 333 },
330 { 334 {
331 .procname = "sched_time_avg_ms", 335 .procname = "sched_time_avg",
332 .data = &sysctl_sched_time_avg, 336 .data = &sysctl_sched_time_avg,
333 .maxlen = sizeof(unsigned int), 337 .maxlen = sizeof(unsigned int),
334 .mode = 0644, 338 .mode = 0644,
335 .proc_handler = proc_dointvec, 339 .proc_handler = proc_dointvec,
336 }, 340 },
337 { 341 {
338 .procname = "sched_shares_window_ns", 342 .procname = "sched_shares_window",
339 .data = &sysctl_sched_shares_window, 343 .data = &sysctl_sched_shares_window,
340 .maxlen = sizeof(unsigned int), 344 .maxlen = sizeof(unsigned int),
341 .mode = 0644, 345 .mode = 0644,
@@ -350,45 +354,7 @@ static struct ctl_table kern_table[] = {
350 .extra1 = &zero, 354 .extra1 = &zero,
351 .extra2 = &one, 355 .extra2 = &one,
352 }, 356 },
353#endif /* CONFIG_SMP */ 357#endif
354#ifdef CONFIG_NUMA_BALANCING
355 {
356 .procname = "numa_balancing_scan_delay_ms",
357 .data = &sysctl_numa_balancing_scan_delay,
358 .maxlen = sizeof(unsigned int),
359 .mode = 0644,
360 .proc_handler = proc_dointvec,
361 },
362 {
363 .procname = "numa_balancing_scan_period_min_ms",
364 .data = &sysctl_numa_balancing_scan_period_min,
365 .maxlen = sizeof(unsigned int),
366 .mode = 0644,
367 .proc_handler = proc_dointvec,
368 },
369 {
370 .procname = "numa_balancing_scan_period_reset",
371 .data = &sysctl_numa_balancing_scan_period_reset,
372 .maxlen = sizeof(unsigned int),
373 .mode = 0644,
374 .proc_handler = proc_dointvec,
375 },
376 {
377 .procname = "numa_balancing_scan_period_max_ms",
378 .data = &sysctl_numa_balancing_scan_period_max,
379 .maxlen = sizeof(unsigned int),
380 .mode = 0644,
381 .proc_handler = proc_dointvec,
382 },
383 {
384 .procname = "numa_balancing_scan_size_mb",
385 .data = &sysctl_numa_balancing_scan_size,
386 .maxlen = sizeof(unsigned int),
387 .mode = 0644,
388 .proc_handler = proc_dointvec,
389 },
390#endif /* CONFIG_NUMA_BALANCING */
391#endif /* CONFIG_SCHED_DEBUG */
392 { 358 {
393 .procname = "sched_rt_period_us", 359 .procname = "sched_rt_period_us",
394 .data = &sysctl_sched_rt_period, 360 .data = &sysctl_sched_rt_period,
@@ -414,16 +380,6 @@ static struct ctl_table kern_table[] = {
414 .extra2 = &one, 380 .extra2 = &one,
415 }, 381 },
416#endif 382#endif
417#ifdef CONFIG_CFS_BANDWIDTH
418 {
419 .procname = "sched_cfs_bandwidth_slice_us",
420 .data = &sysctl_sched_cfs_bandwidth_slice,
421 .maxlen = sizeof(unsigned int),
422 .mode = 0644,
423 .proc_handler = proc_dointvec_minmax,
424 .extra1 = &one,
425 },
426#endif
427#ifdef CONFIG_PROVE_LOCKING 383#ifdef CONFIG_PROVE_LOCKING
428 { 384 {
429 .procname = "prove_locking", 385 .procname = "prove_locking",
@@ -449,7 +405,6 @@ static struct ctl_table kern_table[] = {
449 .mode = 0644, 405 .mode = 0644,
450 .proc_handler = proc_dointvec, 406 .proc_handler = proc_dointvec,
451 }, 407 },
452#ifdef CONFIG_COREDUMP
453 { 408 {
454 .procname = "core_uses_pid", 409 .procname = "core_uses_pid",
455 .data = &core_uses_pid, 410 .data = &core_uses_pid,
@@ -462,7 +417,7 @@ static struct ctl_table kern_table[] = {
462 .data = core_pattern, 417 .data = core_pattern,
463 .maxlen = CORENAME_MAX_SIZE, 418 .maxlen = CORENAME_MAX_SIZE,
464 .mode = 0644, 419 .mode = 0644,
465 .proc_handler = proc_dostring_coredump, 420 .proc_handler = proc_dostring,
466 }, 421 },
467 { 422 {
468 .procname = "core_pipe_limit", 423 .procname = "core_pipe_limit",
@@ -471,7 +426,6 @@ static struct ctl_table kern_table[] = {
471 .mode = 0644, 426 .mode = 0644,
472 .proc_handler = proc_dointvec, 427 .proc_handler = proc_dointvec,
473 }, 428 },
474#endif
475#ifdef CONFIG_PROC_SYSCTL 429#ifdef CONFIG_PROC_SYSCTL
476 { 430 {
477 .procname = "tainted", 431 .procname = "tainted",
@@ -606,7 +560,7 @@ static struct ctl_table kern_table[] = {
606 .extra2 = &one, 560 .extra2 = &one,
607 }, 561 },
608#endif 562#endif
609 563#ifdef CONFIG_HOTPLUG
610 { 564 {
611 .procname = "hotplug", 565 .procname = "hotplug",
612 .data = &uevent_helper, 566 .data = &uevent_helper,
@@ -614,7 +568,7 @@ static struct ctl_table kern_table[] = {
614 .mode = 0644, 568 .mode = 0644,
615 .proc_handler = proc_dostring, 569 .proc_handler = proc_dostring,
616 }, 570 },
617 571#endif
618#ifdef CONFIG_CHR_DEV_SG 572#ifdef CONFIG_CHR_DEV_SG
619 { 573 {
620 .procname = "sg-big-buff", 574 .procname = "sg-big-buff",
@@ -756,7 +710,7 @@ static struct ctl_table kern_table[] = {
756 .data = &dmesg_restrict, 710 .data = &dmesg_restrict,
757 .maxlen = sizeof(int), 711 .maxlen = sizeof(int),
758 .mode = 0644, 712 .mode = 0644,
759 .proc_handler = proc_dointvec_minmax_sysadmin, 713 .proc_handler = proc_dointvec_minmax,
760 .extra1 = &zero, 714 .extra1 = &zero,
761 .extra2 = &one, 715 .extra2 = &one,
762 }, 716 },
@@ -765,7 +719,7 @@ static struct ctl_table kern_table[] = {
765 .data = &kptr_restrict, 719 .data = &kptr_restrict,
766 .maxlen = sizeof(int), 720 .maxlen = sizeof(int),
767 .mode = 0644, 721 .mode = 0644,
768 .proc_handler = proc_dointvec_minmax_sysadmin, 722 .proc_handler = proc_dmesg_restrict,
769 .extra1 = &zero, 723 .extra1 = &zero,
770 .extra2 = &two, 724 .extra2 = &two,
771 }, 725 },
@@ -777,13 +731,6 @@ static struct ctl_table kern_table[] = {
777 .mode = 0444, 731 .mode = 0444,
778 .proc_handler = proc_dointvec, 732 .proc_handler = proc_dointvec,
779 }, 733 },
780 {
781 .procname = "cap_last_cap",
782 .data = (void *)&cap_last_cap,
783 .maxlen = sizeof(int),
784 .mode = 0444,
785 .proc_handler = proc_dointvec,
786 },
787#if defined(CONFIG_LOCKUP_DETECTOR) 734#if defined(CONFIG_LOCKUP_DETECTOR)
788 { 735 {
789 .procname = "watchdog", 736 .procname = "watchdog",
@@ -846,15 +793,6 @@ static struct ctl_table kern_table[] = {
846 .mode = 0644, 793 .mode = 0644,
847 .proc_handler = proc_dointvec, 794 .proc_handler = proc_dointvec,
848 }, 795 },
849#ifdef CONFIG_DEBUG_STACKOVERFLOW
850 {
851 .procname = "panic_on_stackoverflow",
852 .data = &sysctl_panic_on_stackoverflow,
853 .maxlen = sizeof(int),
854 .mode = 0644,
855 .proc_handler = proc_dointvec,
856 },
857#endif
858 { 796 {
859 .procname = "bootloader_type", 797 .procname = "bootloader_type",
860 .data = &bootloader_type, 798 .data = &bootloader_type,
@@ -1148,9 +1086,11 @@ static struct ctl_table vm_table[] = {
1148 .extra1 = &zero, 1086 .extra1 = &zero,
1149 }, 1087 },
1150 { 1088 {
1151 .procname = "nr_pdflush_threads", 1089 .procname = "nr_pdflush_threads",
1152 .mode = 0444 /* read-only */, 1090 .data = &nr_pdflush_threads,
1153 .proc_handler = pdflush_proc_obsolete, 1091 .maxlen = sizeof nr_pdflush_threads,
1092 .mode = 0444 /* read-only*/,
1093 .proc_handler = proc_dointvec,
1154 }, 1094 },
1155 { 1095 {
1156 .procname = "swappiness", 1096 .procname = "swappiness",
@@ -1250,6 +1190,13 @@ static struct ctl_table vm_table[] = {
1250 .extra1 = &zero, 1190 .extra1 = &zero,
1251 }, 1191 },
1252 { 1192 {
1193 .procname = "min_free_order_shift",
1194 .data = &min_free_order_shift,
1195 .maxlen = sizeof(min_free_order_shift),
1196 .mode = 0644,
1197 .proc_handler = &proc_dointvec
1198 },
1199 {
1253 .procname = "percpu_pagelist_fraction", 1200 .procname = "percpu_pagelist_fraction",
1254 .data = &percpu_pagelist_fraction, 1201 .data = &percpu_pagelist_fraction,
1255 .maxlen = sizeof(percpu_pagelist_fraction), 1202 .maxlen = sizeof(percpu_pagelist_fraction),
@@ -1545,29 +1492,11 @@ static struct ctl_table fs_table[] = {
1545#endif 1492#endif
1546#endif 1493#endif
1547 { 1494 {
1548 .procname = "protected_symlinks",
1549 .data = &sysctl_protected_symlinks,
1550 .maxlen = sizeof(int),
1551 .mode = 0600,
1552 .proc_handler = proc_dointvec_minmax,
1553 .extra1 = &zero,
1554 .extra2 = &one,
1555 },
1556 {
1557 .procname = "protected_hardlinks",
1558 .data = &sysctl_protected_hardlinks,
1559 .maxlen = sizeof(int),
1560 .mode = 0600,
1561 .proc_handler = proc_dointvec_minmax,
1562 .extra1 = &zero,
1563 .extra2 = &one,
1564 },
1565 {
1566 .procname = "suid_dumpable", 1495 .procname = "suid_dumpable",
1567 .data = &suid_dumpable, 1496 .data = &suid_dumpable,
1568 .maxlen = sizeof(int), 1497 .maxlen = sizeof(int),
1569 .mode = 0644, 1498 .mode = 0644,
1570 .proc_handler = proc_dointvec_minmax_coredump, 1499 .proc_handler = proc_dointvec_minmax,
1571 .extra1 = &zero, 1500 .extra1 = &zero,
1572 .extra2 = &two, 1501 .extra2 = &two,
1573 }, 1502 },
@@ -1590,7 +1519,8 @@ static struct ctl_table fs_table[] = {
1590}; 1519};
1591 1520
1592static struct ctl_table debug_table[] = { 1521static struct ctl_table debug_table[] = {
1593#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE 1522#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1523 defined(CONFIG_S390) || defined(CONFIG_TILE)
1594 { 1524 {
1595 .procname = "exception-trace", 1525 .procname = "exception-trace",
1596 .data = &show_unhandled_signals, 1526 .data = &show_unhandled_signals,
@@ -1617,15 +1547,490 @@ static struct ctl_table dev_table[] = {
1617 { } 1547 { }
1618}; 1548};
1619 1549
1620int __init sysctl_init(void) 1550static DEFINE_SPINLOCK(sysctl_lock);
1551
1552/* called under sysctl_lock */
1553static int use_table(struct ctl_table_header *p)
1554{
1555 if (unlikely(p->unregistering))
1556 return 0;
1557 p->used++;
1558 return 1;
1559}
1560
1561/* called under sysctl_lock */
1562static void unuse_table(struct ctl_table_header *p)
1563{
1564 if (!--p->used)
1565 if (unlikely(p->unregistering))
1566 complete(p->unregistering);
1567}
1568
1569/* called under sysctl_lock, will reacquire if has to wait */
1570static void start_unregistering(struct ctl_table_header *p)
1571{
1572 /*
1573 * if p->used is 0, nobody will ever touch that entry again;
1574 * we'll eliminate all paths to it before dropping sysctl_lock
1575 */
1576 if (unlikely(p->used)) {
1577 struct completion wait;
1578 init_completion(&wait);
1579 p->unregistering = &wait;
1580 spin_unlock(&sysctl_lock);
1581 wait_for_completion(&wait);
1582 spin_lock(&sysctl_lock);
1583 } else {
1584 /* anything non-NULL; we'll never dereference it */
1585 p->unregistering = ERR_PTR(-EINVAL);
1586 }
1587 /*
1588 * do not remove from the list until nobody holds it; walking the
1589 * list in do_sysctl() relies on that.
1590 */
1591 list_del_init(&p->ctl_entry);
1592}
1593
1594void sysctl_head_get(struct ctl_table_header *head)
1595{
1596 spin_lock(&sysctl_lock);
1597 head->count++;
1598 spin_unlock(&sysctl_lock);
1599}
1600
1601void sysctl_head_put(struct ctl_table_header *head)
1602{
1603 spin_lock(&sysctl_lock);
1604 if (!--head->count)
1605 kfree_rcu(head, rcu);
1606 spin_unlock(&sysctl_lock);
1607}
1608
1609struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
1610{
1611 if (!head)
1612 BUG();
1613 spin_lock(&sysctl_lock);
1614 if (!use_table(head))
1615 head = ERR_PTR(-ENOENT);
1616 spin_unlock(&sysctl_lock);
1617 return head;
1618}
1619
1620void sysctl_head_finish(struct ctl_table_header *head)
1621{
1622 if (!head)
1623 return;
1624 spin_lock(&sysctl_lock);
1625 unuse_table(head);
1626 spin_unlock(&sysctl_lock);
1627}
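
use_table()/unuse_table() keep a per-header use count, and start_unregistering() parks the unregistering thread on a completion until that count drops to zero, so a table is never torn down under a concurrent reader. A condensed sketch of the calling pattern the proc side is expected to follow (the surrounding lookup code is not part of this hunk):

    struct ctl_table_header *head = sysctl_head_grab(h);   /* bumps ->used or fails        */

    if (IS_ERR(head))
            return PTR_ERR(head);                          /* header already unregistering */

    /* ->ctl_table may be dereferenced safely here; unregistration waits for us */

    sysctl_head_finish(head);                              /* drops ->used, may complete() */
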
1628
1629static struct ctl_table_set *
1630lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
1631{
1632 struct ctl_table_set *set = &root->default_set;
1633 if (root->lookup)
1634 set = root->lookup(root, namespaces);
1635 return set;
1636}
1637
1638static struct list_head *
1639lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1640{
1641 struct ctl_table_set *set = lookup_header_set(root, namespaces);
1642 return &set->list;
1643}
1644
1645struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
1646 struct ctl_table_header *prev)
1647{
1648 struct ctl_table_root *root;
1649 struct list_head *header_list;
1650 struct ctl_table_header *head;
1651 struct list_head *tmp;
1652
1653 spin_lock(&sysctl_lock);
1654 if (prev) {
1655 head = prev;
1656 tmp = &prev->ctl_entry;
1657 unuse_table(prev);
1658 goto next;
1659 }
1660 tmp = &root_table_header.ctl_entry;
1661 for (;;) {
1662 head = list_entry(tmp, struct ctl_table_header, ctl_entry);
1663
1664 if (!use_table(head))
1665 goto next;
1666 spin_unlock(&sysctl_lock);
1667 return head;
1668 next:
1669 root = head->root;
1670 tmp = tmp->next;
1671 header_list = lookup_header_list(root, namespaces);
1672 if (tmp != header_list)
1673 continue;
1674
1675 do {
1676 root = list_entry(root->root_list.next,
1677 struct ctl_table_root, root_list);
1678 if (root == &sysctl_table_root)
1679 goto out;
1680 header_list = lookup_header_list(root, namespaces);
1681 } while (list_empty(header_list));
1682 tmp = header_list->next;
1683 }
1684out:
1685 spin_unlock(&sysctl_lock);
1686 return NULL;
1687}
1688
1689struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1690{
1691 return __sysctl_head_next(current->nsproxy, prev);
1692}
1693
1694void register_sysctl_root(struct ctl_table_root *root)
1695{
1696 spin_lock(&sysctl_lock);
1697 list_add_tail(&root->root_list, &sysctl_table_root.root_list);
1698 spin_unlock(&sysctl_lock);
1699}
1700
1701/*
1702 * sysctl_perm does NOT grant the superuser all rights automatically, because
1703 * some sysctl variables are readonly even to root.
1704 */
1705
1706static int test_perm(int mode, int op)
1707{
1708 if (!current_euid())
1709 mode >>= 6;
1710 else if (in_egroup_p(0))
1711 mode >>= 3;
1712 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
1713 return 0;
1714 return -EACCES;
1715}
1716
1717int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1718{
1719 int mode;
1720
1721 if (root->permissions)
1722 mode = root->permissions(root, current->nsproxy, table);
1723 else
1724 mode = table->mode;
1725
1726 return test_perm(mode, op);
1727}
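
test_perm() reuses the familiar rwx triplet from the mode field: euid 0 is checked against the owner bits (mode >> 6), membership in gid 0 against the group bits (mode >> 3), and everyone else against the "other" bits. A standalone, compilable sketch of the same arithmetic, with the MAY_* values as defined in the kernel headers, to make the masking concrete:

    #include <stdio.h>

    #define MAY_EXEC  0x01
    #define MAY_WRITE 0x02
    #define MAY_READ  0x04

    static int test_perm(int mode, int op, int euid_is_root, int in_gid0)
    {
            if (euid_is_root)
                    mode >>= 6;                     /* owner bits */
            else if (in_gid0)
                    mode >>= 3;                     /* group bits */
            if ((op & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
                    return 0;
            return -13;                             /* -EACCES */
    }

    int main(void)
    {
            /* a 0644 entry: root may write, an unprivileged user may only read */
            printf("%d\n", test_perm(0644, MAY_WRITE, 1, 0));   /* 0   */
            printf("%d\n", test_perm(0644, MAY_WRITE, 0, 0));   /* -13 */
            return 0;
    }
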
1728
1729static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1621{ 1730{
1622 struct ctl_table_header *hdr; 1731 for (; table->procname; table++) {
1732 table->parent = parent;
1733 if (table->child)
1734 sysctl_set_parent(table, table->child);
1735 }
1736}
1623 1737
1624 hdr = register_sysctl_table(sysctl_base_table); 1738static __init int sysctl_init(void)
1625 kmemleak_not_leak(hdr); 1739{
1740 sysctl_set_parent(NULL, root_table);
1741#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1742 sysctl_check_table(current->nsproxy, root_table);
1743#endif
1626 return 0; 1744 return 0;
1627} 1745}
1628 1746
1747core_initcall(sysctl_init);
1748
1749static struct ctl_table *is_branch_in(struct ctl_table *branch,
1750 struct ctl_table *table)
1751{
1752 struct ctl_table *p;
1753 const char *s = branch->procname;
1754
1755 /* branch should have named subdirectory as its first element */
1756 if (!s || !branch->child)
1757 return NULL;
1758
1759 /* ... and nothing else */
1760 if (branch[1].procname)
1761 return NULL;
1762
1763 /* table should contain subdirectory with the same name */
1764 for (p = table; p->procname; p++) {
1765 if (!p->child)
1766 continue;
1767 if (p->procname && strcmp(p->procname, s) == 0)
1768 return p;
1769 }
1770 return NULL;
1771}
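
is_branch_in() only accepts a very specific shape: a table consisting of exactly one named directory entry (a procname plus a child pointer) followed by the sentinel, which it then tries to match against a subdirectory of the same name in the other table. Two illustrative tables, with invented names, showing what passes and what is rejected:

    static struct ctl_table net_children[] = { { } };   /* contents irrelevant here */

    /* accepted: one subdirectory entry, then the sentinel */
    static struct ctl_table branch_ok[] = {
            { .procname = "net", .mode = 0555, .child = net_children },
            { }
    };

    /* rejected: branch[1].procname is non-NULL, so the "... and nothing else" check fails */
    static struct ctl_table branch_rejected[] = {
            { .procname = "net",  .mode = 0555, .child = net_children },
            { .procname = "misc", .mode = 0644, .proc_handler = proc_dointvec },
            { }
    };
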
1772
1773/* see if attaching q to p would be an improvement */
1774static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1775{
1776 struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
1777 struct ctl_table *next;
1778 int is_better = 0;
1779 int not_in_parent = !p->attached_by;
1780
1781 while ((next = is_branch_in(by, to)) != NULL) {
1782 if (by == q->attached_by)
1783 is_better = 1;
1784 if (to == p->attached_by)
1785 not_in_parent = 1;
1786 by = by->child;
1787 to = next->child;
1788 }
1789
1790 if (is_better && not_in_parent) {
1791 q->attached_by = by;
1792 q->attached_to = to;
1793 q->parent = p;
1794 }
1795}
1796
1797/**
1798 * __register_sysctl_paths - register a sysctl hierarchy
1799 * @root: List of sysctl headers to register on
1800 * @namespaces: Data to compute which lists of sysctl entries are visible
1801 * @path: The path to the directory the sysctl table is in.
1802 * @table: the top-level table structure
1803 *
1804 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1805 * array. A completely 0 filled entry terminates the table.
1806 *
1807 * The members of the &struct ctl_table structure are used as follows:
1808 *
1809 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1810 * enter a sysctl file
1811 *
1812 * data - a pointer to data for use by proc_handler
1813 *
1814 * maxlen - the maximum size in bytes of the data
1815 *
1816 * mode - the file permissions for the /proc/sys file, and for sysctl(2)
1817 *
1818 * child - a pointer to the child sysctl table if this entry is a directory, or
1819 * %NULL.
1820 *
1821 * proc_handler - the text handler routine (described below)
1822 *
1823 * de - for internal use by the sysctl routines
1824 *
1825 * extra1, extra2 - extra pointers usable by the proc handler routines
1826 *
1827 * Leaf nodes in the sysctl tree will be represented by a single file
1828 * under /proc; non-leaf nodes will be represented by directories.
1829 *
1830 * sysctl(2) can automatically manage read and write requests through
1831 * the sysctl table. The data and maxlen fields of the ctl_table
1832 * struct enable minimal validation of the values being written to be
1833 * performed, and the mode field allows minimal authentication.
1834 *
1835 * There must be a proc_handler routine for any terminal nodes
1836 * mirrored under /proc/sys (non-terminals are handled by a built-in
1837 * directory handler). Several default handlers are available to
1838 * cover common cases -
1839 *
1840 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1841 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1842 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1843 *
1844 * It is the handler's job to read the input buffer from user memory
1845 * and process it. The handler should return 0 on success.
1846 *
1847 * This routine returns %NULL on a failure to register, and a pointer
1848 * to the table header on success.
1849 */
1850struct ctl_table_header *__register_sysctl_paths(
1851 struct ctl_table_root *root,
1852 struct nsproxy *namespaces,
1853 const struct ctl_path *path, struct ctl_table *table)
1854{
1855 struct ctl_table_header *header;
1856 struct ctl_table *new, **prevp;
1857 unsigned int n, npath;
1858 struct ctl_table_set *set;
1859
1860 /* Count the path components */
1861 for (npath = 0; path[npath].procname; ++npath)
1862 ;
1863
1864 /*
1865 * For each path component, allocate a 2-element ctl_table array.
1866 * The first array element will be filled with the sysctl entry
1867 * for this, the second will be the sentinel (procname == 0).
1868 *
1869 * We allocate everything in one go so that we don't have to
1870 * worry about freeing additional memory in unregister_sysctl_table.
1871 */
1872 header = kzalloc(sizeof(struct ctl_table_header) +
1873 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
1874 if (!header)
1875 return NULL;
1876
1877 new = (struct ctl_table *) (header + 1);
1878
1879 /* Now connect the dots */
1880 prevp = &header->ctl_table;
1881 for (n = 0; n < npath; ++n, ++path) {
1882 /* Copy the procname */
1883 new->procname = path->procname;
1884 new->mode = 0555;
1885
1886 *prevp = new;
1887 prevp = &new->child;
1888
1889 new += 2;
1890 }
1891 *prevp = table;
1892 header->ctl_table_arg = table;
1893
1894 INIT_LIST_HEAD(&header->ctl_entry);
1895 header->used = 0;
1896 header->unregistering = NULL;
1897 header->root = root;
1898 sysctl_set_parent(NULL, header->ctl_table);
1899 header->count = 1;
1900#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1901 if (sysctl_check_table(namespaces, header->ctl_table)) {
1902 kfree(header);
1903 return NULL;
1904 }
1905#endif
1906 spin_lock(&sysctl_lock);
1907 header->set = lookup_header_set(root, namespaces);
1908 header->attached_by = header->ctl_table;
1909 header->attached_to = root_table;
1910 header->parent = &root_table_header;
1911 for (set = header->set; set; set = set->parent) {
1912 struct ctl_table_header *p;
1913 list_for_each_entry(p, &set->list, ctl_entry) {
1914 if (p->unregistering)
1915 continue;
1916 try_attach(p, header);
1917 }
1918 }
1919 header->parent->count++;
1920 list_add_tail(&header->ctl_entry, &header->set->list);
1921 spin_unlock(&sysctl_lock);
1922
1923 return header;
1924}
1925
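
The kernel-doc above describes how each ctl_table field is consumed. A minimal sketch of a registration built on that description, using invented names and bounds: register_sysctl_paths() builds the intermediate "kernel/example" directories from the path array, and unregister_sysctl_table() tears everything down again.

    static int example_value;
    static int ex_min, ex_max = 100;
    static struct ctl_table_header *example_header;

    static struct ctl_table example_table[] = {
            {
                    .procname     = "value",
                    .data         = &example_value,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = proc_dointvec_minmax,
                    .extra1       = &ex_min,
                    .extra2       = &ex_max,
            },
            { }     /* completely zero-filled sentinel terminates the table */
    };

    static const struct ctl_path example_path[] = {
            { .procname = "kernel" },
            { .procname = "example" },
            { }
    };

    /* init:  example_header = register_sysctl_paths(example_path, example_table); */
    /* exit:  unregister_sysctl_table(example_header);                             */
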
1926/**
1927 * register_sysctl_table_path - register a sysctl table hierarchy
1928 * @path: The path to the directory the sysctl table is in.
1929 * @table: the top-level table structure
1930 *
1931 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1932 * array. A completely 0 filled entry terminates the table.
1933 *
1934 * See __register_sysctl_paths for more details.
1935 */
1936struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1937 struct ctl_table *table)
1938{
1939 return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1940 path, table);
1941}
1942
1943/**
1944 * register_sysctl_table - register a sysctl table hierarchy
1945 * @table: the top-level table structure
1946 *
1947 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1948 * array. A completely 0 filled entry terminates the table.
1949 *
1950 * See register_sysctl_paths for more details.
1951 */
1952struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1953{
1954 static const struct ctl_path null_path[] = { {} };
1955
1956 return register_sysctl_paths(null_path, table);
1957}
1958
1959/**
1960 * unregister_sysctl_table - unregister a sysctl table hierarchy
1961 * @header: the header returned from register_sysctl_table
1962 *
1963 * Unregisters the sysctl table and all children. proc entries may not
1964 * actually be removed until they are no longer used by anyone.
1965 */
1966void unregister_sysctl_table(struct ctl_table_header * header)
1967{
1968 might_sleep();
1969
1970 if (header == NULL)
1971 return;
1972
1973 spin_lock(&sysctl_lock);
1974 start_unregistering(header);
1975 if (!--header->parent->count) {
1976 WARN_ON(1);
1977 kfree_rcu(header->parent, rcu);
1978 }
1979 if (!--header->count)
1980 kfree_rcu(header, rcu);
1981 spin_unlock(&sysctl_lock);
1982}
1983
1984int sysctl_is_seen(struct ctl_table_header *p)
1985{
1986 struct ctl_table_set *set = p->set;
1987 int res;
1988 spin_lock(&sysctl_lock);
1989 if (p->unregistering)
1990 res = 0;
1991 else if (!set->is_seen)
1992 res = 1;
1993 else
1994 res = set->is_seen(set);
1995 spin_unlock(&sysctl_lock);
1996 return res;
1997}
1998
1999void setup_sysctl_set(struct ctl_table_set *p,
2000 struct ctl_table_set *parent,
2001 int (*is_seen)(struct ctl_table_set *))
2002{
2003 INIT_LIST_HEAD(&p->list);
2004 p->parent = parent ? parent : &sysctl_table_root.default_set;
2005 p->is_seen = is_seen;
2006}
2007
2008#else /* !CONFIG_SYSCTL */
2009struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
2010{
2011 return NULL;
2012}
2013
2014struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
2015 struct ctl_table *table)
2016{
2017 return NULL;
2018}
2019
2020void unregister_sysctl_table(struct ctl_table_header * table)
2021{
2022}
2023
2024void setup_sysctl_set(struct ctl_table_set *p,
2025 struct ctl_table_set *parent,
2026 int (*is_seen)(struct ctl_table_set *))
2027{
2028}
2029
2030void sysctl_head_put(struct ctl_table_header *head)
2031{
2032}
2033
1629#endif /* CONFIG_SYSCTL */ 2034#endif /* CONFIG_SYSCTL */
1630 2035
1631/* 2036/*
@@ -2014,7 +2419,7 @@ static int proc_taint(struct ctl_table *table, int write,
2014} 2419}
2015 2420
2016#ifdef CONFIG_PRINTK 2421#ifdef CONFIG_PRINTK
2017static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, 2422static int proc_dmesg_restrict(struct ctl_table *table, int write,
2018 void __user *buffer, size_t *lenp, loff_t *ppos) 2423 void __user *buffer, size_t *lenp, loff_t *ppos)
2019{ 2424{
2020 if (write && !capable(CAP_SYS_ADMIN)) 2425 if (write && !capable(CAP_SYS_ADMIN))
@@ -2080,38 +2485,6 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2080 do_proc_dointvec_minmax_conv, &param); 2485 do_proc_dointvec_minmax_conv, &param);
2081} 2486}
2082 2487
2083static void validate_coredump_safety(void)
2084{
2085#ifdef CONFIG_COREDUMP
2086 if (suid_dumpable == SUID_DUMPABLE_SAFE &&
2087 core_pattern[0] != '/' && core_pattern[0] != '|') {
2088 printk(KERN_WARNING "Unsafe core_pattern used with "\
2089 "suid_dumpable=2. Pipe handler or fully qualified "\
2090 "core dump path required.\n");
2091 }
2092#endif
2093}
2094
2095static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
2096 void __user *buffer, size_t *lenp, loff_t *ppos)
2097{
2098 int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2099 if (!error)
2100 validate_coredump_safety();
2101 return error;
2102}
2103
2104#ifdef CONFIG_COREDUMP
2105static int proc_dostring_coredump(struct ctl_table *table, int write,
2106 void __user *buffer, size_t *lenp, loff_t *ppos)
2107{
2108 int error = proc_dostring(table, write, buffer, lenp, ppos);
2109 if (!error)
2110 validate_coredump_safety();
2111 return error;
2112}
2113#endif
2114
2115static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2488static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2116 void __user *buffer, 2489 void __user *buffer,
2117 size_t *lenp, loff_t *ppos, 2490 size_t *lenp, loff_t *ppos,
@@ -2499,7 +2872,9 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2499 } 2872 }
2500 } 2873 }
2501 2874
2502 bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); 2875 while (val_a <= val_b)
2876 set_bit(val_a++, tmp_bitmap);
2877
2503 first = 0; 2878 first = 0;
2504 proc_skip_char(&kbuf, &left, '\n'); 2879 proc_skip_char(&kbuf, &left, '\n');
2505 } 2880 }
@@ -2542,7 +2917,8 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2542 if (*ppos) 2917 if (*ppos)
2543 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); 2918 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2544 else 2919 else
2545 bitmap_copy(bitmap, tmp_bitmap, bitmap_len); 2920 memcpy(bitmap, tmp_bitmap,
2921 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2546 } 2922 }
2547 kfree(tmp_bitmap); 2923 kfree(tmp_bitmap);
2548 *lenp -= left; 2924 *lenp -= left;
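
The two replacements above swap the bitmap helpers for their open-coded equivalents: bitmap_set(tmp, a, b - a + 1) becomes a set_bit() loop over the inclusive range, and bitmap_copy() becomes a memcpy() sized in longs. A standalone userspace sketch of the same two operations over an unsigned-long array (BITS_TO_LONGS is redefined locally so it compiles outside the kernel):

    #include <stdio.h>
    #include <string.h>

    #define BITS_PER_LONG    (8 * sizeof(unsigned long))
    #define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

    static void set_range(unsigned long *map, unsigned long a, unsigned long b)
    {
            while (a <= b) {                          /* mirrors the while loop in the hunk */
                    map[a / BITS_PER_LONG] |= 1UL << (a % BITS_PER_LONG);
                    a++;
            }
    }

    int main(void)
    {
            unsigned long src[BITS_TO_LONGS(128)] = { 0 };
            unsigned long dst[BITS_TO_LONGS(128)];

            set_range(src, 3, 10);                    /* bitmap_set(src, 3, 10 - 3 + 1)      */
            memcpy(dst, src, BITS_TO_LONGS(128) * sizeof(unsigned long));   /* bitmap_copy  */
            printf("%#lx\n", dst[0]);                 /* 0x7f8: bits 3..10 set               */
            return 0;
    }
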
@@ -2620,3 +2996,6 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
2620EXPORT_SYMBOL(proc_dostring); 2996EXPORT_SYMBOL(proc_dostring);
2621EXPORT_SYMBOL(proc_doulongvec_minmax); 2997EXPORT_SYMBOL(proc_doulongvec_minmax);
2622EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2998EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
2999EXPORT_SYMBOL(register_sysctl_table);
3000EXPORT_SYMBOL(register_sysctl_paths);
3001EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 5a638445050..2ce1b308672 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, 147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ 148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ 149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
150 /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ 150 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, 151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
152 /* VM_PAGEBUF unused */ 152 /* VM_PAGEBUF unused */
153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ 153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
@@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, 214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, 215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, 216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
217 /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ 217 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, 218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, 219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, 220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
@@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1344 goto out_putname; 1344 goto out_putname;
1345 } 1345 }
1346 1346
1347 mnt = task_active_pid_ns(current)->proc_mnt; 1347 mnt = current->nsproxy->pid_ns->proc_mnt;
1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1349 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1350 if (IS_ERR(file)) 1350 if (IS_ERR(file))
diff --git a/kernel/task_work.c b/kernel/task_work.c
deleted file mode 100644
index 65bd3c92d6f..00000000000
--- a/kernel/task_work.c
+++ /dev/null
@@ -1,92 +0,0 @@
1#include <linux/spinlock.h>
2#include <linux/task_work.h>
3#include <linux/tracehook.h>
4
5static struct callback_head work_exited; /* all we need is ->next == NULL */
6
7int
8task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
9{
10 struct callback_head *head;
11
12 do {
13 head = ACCESS_ONCE(task->task_works);
14 if (unlikely(head == &work_exited))
15 return -ESRCH;
16 work->next = head;
17 } while (cmpxchg(&task->task_works, head, work) != head);
18
19 if (notify)
20 set_notify_resume(task);
21 return 0;
22}
23
24struct callback_head *
25task_work_cancel(struct task_struct *task, task_work_func_t func)
26{
27 struct callback_head **pprev = &task->task_works;
28 struct callback_head *work = NULL;
29 unsigned long flags;
30 /*
31 * If cmpxchg() fails we continue without updating pprev.
32 * Either we raced with task_work_add() which added the
33 * new entry before this work, we will find it again. Or
34 * we raced with task_work_run(), *pprev == NULL/exited.
35 */
36 raw_spin_lock_irqsave(&task->pi_lock, flags);
37 while ((work = ACCESS_ONCE(*pprev))) {
38 read_barrier_depends();
39 if (work->func != func)
40 pprev = &work->next;
41 else if (cmpxchg(pprev, work, work->next) == work)
42 break;
43 }
44 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
45
46 return work;
47}
48
49void task_work_run(void)
50{
51 struct task_struct *task = current;
52 struct callback_head *work, *head, *next;
53
54 for (;;) {
55 /*
56 * work->func() can do task_work_add(), do not set
57 * work_exited unless the list is empty.
58 */
59 do {
60 work = ACCESS_ONCE(task->task_works);
61 head = !work && (task->flags & PF_EXITING) ?
62 &work_exited : NULL;
63 } while (cmpxchg(&task->task_works, work, head) != work);
64
65 if (!work)
66 break;
67 /*
68 * Synchronize with task_work_cancel(). It can't remove
69 * the first entry == work, cmpxchg(task_works) should
70 * fail, but it can play with *work and other entries.
71 */
72 raw_spin_unlock_wait(&task->pi_lock);
73 smp_mb();
74
75 /* Reverse the list to run the works in fifo order */
76 head = NULL;
77 do {
78 next = work->next;
79 work->next = head;
80 head = work;
81 work = next;
82 } while (work);
83
84 work = head;
85 do {
86 next = work->next;
87 work->func(work);
88 work = next;
89 cond_resched();
90 } while (work);
91 }
92}
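
The deleted kernel/task_work.c keeps per-task callbacks on a lock-free singly linked list: task_work_add() pushes with cmpxchg(), and task_work_run() detaches the whole list atomically, reverses it, and invokes the callbacks in FIFO order when the task returns to user space (or exits). A condensed sketch of how a caller queues work against this API; the payload structure and callback are invented for illustration:

    struct my_payload {
            struct callback_head cb;
            int value;
    };

    static void my_callback(struct callback_head *work)
    {
            struct my_payload *p = container_of(work, struct my_payload, cb);

            /* runs in the target task's context from task_work_run() */
            kfree(p);
    }

    static int queue_cleanup(struct task_struct *task, int value)
    {
            struct my_payload *p = kmalloc(sizeof(*p), GFP_KERNEL);

            if (!p)
                    return -ENOMEM;
            p->value = value;
            p->cb.func = my_callback;                  /* what init_task_work() does     */
            if (task_work_add(task, &p->cb, true)) {   /* true: notify on return to user */
                    kfree(p);                          /* -ESRCH: task already exiting   */
                    return -ESRCH;
            }
            return 0;
    }
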
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 145bb4d3bd4..e66046456f4 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -27,7 +27,6 @@
27#include <linux/cgroup.h> 27#include <linux/cgroup.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <linux/pid_namespace.h>
31#include <net/genetlink.h> 30#include <net/genetlink.h>
32#include <linux/atomic.h> 31#include <linux/atomic.h>
33 32
@@ -175,9 +174,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
175 up_write(&listeners->sem); 174 up_write(&listeners->sem);
176} 175}
177 176
178static void fill_stats(struct user_namespace *user_ns, 177static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
179 struct pid_namespace *pid_ns,
180 struct task_struct *tsk, struct taskstats *stats)
181{ 178{
182 memset(stats, 0, sizeof(*stats)); 179 memset(stats, 0, sizeof(*stats));
183 /* 180 /*
@@ -193,7 +190,7 @@ static void fill_stats(struct user_namespace *user_ns,
193 stats->version = TASKSTATS_VERSION; 190 stats->version = TASKSTATS_VERSION;
194 stats->nvcsw = tsk->nvcsw; 191 stats->nvcsw = tsk->nvcsw;
195 stats->nivcsw = tsk->nivcsw; 192 stats->nivcsw = tsk->nivcsw;
196 bacct_add_tsk(user_ns, pid_ns, stats, tsk); 193 bacct_add_tsk(stats, tsk);
197 194
198 /* fill in extended acct fields */ 195 /* fill in extended acct fields */
199 xacct_add_tsk(stats, tsk); 196 xacct_add_tsk(stats, tsk);
@@ -210,7 +207,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
210 rcu_read_unlock(); 207 rcu_read_unlock();
211 if (!tsk) 208 if (!tsk)
212 return -ESRCH; 209 return -ESRCH;
213 fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); 210 fill_stats(tsk, stats);
214 put_task_struct(tsk); 211 put_task_struct(tsk);
215 return 0; 212 return 0;
216} 213}
@@ -294,12 +291,6 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
294 if (!cpumask_subset(mask, cpu_possible_mask)) 291 if (!cpumask_subset(mask, cpu_possible_mask))
295 return -EINVAL; 292 return -EINVAL;
296 293
297 if (current_user_ns() != &init_user_ns)
298 return -EINVAL;
299
300 if (task_active_pid_ns(current) != &init_pid_ns)
301 return -EINVAL;
302
303 if (isadd == REGISTER) { 294 if (isadd == REGISTER) {
304 for_each_cpu(cpu, mask) { 295 for_each_cpu(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), 296 s = kmalloc_node(sizeof(struct listener),
@@ -424,15 +415,16 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
424 struct nlattr *na; 415 struct nlattr *na;
425 size_t size; 416 size_t size;
426 u32 fd; 417 u32 fd;
427 struct fd f; 418 struct file *file;
419 int fput_needed;
428 420
429 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 421 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
430 if (!na) 422 if (!na)
431 return -EINVAL; 423 return -EINVAL;
432 424
433 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 425 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
434 f = fdget(fd); 426 file = fget_light(fd, &fput_needed);
435 if (!f.file) 427 if (!file)
436 return 0; 428 return 0;
437 429
438 size = nla_total_size(sizeof(struct cgroupstats)); 430 size = nla_total_size(sizeof(struct cgroupstats));
@@ -444,16 +436,10 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
444 436
445 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
446 sizeof(struct cgroupstats)); 438 sizeof(struct cgroupstats));
447 if (na == NULL) {
448 nlmsg_free(rep_skb);
449 rc = -EMSGSIZE;
450 goto err;
451 }
452
453 stats = nla_data(na); 439 stats = nla_data(na);
454 memset(stats, 0, sizeof(*stats)); 440 memset(stats, 0, sizeof(*stats));
455 441
456 rc = cgroupstats_build(stats, f.file->f_dentry); 442 rc = cgroupstats_build(stats, file->f_dentry);
457 if (rc < 0) { 443 if (rc < 0) {
458 nlmsg_free(rep_skb); 444 nlmsg_free(rep_skb);
459 goto err; 445 goto err;
@@ -462,7 +448,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
462 rc = send_reply(rep_skb, info); 448 rc = send_reply(rep_skb, info);
463 449
464err: 450err:
465 fdput(f); 451 fput_light(file, fput_needed);
466 return rc; 452 return rc;
467} 453}
468 454
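
The cgroupstats hunk above trades the struct fd/fdget() wrappers for the older fget_light()/fput_light() pair, which skip the reference-count bump when the file table is not shared. The idiom in isolation, using the same calls as the hunk with the surrounding netlink code elided:

    struct file *file;
    int fput_needed;

    file = fget_light(fd, &fput_needed);   /* no atomic ref if the fd table is unshared */
    if (!file)
            return 0;

    /* ... build the reply from file->f_dentry ... */

    fput_light(file, fput_needed);         /* drops the reference only if one was taken */
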
@@ -476,7 +462,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info)
476 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 462 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
477 if (rc < 0) 463 if (rc < 0)
478 goto out; 464 goto out;
479 rc = add_del_listener(info->snd_portid, mask, REGISTER); 465 rc = add_del_listener(info->snd_pid, mask, REGISTER);
480out: 466out:
481 free_cpumask_var(mask); 467 free_cpumask_var(mask);
482 return rc; 468 return rc;
@@ -492,7 +478,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info)
492 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 478 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
493 if (rc < 0) 479 if (rc < 0)
494 goto out; 480 goto out;
495 rc = add_del_listener(info->snd_portid, mask, DEREGISTER); 481 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
496out: 482out:
497 free_cpumask_var(mask); 483 free_cpumask_var(mask);
498 return rc; 484 return rc;
@@ -640,12 +626,11 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
640 if (rc < 0) 626 if (rc < 0)
641 return; 627 return;
642 628
643 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, 629 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
644 task_pid_nr_ns(tsk, &init_pid_ns));
645 if (!stats) 630 if (!stats)
646 goto err; 631 goto err;
647 632
648 fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); 633 fill_stats(tsk, stats);
649 634
650 /* 635 /*
651 * Doesn't matter if tsk is the leader or the last group member leaving 636 * Doesn't matter if tsk is the leader or the last group member leaving
@@ -653,8 +638,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
653 if (!is_thread_group || !group_dead) 638 if (!is_thread_group || !group_dead)
654 goto send; 639 goto send;
655 640
656 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, 641 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
657 task_tgid_nr_ns(tsk, &init_pid_ns));
658 if (!stats) 642 if (!stats)
659 goto err; 643 goto err;
660 644
diff --git a/kernel/time.c b/kernel/time.c
index d226c6a3fd2..d7760621452 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -27,10 +27,10 @@
27 * with nanosecond accuracy 27 * with nanosecond accuracy
28 */ 28 */
29 29
30#include <linux/export.h> 30#include <linux/module.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/timekeeper_internal.h> 33#include <linux/clocksource.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
@@ -163,6 +163,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
163 return error; 163 return error;
164 164
165 if (tz) { 165 if (tz) {
166 /* SMP safe, global irq locking makes it work. */
166 sys_tz = *tz; 167 sys_tz = *tz;
167 update_vsyscall_tz(); 168 update_vsyscall_tz();
168 if (firsttime) { 169 if (firsttime) {
@@ -172,7 +173,12 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
172 } 173 }
173 } 174 }
174 if (tv) 175 if (tv)
176 {
177 /* SMP safe, again the code in arch/foo/time.c should
178 * globally block out interrupts when it runs.
179 */
175 return do_settimeofday(tv); 180 return do_settimeofday(tv);
181 }
176 return 0; 182 return 0;
177} 183}
178 184
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8601f0db126..f06a8a36564 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -1,63 +1,6 @@
1# 1#
2# Timer subsystem related configuration options 2# Timer subsystem related configuration options
3# 3#
4
5# Options selectable by arch Kconfig
6
7# Watchdog function for clocksources to detect instabilities
8config CLOCKSOURCE_WATCHDOG
9 bool
10
11# Architecture has extra clocksource data
12config ARCH_CLOCKSOURCE_DATA
13 bool
14
15# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL
17 bool
18
19# Timekeeping vsyscall support
20config GENERIC_TIME_VSYSCALL_OLD
21 bool
22
23# ktime_t scalar 64bit nsec representation
24config KTIME_SCALAR
25 bool
26
27# Old style timekeeping
28config ARCH_USES_GETTIMEOFFSET
29 bool
30
31# The generic clock events infrastructure
32config GENERIC_CLOCKEVENTS
33 bool
34
35# Migration helper. Builds, but does not invoke
36config GENERIC_CLOCKEVENTS_BUILD
37 bool
38 default y
39 depends on GENERIC_CLOCKEVENTS
40
41# Clockevents broadcasting infrastructure
42config GENERIC_CLOCKEVENTS_BROADCAST
43 bool
44 depends on GENERIC_CLOCKEVENTS
45
46# Automatically adjust the min. reprogramming time for
47# clock event device
48config GENERIC_CLOCKEVENTS_MIN_ADJUST
49 bool
50
51# Generic update of CMOS clock
52config GENERIC_CMOS_UPDATE
53 bool
54
55if GENERIC_CLOCKEVENTS
56menu "Timers subsystem"
57
58# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
59# only related to the tick functionality. Oneshot clockevent devices
60# are supported independ of this.
61config TICK_ONESHOT 4config TICK_ONESHOT
62 bool 5 bool
63 6
@@ -79,5 +22,8 @@ config HIGH_RES_TIMERS
79 hardware is not capable then this option only increases 22 hardware is not capable then this option only increases
80 the size of the kernel image. 23 the size of the kernel image.
81 24
82endmenu 25config GENERIC_CLOCKEVENTS_BUILD
83endif 26 bool
27 default y
28 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
29
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ff7d9d2ab50..cae2ad7491b 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,5 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 2obj-y += timeconv.o posix-clock.o #alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f11d83b1294..8b70c76910a 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -37,6 +37,7 @@
37static struct alarm_base { 37static struct alarm_base {
38 spinlock_t lock; 38 spinlock_t lock;
39 struct timerqueue_head timerqueue; 39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
40 ktime_t (*gettime)(void); 41 ktime_t (*gettime)(void);
41 clockid_t base_clockid; 42 clockid_t base_clockid;
42} alarm_bases[ALARM_NUMTYPE]; 43} alarm_bases[ALARM_NUMTYPE];
@@ -45,8 +46,6 @@ static struct alarm_base {
45static ktime_t freezer_delta; 46static ktime_t freezer_delta;
46static DEFINE_SPINLOCK(freezer_delta_lock); 47static DEFINE_SPINLOCK(freezer_delta_lock);
47 48
48static struct wakeup_source *ws;
49
50#ifdef CONFIG_RTC_CLASS 49#ifdef CONFIG_RTC_CLASS
51/* rtc timer and device for setting alarm wakeups at suspend */ 50/* rtc timer and device for setting alarm wakeups at suspend */
52static struct rtc_timer rtctimer; 51static struct rtc_timer rtctimer;
@@ -54,112 +53,108 @@ static struct rtc_device *rtcdev;
54static DEFINE_SPINLOCK(rtcdev_lock); 53static DEFINE_SPINLOCK(rtcdev_lock);
55 54
56/** 55/**
57 * alarmtimer_get_rtcdev - Return selected rtcdevice 56 * has_wakealarm - check rtc device has wakealarm ability
57 * @dev: current device
58 * @name_ptr: name to be returned
58 * 59 *
59 * This function returns the rtc device to use for wakealarms. 60 * This helper function checks to see if the rtc device can wake
60 * If one has not already been chosen, it checks to see if a 61 * from suspend.
61 * functional rtc device is available.
62 */ 62 */
63struct rtc_device *alarmtimer_get_rtcdev(void) 63static int has_wakealarm(struct device *dev, void *name_ptr)
64{ 64{
65 unsigned long flags; 65 struct rtc_device *candidate = to_rtc_device(dev);
66 struct rtc_device *ret;
67 66
68 spin_lock_irqsave(&rtcdev_lock, flags); 67 if (!candidate->ops->set_alarm)
69 ret = rtcdev; 68 return 0;
70 spin_unlock_irqrestore(&rtcdev_lock, flags); 69 if (!device_may_wakeup(candidate->dev.parent))
70 return 0;
71 71
72 return ret; 72 *(const char **)name_ptr = dev_name(dev);
73 return 1;
73} 74}
74 75
75 76/**
76static int alarmtimer_rtc_add_device(struct device *dev, 77 * alarmtimer_get_rtcdev - Return selected rtcdevice
77 struct class_interface *class_intf) 78 *
79 * This function returns the rtc device to use for wakealarms.
80 * If one has not already been chosen, it checks to see if a
81 * functional rtc device is available.
82 */
83static struct rtc_device *alarmtimer_get_rtcdev(void)
78{ 84{
85 struct device *dev;
86 char *str;
79 unsigned long flags; 87 unsigned long flags;
80 struct rtc_device *rtc = to_rtc_device(dev); 88 struct rtc_device *ret;
81
82 if (rtcdev)
83 return -EBUSY;
84
85 if (!rtc->ops->set_alarm)
86 return -1;
87 if (!device_may_wakeup(rtc->dev.parent))
88 return -1;
89 89
90 spin_lock_irqsave(&rtcdev_lock, flags); 90 spin_lock_irqsave(&rtcdev_lock, flags);
91 if (!rtcdev) { 91 if (!rtcdev) {
92 rtcdev = rtc; 92 /* Find an rtc device and init the rtc_timer */
93 /* hold a reference so it doesn't go away */ 93 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
94 get_device(dev); 94 /* If we have a device then str is valid. See has_wakealarm() */
95 if (dev) {
96 rtcdev = rtc_class_open(str);
97 /*
98 * Drop the reference we got in class_find_device,
99 * rtc_open takes its own.
100 */
101 put_device(dev);
102 rtc_timer_init(&rtctimer, NULL, NULL);
103 }
95 } 104 }
105 ret = rtcdev;
96 spin_unlock_irqrestore(&rtcdev_lock, flags); 106 spin_unlock_irqrestore(&rtcdev_lock, flags);
97 return 0;
98}
99
100static inline void alarmtimer_rtc_timer_init(void)
101{
102 rtc_timer_init(&rtctimer, NULL, NULL);
103}
104
105static struct class_interface alarmtimer_rtc_interface = {
106 .add_dev = &alarmtimer_rtc_add_device,
107};
108 107
109static int alarmtimer_rtc_interface_setup(void) 108 return ret;
110{
111 alarmtimer_rtc_interface.class = rtc_class;
112 return class_interface_register(&alarmtimer_rtc_interface);
113}
114static void alarmtimer_rtc_interface_remove(void)
115{
116 class_interface_unregister(&alarmtimer_rtc_interface);
117} 109}
118#else 110#else
119struct rtc_device *alarmtimer_get_rtcdev(void) 111#define alarmtimer_get_rtcdev() (0)
120{ 112#define rtcdev (0)
121 return NULL;
122}
123#define rtcdev (NULL)
124static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
125static inline void alarmtimer_rtc_interface_remove(void) { }
126static inline void alarmtimer_rtc_timer_init(void) { }
127#endif 113#endif
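
With this hunk the RTC used for wake alarms is again discovered lazily: class_find_device() walks rtc_class with has_wakealarm() as the match callback, and the first device that implements set_alarm and may wake the system is opened by name. The selection step on its own, using the same helpers as the hunk (a sketch, not a complete driver):

    static struct rtc_device *pick_wakealarm_rtc(void)
    {
            struct device *dev;
            char *name = NULL;

            /* has_wakealarm() stores dev_name(dev) through the void * cookie */
            dev = class_find_device(rtc_class, NULL, &name, has_wakealarm);
            if (!dev)
                    return NULL;

            put_device(dev);                /* rtc_class_open() takes its own reference */
            return rtc_class_open(name);
    }
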
128 114
115
129/** 116/**
130 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue 117 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
131 * @base: pointer to the base where the timer is being run 118 * @base: pointer to the base where the timer is being run
132 * @alarm: pointer to alarm being enqueued. 119 * @alarm: pointer to alarm being enqueued.
133 * 120 *
 134 * Adds alarm to an alarm_base timerqueue 121 * Adds alarm to an alarm_base timerqueue and if necessary sets
122 * an hrtimer to run.
135 * 123 *
136 * Must hold base->lock when calling. 124 * Must hold base->lock when calling.
137 */ 125 */
138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) 126static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
139{ 127{
140 if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
141 timerqueue_del(&base->timerqueue, &alarm->node);
142
143 timerqueue_add(&base->timerqueue, &alarm->node); 128 timerqueue_add(&base->timerqueue, &alarm->node);
144 alarm->state |= ALARMTIMER_STATE_ENQUEUED; 129 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
130 hrtimer_try_to_cancel(&base->timer);
131 hrtimer_start(&base->timer, alarm->node.expires,
132 HRTIMER_MODE_ABS);
133 }
145} 134}
146 135
147/** 136/**
148 * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue 137 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
149 * @base: pointer to the base where the timer is running 138 * @base: pointer to the base where the timer is running
150 * @alarm: pointer to alarm being removed 139 * @alarm: pointer to alarm being removed
151 * 140 *
 152 * Removes alarm from an alarm_base timerqueue 141 * Removes alarm from an alarm_base timerqueue and if necessary sets
142 * a new timer to run.
153 * 143 *
154 * Must hold base->lock when calling. 144 * Must hold base->lock when calling.
155 */ 145 */
156static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) 146static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
157{ 147{
158 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) 148 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
159 return;
160 149
161 timerqueue_del(&base->timerqueue, &alarm->node); 150 timerqueue_del(&base->timerqueue, &alarm->node);
162 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; 151 if (next == &alarm->node) {
152 hrtimer_try_to_cancel(&base->timer);
153 next = timerqueue_getnext(&base->timerqueue);
154 if (!next)
155 return;
156 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
157 }
163} 158}
164 159
165 160
@@ -174,23 +169,39 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
174 */ 169 */
175static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) 170static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
176{ 171{
177 struct alarm *alarm = container_of(timer, struct alarm, timer); 172 struct alarm_base *base = container_of(timer, struct alarm_base, timer);
178 struct alarm_base *base = &alarm_bases[alarm->type]; 173 struct timerqueue_node *next;
179 unsigned long flags; 174 unsigned long flags;
175 ktime_t now;
180 int ret = HRTIMER_NORESTART; 176 int ret = HRTIMER_NORESTART;
181 int restart = ALARMTIMER_NORESTART;
182 177
183 spin_lock_irqsave(&base->lock, flags); 178 spin_lock_irqsave(&base->lock, flags);
184 alarmtimer_dequeue(base, alarm); 179 now = base->gettime();
185 spin_unlock_irqrestore(&base->lock, flags); 180 while ((next = timerqueue_getnext(&base->timerqueue))) {
181 struct alarm *alarm;
182 ktime_t expired = next->expires;
186 183
187 if (alarm->function) 184 if (expired.tv64 > now.tv64)
188 restart = alarm->function(alarm, base->gettime()); 185 break;
189 186
190 spin_lock_irqsave(&base->lock, flags); 187 alarm = container_of(next, struct alarm, node);
191 if (restart != ALARMTIMER_NORESTART) { 188
192 hrtimer_set_expires(&alarm->timer, alarm->node.expires); 189 timerqueue_del(&base->timerqueue, &alarm->node);
193 alarmtimer_enqueue(base, alarm); 190 alarm->enabled = 0;
191 /* Re-add periodic timers */
192 if (alarm->period.tv64) {
193 alarm->node.expires = ktime_add(expired, alarm->period);
194 timerqueue_add(&base->timerqueue, &alarm->node);
195 alarm->enabled = 1;
196 }
197 spin_unlock_irqrestore(&base->lock, flags);
198 if (alarm->function)
199 alarm->function(alarm);
200 spin_lock_irqsave(&base->lock, flags);
201 }
202
203 if (next) {
204 hrtimer_set_expires(&base->timer, next->expires);
194 ret = HRTIMER_RESTART; 205 ret = HRTIMER_RESTART;
195 } 206 }
196 spin_unlock_irqrestore(&base->lock, flags); 207 spin_unlock_irqrestore(&base->lock, flags);
@@ -217,14 +228,13 @@ static int alarmtimer_suspend(struct device *dev)
217 unsigned long flags; 228 unsigned long flags;
218 struct rtc_device *rtc; 229 struct rtc_device *rtc;
219 int i; 230 int i;
220 int ret;
221 231
222 spin_lock_irqsave(&freezer_delta_lock, flags); 232 spin_lock_irqsave(&freezer_delta_lock, flags);
223 min = freezer_delta; 233 min = freezer_delta;
224 freezer_delta = ktime_set(0, 0); 234 freezer_delta = ktime_set(0, 0);
225 spin_unlock_irqrestore(&freezer_delta_lock, flags); 235 spin_unlock_irqrestore(&freezer_delta_lock, flags);
226 236
227 rtc = alarmtimer_get_rtcdev(); 237 rtc = rtcdev;
228 /* If we have no rtcdev, just return */ 238 /* If we have no rtcdev, just return */
229 if (!rtc) 239 if (!rtc)
230 return 0; 240 return 0;
@@ -247,10 +257,8 @@ static int alarmtimer_suspend(struct device *dev)
247 if (min.tv64 == 0) 257 if (min.tv64 == 0)
248 return 0; 258 return 0;
249 259
250 if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { 260 /* XXX - Should we enforce a minimum sleep time? */
251 __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); 261 WARN_ON(min.tv64 < NSEC_PER_SEC);
252 return -EBUSY;
253 }
254 262
255 /* Setup an rtc timer to fire that far in the future */ 263 /* Setup an rtc timer to fire that far in the future */
256 rtc_timer_cancel(rtc, &rtctimer); 264 rtc_timer_cancel(rtc, &rtctimer);
@@ -258,11 +266,9 @@ static int alarmtimer_suspend(struct device *dev)
258 now = rtc_tm_to_ktime(tm); 266 now = rtc_tm_to_ktime(tm);
259 now = ktime_add(now, min); 267 now = ktime_add(now, min);
260 268
261 /* Set alarm, if in the past reject suspend briefly to handle */ 269 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
262 ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); 270
263 if (ret < 0) 271 return 0;
264 __pm_wakeup_event(ws, MSEC_PER_SEC);
265 return ret;
266} 272}
267#else 273#else
268static int alarmtimer_suspend(struct device *dev) 274static int alarmtimer_suspend(struct device *dev)
@@ -293,110 +299,53 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
293 * @function: callback that is run when the alarm fires 299 * @function: callback that is run when the alarm fires
294 */ 300 */
295void alarm_init(struct alarm *alarm, enum alarmtimer_type type, 301void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
296 enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) 302 void (*function)(struct alarm *))
297{ 303{
298 timerqueue_init(&alarm->node); 304 timerqueue_init(&alarm->node);
299 hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, 305 alarm->period = ktime_set(0, 0);
300 HRTIMER_MODE_ABS);
301 alarm->timer.function = alarmtimer_fired;
302 alarm->function = function; 306 alarm->function = function;
303 alarm->type = type; 307 alarm->type = type;
304 alarm->state = ALARMTIMER_STATE_INACTIVE; 308 alarm->enabled = 0;
305} 309}
306 310
307/** 311/**
308 * alarm_start - Sets an alarm to fire 312 * alarm_start - Sets an alarm to fire
309 * @alarm: ptr to alarm to set 313 * @alarm: ptr to alarm to set
310 * @start: time to run the alarm 314 * @start: time to run the alarm
315 * @period: period at which the alarm will recur
311 */ 316 */
312int alarm_start(struct alarm *alarm, ktime_t start) 317void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
313{ 318{
314 struct alarm_base *base = &alarm_bases[alarm->type]; 319 struct alarm_base *base = &alarm_bases[alarm->type];
315 unsigned long flags; 320 unsigned long flags;
316 int ret;
317 321
318 spin_lock_irqsave(&base->lock, flags); 322 spin_lock_irqsave(&base->lock, flags);
323 if (alarm->enabled)
324 alarmtimer_remove(base, alarm);
319 alarm->node.expires = start; 325 alarm->node.expires = start;
326 alarm->period = period;
320 alarmtimer_enqueue(base, alarm); 327 alarmtimer_enqueue(base, alarm);
321 ret = hrtimer_start(&alarm->timer, alarm->node.expires, 328 alarm->enabled = 1;
322 HRTIMER_MODE_ABS);
323 spin_unlock_irqrestore(&base->lock, flags); 329 spin_unlock_irqrestore(&base->lock, flags);
324 return ret;
325} 330}
326 331
327/** 332/**
328 * alarm_try_to_cancel - Tries to cancel an alarm timer 333 * alarm_cancel - Tries to cancel an alarm timer
329 * @alarm: ptr to alarm to be canceled 334 * @alarm: ptr to alarm to be canceled
330 *
331 * Returns 1 if the timer was canceled, 0 if it was not running,
332 * and -1 if the callback was running
333 */ 335 */
334int alarm_try_to_cancel(struct alarm *alarm) 336void alarm_cancel(struct alarm *alarm)
335{ 337{
336 struct alarm_base *base = &alarm_bases[alarm->type]; 338 struct alarm_base *base = &alarm_bases[alarm->type];
337 unsigned long flags; 339 unsigned long flags;
338 int ret;
339 340
340 spin_lock_irqsave(&base->lock, flags); 341 spin_lock_irqsave(&base->lock, flags);
341 ret = hrtimer_try_to_cancel(&alarm->timer); 342 if (alarm->enabled)
342 if (ret >= 0) 343 alarmtimer_remove(base, alarm);
343 alarmtimer_dequeue(base, alarm); 344 alarm->enabled = 0;
344 spin_unlock_irqrestore(&base->lock, flags); 345 spin_unlock_irqrestore(&base->lock, flags);
345 return ret;
346}
347
348
349/**
350 * alarm_cancel - Spins trying to cancel an alarm timer until it is done
351 * @alarm: ptr to alarm to be canceled
352 *
353 * Returns 1 if the timer was canceled, 0 if it was not active.
354 */
355int alarm_cancel(struct alarm *alarm)
356{
357 for (;;) {
358 int ret = alarm_try_to_cancel(alarm);
359 if (ret >= 0)
360 return ret;
361 cpu_relax();
362 }
363}
364
365
366u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
367{
368 u64 overrun = 1;
369 ktime_t delta;
370
371 delta = ktime_sub(now, alarm->node.expires);
372
373 if (delta.tv64 < 0)
374 return 0;
375
376 if (unlikely(delta.tv64 >= interval.tv64)) {
377 s64 incr = ktime_to_ns(interval);
378
379 overrun = ktime_divns(delta, incr);
380
381 alarm->node.expires = ktime_add_ns(alarm->node.expires,
382 incr*overrun);
383
384 if (alarm->node.expires.tv64 > now.tv64)
385 return overrun;
386 /*
387 * This (and the ktime_add() below) is the
388 * correction for exact:
389 */
390 overrun++;
391 }
392
393 alarm->node.expires = ktime_add(alarm->node.expires, interval);
394 return overrun;
395} 346}
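
After this revert the alarm API carries the period inside struct alarm again: alarm_init() takes a plain void callback, alarm_start() takes an absolute expiry plus a recurrence period, and alarm_cancel() simply dequeues the entry (periodic re-arming happens in alarmtimer_fired()). A short usage sketch against that interface; the callback and the one-second/500 ms timing are illustrative:

    static struct alarm my_alarm;

    static void my_alarm_fn(struct alarm *alarm)
    {
            pr_info("alarm fired\n");       /* re-armed by alarmtimer_fired() while period != 0 */
    }

    /* arm: first expiry one second from now, then every 500 ms */
    alarm_init(&my_alarm, ALARM_REALTIME, my_alarm_fn);
    alarm_start(&my_alarm,
                ktime_add(ktime_get_real(), ktime_set(1, 0)),
                ktime_set(0, 500 * NSEC_PER_MSEC));

    /* disarm */
    alarm_cancel(&my_alarm);
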
396 347
397 348
398
399
400/** 349/**
401 * clock2alarm - helper that converts from clockid to alarmtypes 350 * clock2alarm - helper that converts from clockid to alarmtypes
402 * @clockid: clockid. 351 * @clockid: clockid.
@@ -416,21 +365,12 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
416 * 365 *
417 * Posix timer callback for expired alarm timers. 366 * Posix timer callback for expired alarm timers.
418 */ 367 */
419static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, 368static void alarm_handle_timer(struct alarm *alarm)
420 ktime_t now)
421{ 369{
422 struct k_itimer *ptr = container_of(alarm, struct k_itimer, 370 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
423 it.alarm.alarmtimer); 371 it.alarmtimer);
424 if (posix_timer_event(ptr, 0) != 0) 372 if (posix_timer_event(ptr, 0) != 0)
425 ptr->it_overrun++; 373 ptr->it_overrun++;
426
427 /* Re-add periodic timers */
428 if (ptr->it.alarm.interval.tv64) {
429 ptr->it_overrun += alarm_forward(alarm, now,
430 ptr->it.alarm.interval);
431 return ALARMTIMER_RESTART;
432 }
433 return ALARMTIMER_NORESTART;
434} 374}
435 375
436/** 376/**
@@ -487,7 +427,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
487 427
488 type = clock2alarm(new_timer->it_clock); 428 type = clock2alarm(new_timer->it_clock);
489 base = &alarm_bases[type]; 429 base = &alarm_bases[type];
490 alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); 430 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
491 return 0; 431 return 0;
492} 432}
493 433
@@ -504,9 +444,9 @@ static void alarm_timer_get(struct k_itimer *timr,
504 memset(cur_setting, 0, sizeof(struct itimerspec)); 444 memset(cur_setting, 0, sizeof(struct itimerspec));
505 445
506 cur_setting->it_interval = 446 cur_setting->it_interval =
507 ktime_to_timespec(timr->it.alarm.interval); 447 ktime_to_timespec(timr->it.alarmtimer.period);
508 cur_setting->it_value = 448 cur_setting->it_value =
509 ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); 449 ktime_to_timespec(timr->it.alarmtimer.node.expires);
510 return; 450 return;
511} 451}
512 452
@@ -521,9 +461,7 @@ static int alarm_timer_del(struct k_itimer *timr)
521 if (!rtcdev) 461 if (!rtcdev)
522 return -ENOTSUPP; 462 return -ENOTSUPP;
523 463
524 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) 464 alarm_cancel(&timr->it.alarmtimer);
525 return TIMER_RETRY;
526
527 return 0; 465 return 0;
528} 466}
529 467
@@ -543,17 +481,25 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
543 if (!rtcdev) 481 if (!rtcdev)
544 return -ENOTSUPP; 482 return -ENOTSUPP;
545 483
484 /*
485 * XXX HACK! Currently we can DOS a system if the interval
486 * period on alarmtimers is too small. Cap the interval here
487 * to 100us and solve this properly in a future patch! -jstultz
488 */
489 if ((new_setting->it_interval.tv_sec == 0) &&
490 (new_setting->it_interval.tv_nsec < 100000))
491 new_setting->it_interval.tv_nsec = 100000;
492
546 if (old_setting) 493 if (old_setting)
547 alarm_timer_get(timr, old_setting); 494 alarm_timer_get(timr, old_setting);
548 495
549 /* If the timer was already set, cancel it */ 496 /* If the timer was already set, cancel it */
550 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) 497 alarm_cancel(&timr->it.alarmtimer);
551 return TIMER_RETRY;
552 498
553 /* start the timer */ 499 /* start the timer */
554 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); 500 alarm_start(&timr->it.alarmtimer,
555 alarm_start(&timr->it.alarm.alarmtimer, 501 timespec_to_ktime(new_setting->it_value),
556 timespec_to_ktime(new_setting->it_value)); 502 timespec_to_ktime(new_setting->it_interval));
557 return 0; 503 return 0;
558} 504}
559 505
@@ -563,15 +509,13 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
563 * 509 *
564 * Wakes up the task that set the alarmtimer 510 * Wakes up the task that set the alarmtimer
565 */ 511 */
566static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, 512static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
567 ktime_t now)
568{ 513{
569 struct task_struct *task = (struct task_struct *)alarm->data; 514 struct task_struct *task = (struct task_struct *)alarm->data;
570 515
571 alarm->data = NULL; 516 alarm->data = NULL;
572 if (task) 517 if (task)
573 wake_up_process(task); 518 wake_up_process(task);
574 return ALARMTIMER_NORESTART;
575} 519}
576 520
577/** 521/**
@@ -586,7 +530,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
586 alarm->data = (void *)current; 530 alarm->data = (void *)current;
587 do { 531 do {
588 set_current_state(TASK_INTERRUPTIBLE); 532 set_current_state(TASK_INTERRUPTIBLE);
589 alarm_start(alarm, absexp); 533 alarm_start(alarm, absexp, ktime_set(0, 0));
590 if (likely(alarm->data)) 534 if (likely(alarm->data))
591 schedule(); 535 schedule();
592 536
@@ -747,7 +691,6 @@ static struct platform_driver alarmtimer_driver = {
747 */ 691 */
748static int __init alarmtimer_init(void) 692static int __init alarmtimer_init(void)
749{ 693{
750 struct platform_device *pdev;
751 int error = 0; 694 int error = 0;
752 int i; 695 int i;
753 struct k_clock alarm_clock = { 696 struct k_clock alarm_clock = {
@@ -760,8 +703,6 @@ static int __init alarmtimer_init(void)
760 .nsleep = alarm_timer_nsleep, 703 .nsleep = alarm_timer_nsleep,
761 }; 704 };
762 705
763 alarmtimer_rtc_timer_init();
764
765 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); 706 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
766 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); 707 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
767 708
@@ -773,28 +714,15 @@ static int __init alarmtimer_init(void)
773 for (i = 0; i < ALARM_NUMTYPE; i++) { 714 for (i = 0; i < ALARM_NUMTYPE; i++) {
774 timerqueue_init_head(&alarm_bases[i].timerqueue); 715 timerqueue_init_head(&alarm_bases[i].timerqueue);
775 spin_lock_init(&alarm_bases[i].lock); 716 spin_lock_init(&alarm_bases[i].lock);
717 hrtimer_init(&alarm_bases[i].timer,
718 alarm_bases[i].base_clockid,
719 HRTIMER_MODE_ABS);
720 alarm_bases[i].timer.function = alarmtimer_fired;
776 } 721 }
777
778 error = alarmtimer_rtc_interface_setup();
779 if (error)
780 return error;
781
782 error = platform_driver_register(&alarmtimer_driver); 722 error = platform_driver_register(&alarmtimer_driver);
783 if (error) 723 platform_device_register_simple("alarmtimer", -1, NULL, 0);
784 goto out_if;
785
786 pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
787 if (IS_ERR(pdev)) {
788 error = PTR_ERR(pdev);
789 goto out_drv;
790 }
791 ws = wakeup_source_register("alarmtimer");
792 return 0;
793 724
794out_drv:
795 platform_driver_unregister(&alarmtimer_driver);
796out_if:
797 alarmtimer_rtc_interface_remove();
798 return error; 725 return error;
799} 726}
800device_initcall(alarmtimer_init); 727device_initcall(alarmtimer_init);
728
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 30b6de0d977..e4c699dfa4e 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,6 +17,7 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h>
20 21
21#include "tick-internal.h" 22#include "tick-internal.h"
22 23
@@ -93,143 +94,42 @@ void clockevents_shutdown(struct clock_event_device *dev)
93 dev->next_event.tv64 = KTIME_MAX; 94 dev->next_event.tv64 = KTIME_MAX;
94} 95}
95 96
96#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
97
98/* Limit min_delta to a jiffie */
99#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
100
101/**
102 * clockevents_increase_min_delta - raise minimum delta of a clock event device
103 * @dev: device to increase the minimum delta
104 *
105 * Returns 0 on success, -ETIME when the minimum delta reached the limit.
106 */
107static int clockevents_increase_min_delta(struct clock_event_device *dev)
108{
109 /* Nothing to do if we already reached the limit */
110 if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
111 printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
112 dev->next_event.tv64 = KTIME_MAX;
113 return -ETIME;
114 }
115
116 if (dev->min_delta_ns < 5000)
117 dev->min_delta_ns = 5000;
118 else
119 dev->min_delta_ns += dev->min_delta_ns >> 1;
120
121 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
122 dev->min_delta_ns = MIN_DELTA_LIMIT;
123
124 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
125 dev->name ? dev->name : "?",
126 (unsigned long long) dev->min_delta_ns);
127 return 0;
128}
129
130/**
131 * clockevents_program_min_delta - Set clock event device to the minimum delay.
132 * @dev: device to program
133 *
134 * Returns 0 on success, -ETIME when the retry loop failed.
135 */
136static int clockevents_program_min_delta(struct clock_event_device *dev)
137{
138 unsigned long long clc;
139 int64_t delta;
140 int i;
141
142 for (i = 0;;) {
143 delta = dev->min_delta_ns;
144 dev->next_event = ktime_add_ns(ktime_get(), delta);
145
146 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
147 return 0;
148
149 dev->retries++;
150 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
151 if (dev->set_next_event((unsigned long) clc, dev) == 0)
152 return 0;
153
154 if (++i > 2) {
155 /*
156 * We tried 3 times to program the device with the
157 * given min_delta_ns. Try to increase the minimum
158 * delta, if that fails as well get out of here.
159 */
160 if (clockevents_increase_min_delta(dev))
161 return -ETIME;
162 i = 0;
163 }
164 }
165}
166
167#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
168
169/**
170 * clockevents_program_min_delta - Set clock event device to the minimum delay.
171 * @dev: device to program
172 *
173 * Returns 0 on success, -ETIME when the retry loop failed.
174 */
175static int clockevents_program_min_delta(struct clock_event_device *dev)
176{
177 unsigned long long clc;
178 int64_t delta;
179
180 delta = dev->min_delta_ns;
181 dev->next_event = ktime_add_ns(ktime_get(), delta);
182
183 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
184 return 0;
185
186 dev->retries++;
187 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
188 return dev->set_next_event((unsigned long) clc, dev);
189}
190
191#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
192
193/** 97/**
194 * clockevents_program_event - Reprogram the clock event device. 98 * clockevents_program_event - Reprogram the clock event device.
195 * @dev: device to program
196 * @expires: absolute expiry time (monotonic clock) 99 * @expires: absolute expiry time (monotonic clock)
197 * @force: program minimum delay if expires can not be set
198 * 100 *
199 * Returns 0 on success, -ETIME when the event is in the past. 101 * Returns 0 on success, -ETIME when the event is in the past.
200 */ 102 */
201int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, 103int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
202 bool force) 104 ktime_t now)
203{ 105{
204 unsigned long long clc; 106 unsigned long long clc;
205 int64_t delta; 107 int64_t delta;
206 int rc;
207 108
208 if (unlikely(expires.tv64 < 0)) { 109 if (unlikely(expires.tv64 < 0)) {
209 WARN_ON_ONCE(1); 110 WARN_ON_ONCE(1);
210 return -ETIME; 111 return -ETIME;
211 } 112 }
212 113
114 delta = ktime_to_ns(ktime_sub(expires, now));
115
116 if (delta <= 0)
117 return -ETIME;
118
213 dev->next_event = expires; 119 dev->next_event = expires;
214 120
215 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) 121 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
216 return 0; 122 return 0;
217 123
218 /* Shortcut for clockevent devices that can deal with ktime. */ 124 if (delta > dev->max_delta_ns)
219 if (dev->features & CLOCK_EVT_FEAT_KTIME) 125 delta = dev->max_delta_ns;
220 return dev->set_next_ktime(expires, dev); 126 if (delta < dev->min_delta_ns)
221 127 delta = dev->min_delta_ns;
222 delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
223 if (delta <= 0)
224 return force ? clockevents_program_min_delta(dev) : -ETIME;
225
226 delta = min(delta, (int64_t) dev->max_delta_ns);
227 delta = max(delta, (int64_t) dev->min_delta_ns);
228 128
229 clc = ((unsigned long long) delta * dev->mult) >> dev->shift; 129 clc = delta * dev->mult;
230 rc = dev->set_next_event((unsigned long) clc, dev); 130 clc >>= dev->shift;
231 131
232 return (rc && force) ? clockevents_program_min_delta(dev) : rc; 132 return dev->set_next_event((unsigned long) clc, dev);
233} 133}
234 134
235/** 135/**
@@ -297,7 +197,8 @@ void clockevents_register_device(struct clock_event_device *dev)
297} 197}
298EXPORT_SYMBOL_GPL(clockevents_register_device); 198EXPORT_SYMBOL_GPL(clockevents_register_device);
299 199
300void clockevents_config(struct clock_event_device *dev, u32 freq) 200static void clockevents_config(struct clock_event_device *dev,
201 u32 freq)
301{ 202{
302 u64 sec; 203 u64 sec;
303 204
@@ -357,7 +258,7 @@ int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
357 if (dev->mode != CLOCK_EVT_MODE_ONESHOT) 258 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
358 return 0; 259 return 0;
359 260
360 return clockevents_program_event(dev, dev->next_event, false); 261 return clockevents_program_event(dev, dev->next_event, ktime_get());
361} 262}
362 263
363/* 264/*
@@ -397,30 +298,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
397 local_irq_restore(flags); 298 local_irq_restore(flags);
398} 299}
399 300
400/**
401 * clockevents_suspend - suspend clock devices
402 */
403void clockevents_suspend(void)
404{
405 struct clock_event_device *dev;
406
407 list_for_each_entry_reverse(dev, &clockevent_devices, list)
408 if (dev->suspend)
409 dev->suspend(dev);
410}
411
412/**
413 * clockevents_resume - resume clock devices
414 */
415void clockevents_resume(void)
416{
417 struct clock_event_device *dev;
418
419 list_for_each_entry(dev, &clockevent_devices, list)
420 if (dev->resume)
421 dev->resume(dev);
422}
423
424#ifdef CONFIG_GENERIC_CLOCKEVENTS 301#ifdef CONFIG_GENERIC_CLOCKEVENTS
425/** 302/**
426 * clockevents_notify - notification about relevant events 303 * clockevents_notify - notification about relevant events
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c9583382141..8f77da18fef 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 */ 24 */
25 25
26#include <linux/device.h>
27#include <linux/clocksource.h> 26#include <linux/clocksource.h>
27#include <linux/sysdev.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -186,7 +186,6 @@ static struct timer_list watchdog_timer;
186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); 186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
187static DEFINE_SPINLOCK(watchdog_lock); 187static DEFINE_SPINLOCK(watchdog_lock);
188static int watchdog_running; 188static int watchdog_running;
189static atomic_t watchdog_reset_pending;
190 189
191static int clocksource_watchdog_kthread(void *data); 190static int clocksource_watchdog_kthread(void *data);
192static void __clocksource_change_rating(struct clocksource *cs, int rating); 191static void __clocksource_change_rating(struct clocksource *cs, int rating);
@@ -248,14 +247,12 @@ static void clocksource_watchdog(unsigned long data)
248 struct clocksource *cs; 247 struct clocksource *cs;
249 cycle_t csnow, wdnow; 248 cycle_t csnow, wdnow;
250 int64_t wd_nsec, cs_nsec; 249 int64_t wd_nsec, cs_nsec;
251 int next_cpu, reset_pending; 250 int next_cpu;
252 251
253 spin_lock(&watchdog_lock); 252 spin_lock(&watchdog_lock);
254 if (!watchdog_running) 253 if (!watchdog_running)
255 goto out; 254 goto out;
256 255
257 reset_pending = atomic_read(&watchdog_reset_pending);
258
259 list_for_each_entry(cs, &watchdog_list, wd_list) { 256 list_for_each_entry(cs, &watchdog_list, wd_list) {
260 257
261 /* Clocksource already marked unstable? */ 258 /* Clocksource already marked unstable? */
@@ -271,8 +268,7 @@ static void clocksource_watchdog(unsigned long data)
271 local_irq_enable(); 268 local_irq_enable();
272 269
273 /* Clocksource initialized ? */ 270 /* Clocksource initialized ? */
274 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || 271 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
275 atomic_read(&watchdog_reset_pending)) {
276 cs->flags |= CLOCK_SOURCE_WATCHDOG; 272 cs->flags |= CLOCK_SOURCE_WATCHDOG;
277 cs->wd_last = wdnow; 273 cs->wd_last = wdnow;
278 cs->cs_last = csnow; 274 cs->cs_last = csnow;
@@ -287,11 +283,8 @@ static void clocksource_watchdog(unsigned long data)
287 cs->cs_last = csnow; 283 cs->cs_last = csnow;
288 cs->wd_last = wdnow; 284 cs->wd_last = wdnow;
289 285
290 if (atomic_read(&watchdog_reset_pending))
291 continue;
292
293 /* Check the deviation from the watchdog clocksource. */ 286 /* Check the deviation from the watchdog clocksource. */
294 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { 287 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
295 clocksource_unstable(cs, cs_nsec - wd_nsec); 288 clocksource_unstable(cs, cs_nsec - wd_nsec);
296 continue; 289 continue;
297 } 290 }
@@ -310,13 +303,6 @@ static void clocksource_watchdog(unsigned long data)
310 } 303 }
311 304
312 /* 305 /*
313 * We only clear the watchdog_reset_pending, when we did a
314 * full cycle through all clocksources.
315 */
316 if (reset_pending)
317 atomic_dec(&watchdog_reset_pending);
318
319 /*
320 * Cycle through CPUs to check if the CPUs stay synchronized 306 * Cycle through CPUs to check if the CPUs stay synchronized
321 * to each other. 307 * to each other.
322 */ 308 */
@@ -358,7 +344,23 @@ static inline void clocksource_reset_watchdog(void)
358 344
359static void clocksource_resume_watchdog(void) 345static void clocksource_resume_watchdog(void)
360{ 346{
361 atomic_inc(&watchdog_reset_pending); 347 unsigned long flags;
348
349 /*
350 * We use trylock here to avoid a potential dead lock when
351 * kgdb calls this code after the kernel has been stopped with
352 * watchdog_lock held. When watchdog_lock is held we just
353 * return and accept, that the watchdog might trigger and mark
354 * the monitored clock source (usually TSC) unstable.
355 *
356 * This does not affect the other caller clocksource_resume()
357 * because at this point the kernel is UP, interrupts are
358 * disabled and nothing can hold watchdog_lock.
359 */
360 if (!spin_trylock_irqsave(&watchdog_lock, flags))
361 return;
362 clocksource_reset_watchdog();
363 spin_unlock_irqrestore(&watchdog_lock, flags);
362} 364}
363 365
364static void clocksource_enqueue_watchdog(struct clocksource *cs) 366static void clocksource_enqueue_watchdog(struct clocksource *cs)
@@ -500,7 +502,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
500{ 502{
501 u64 ret; 503 u64 ret;
502 /* 504 /*
503 * We won't try to correct for more than 11% adjustments (110,000 ppm), 505 * We won't try to correct for more then 11% adjustments (110,000 ppm),
504 */ 506 */
505 ret = (u64)cs->mult * 11; 507 ret = (u64)cs->mult * 11;
506 do_div(ret,100); 508 do_div(ret,100);
@@ -647,7 +649,7 @@ static void clocksource_enqueue(struct clocksource *cs)
647 649
648/** 650/**
 649 * __clocksource_updatefreq_scale - Used to update clocksource with new freq 651 * __clocksource_updatefreq_scale - Used to update clocksource with new freq
650 * @cs: clocksource to be registered 652 * @t: clocksource to be registered
651 * @scale: Scale factor multiplied against freq to get clocksource hz 653 * @scale: Scale factor multiplied against freq to get clocksource hz
652 * @freq: clocksource frequency (cycles per second) divided by scale 654 * @freq: clocksource frequency (cycles per second) divided by scale
653 * 655 *
@@ -699,7 +701,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
699 701
700/** 702/**
701 * __clocksource_register_scale - Used to install new clocksources 703 * __clocksource_register_scale - Used to install new clocksources
702 * @cs: clocksource to be registered 704 * @t: clocksource to be registered
703 * @scale: Scale factor multiplied against freq to get clocksource hz 705 * @scale: Scale factor multiplied against freq to get clocksource hz
704 * @freq: clocksource frequency (cycles per second) divided by scale 706 * @freq: clocksource frequency (cycles per second) divided by scale
705 * 707 *
@@ -727,7 +729,7 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale);
727 729
728/** 730/**
729 * clocksource_register - Used to install new clocksources 731 * clocksource_register - Used to install new clocksources
730 * @cs: clocksource to be registered 732 * @t: clocksource to be registered
731 * 733 *
732 * Returns -EBUSY if registration fails, zero otherwise. 734 * Returns -EBUSY if registration fails, zero otherwise.
733 */ 735 */
@@ -761,8 +763,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
761 763
762/** 764/**
763 * clocksource_change_rating - Change the rating of a registered clocksource 765 * clocksource_change_rating - Change the rating of a registered clocksource
764 * @cs: clocksource to be changed
765 * @rating: new rating
766 */ 766 */
767void clocksource_change_rating(struct clocksource *cs, int rating) 767void clocksource_change_rating(struct clocksource *cs, int rating)
768{ 768{
@@ -774,7 +774,6 @@ EXPORT_SYMBOL(clocksource_change_rating);
774 774
775/** 775/**
776 * clocksource_unregister - remove a registered clocksource 776 * clocksource_unregister - remove a registered clocksource
777 * @cs: clocksource to be unregistered
778 */ 777 */
779void clocksource_unregister(struct clocksource *cs) 778void clocksource_unregister(struct clocksource *cs)
780{ 779{
@@ -790,14 +789,13 @@ EXPORT_SYMBOL(clocksource_unregister);
790/** 789/**
791 * sysfs_show_current_clocksources - sysfs interface for current clocksource 790 * sysfs_show_current_clocksources - sysfs interface for current clocksource
792 * @dev: unused 791 * @dev: unused
793 * @attr: unused
794 * @buf: char buffer to be filled with clocksource list 792 * @buf: char buffer to be filled with clocksource list
795 * 793 *
796 * Provides sysfs interface for listing current clocksource. 794 * Provides sysfs interface for listing current clocksource.
797 */ 795 */
798static ssize_t 796static ssize_t
799sysfs_show_current_clocksources(struct device *dev, 797sysfs_show_current_clocksources(struct sys_device *dev,
800 struct device_attribute *attr, char *buf) 798 struct sysdev_attribute *attr, char *buf)
801{ 799{
802 ssize_t count = 0; 800 ssize_t count = 0;
803 801
@@ -811,15 +809,14 @@ sysfs_show_current_clocksources(struct device *dev,
811/** 809/**
812 * sysfs_override_clocksource - interface for manually overriding clocksource 810 * sysfs_override_clocksource - interface for manually overriding clocksource
813 * @dev: unused 811 * @dev: unused
814 * @attr: unused
815 * @buf: name of override clocksource 812 * @buf: name of override clocksource
816 * @count: length of buffer 813 * @count: length of buffer
817 * 814 *
818 * Takes input from sysfs interface for manually overriding the default 815 * Takes input from sysfs interface for manually overriding the default
819 * clocksource selection. 816 * clocksource selection.
820 */ 817 */
821static ssize_t sysfs_override_clocksource(struct device *dev, 818static ssize_t sysfs_override_clocksource(struct sys_device *dev,
822 struct device_attribute *attr, 819 struct sysdev_attribute *attr,
823 const char *buf, size_t count) 820 const char *buf, size_t count)
824{ 821{
825 size_t ret = count; 822 size_t ret = count;
@@ -847,14 +844,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
847/** 844/**
848 * sysfs_show_available_clocksources - sysfs interface for listing clocksource 845 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
849 * @dev: unused 846 * @dev: unused
850 * @attr: unused
851 * @buf: char buffer to be filled with clocksource list 847 * @buf: char buffer to be filled with clocksource list
852 * 848 *
853 * Provides sysfs interface for listing registered clocksources 849 * Provides sysfs interface for listing registered clocksources
854 */ 850 */
855static ssize_t 851static ssize_t
856sysfs_show_available_clocksources(struct device *dev, 852sysfs_show_available_clocksources(struct sys_device *dev,
857 struct device_attribute *attr, 853 struct sysdev_attribute *attr,
858 char *buf) 854 char *buf)
859{ 855{
860 struct clocksource *src; 856 struct clocksource *src;
@@ -883,36 +879,35 @@ sysfs_show_available_clocksources(struct device *dev,
883/* 879/*
884 * Sysfs setup bits: 880 * Sysfs setup bits:
885 */ 881 */
886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 882static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
887 sysfs_override_clocksource); 883 sysfs_override_clocksource);
888 884
889static DEVICE_ATTR(available_clocksource, 0444, 885static SYSDEV_ATTR(available_clocksource, 0444,
890 sysfs_show_available_clocksources, NULL); 886 sysfs_show_available_clocksources, NULL);
891 887
892static struct bus_type clocksource_subsys = { 888static struct sysdev_class clocksource_sysclass = {
893 .name = "clocksource", 889 .name = "clocksource",
894 .dev_name = "clocksource",
895}; 890};
896 891
897static struct device device_clocksource = { 892static struct sys_device device_clocksource = {
898 .id = 0, 893 .id = 0,
899 .bus = &clocksource_subsys, 894 .cls = &clocksource_sysclass,
900}; 895};
901 896
902static int __init init_clocksource_sysfs(void) 897static int __init init_clocksource_sysfs(void)
903{ 898{
904 int error = subsys_system_register(&clocksource_subsys, NULL); 899 int error = sysdev_class_register(&clocksource_sysclass);
905 900
906 if (!error) 901 if (!error)
907 error = device_register(&device_clocksource); 902 error = sysdev_register(&device_clocksource);
908 if (!error) 903 if (!error)
909 error = device_create_file( 904 error = sysdev_create_file(
910 &device_clocksource, 905 &device_clocksource,
911 &dev_attr_current_clocksource); 906 &attr_current_clocksource);
912 if (!error) 907 if (!error)
913 error = device_create_file( 908 error = sysdev_create_file(
914 &device_clocksource, 909 &device_clocksource,
915 &dev_attr_available_clocksource); 910 &attr_available_clocksource);
916 return error; 911 return error;
917} 912}
918 913
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 7a925ba456f..a470154e040 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
37 * requested HZ value. It is also not recommended 37 * requested HZ value. It is also not recommended
38 * for "tick-less" systems. 38 * for "tick-less" systems.
39 */ 39 */
40#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) 40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
41 41
42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier 42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
43 * conversion, the .shift value could be zero. However 43 * conversion, the .shift value could be zero. However
@@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs)
58 return (cycle_t) jiffies; 58 return (cycle_t) jiffies;
59} 59}
60 60
61static struct clocksource clocksource_jiffies = { 61struct clocksource clocksource_jiffies = {
62 .name = "jiffies", 62 .name = "jiffies",
63 .rating = 1, /* lowest valid rating*/ 63 .rating = 1, /* lowest valid rating*/
64 .read = jiffies_read, 64 .read = jiffies_read,
@@ -67,8 +67,6 @@ static struct clocksource clocksource_jiffies = {
67 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
68}; 68};
69 69
70__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
71
72#if (BITS_PER_LONG < 64) 70#if (BITS_PER_LONG < 64)
73u64 get_jiffies_64(void) 71u64 get_jiffies_64(void)
74{ 72{
@@ -76,9 +74,9 @@ u64 get_jiffies_64(void)
76 u64 ret; 74 u64 ret;
77 75
78 do { 76 do {
79 seq = read_seqbegin(&jiffies_lock); 77 seq = read_seqbegin(&xtime_lock);
80 ret = jiffies_64; 78 ret = jiffies_64;
81 } while (read_seqretry(&jiffies_lock, seq)); 79 } while (read_seqretry(&xtime_lock, seq));
82 return ret; 80 return ret;
83} 81}
84EXPORT_SYMBOL(get_jiffies_64); 82EXPORT_SYMBOL(get_jiffies_64);
@@ -97,33 +95,3 @@ struct clocksource * __init __weak clocksource_default_clock(void)
97{ 95{
98 return &clocksource_jiffies; 96 return &clocksource_jiffies;
99} 97}
100
101struct clocksource refined_jiffies;
102
103int register_refined_jiffies(long cycles_per_second)
104{
105 u64 nsec_per_tick, shift_hz;
106 long cycles_per_tick;
107
108
109
110 refined_jiffies = clocksource_jiffies;
111 refined_jiffies.name = "refined-jiffies";
112 refined_jiffies.rating++;
113
114 /* Calc cycles per tick */
115 cycles_per_tick = (cycles_per_second + HZ/2)/HZ;
116 /* shift_hz stores hz<<8 for extra accuracy */
117 shift_hz = (u64)cycles_per_second << 8;
118 shift_hz += cycles_per_tick/2;
119 do_div(shift_hz, cycles_per_tick);
120 /* Calculate nsec_per_tick using shift_hz */
121 nsec_per_tick = (u64)NSEC_PER_SEC << 8;
122 nsec_per_tick += (u32)shift_hz/2;
123 do_div(nsec_per_tick, (u32)shift_hz);
124
125 refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
126
127 clocksource_register(&refined_jiffies);
128 return 0;
129}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 24174b4d669..f6117a4c7cb 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -22,18 +22,17 @@
22 * NTP timekeeping variables: 22 * NTP timekeeping variables:
23 */ 23 */
24 24
25DEFINE_SPINLOCK(ntp_lock);
26
27
28/* USER_HZ period (usecs): */ 25/* USER_HZ period (usecs): */
29unsigned long tick_usec = TICK_USEC; 26unsigned long tick_usec = TICK_USEC;
30 27
31/* SHIFTED_HZ period (nsecs): */ 28/* ACTHZ period (nsecs): */
32unsigned long tick_nsec; 29unsigned long tick_nsec;
33 30
34static u64 tick_length; 31u64 tick_length;
35static u64 tick_length_base; 32static u64 tick_length_base;
36 33
34static struct hrtimer leap_timer;
35
37#define MAX_TICKADJ 500LL /* usecs */ 36#define MAX_TICKADJ 500LL /* usecs */
38#define MAX_TICKADJ_SCALED \ 37#define MAX_TICKADJ_SCALED \
39 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 38 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -50,7 +49,7 @@ static u64 tick_length_base;
50static int time_state = TIME_OK; 49static int time_state = TIME_OK;
51 50
52/* clock status bits: */ 51/* clock status bits: */
53static int time_status = STA_UNSYNC; 52int time_status = STA_UNSYNC;
54 53
55/* TAI offset (secs): */ 54/* TAI offset (secs): */
56static long time_tai; 55static long time_tai;
@@ -134,7 +133,7 @@ static inline void pps_reset_freq_interval(void)
134/** 133/**
135 * pps_clear - Clears the PPS state variables 134 * pps_clear - Clears the PPS state variables
136 * 135 *
137 * Must be called while holding a write on the ntp_lock 136 * Must be called while holding a write on the xtime_lock
138 */ 137 */
139static inline void pps_clear(void) 138static inline void pps_clear(void)
140{ 139{
@@ -150,7 +149,7 @@ static inline void pps_clear(void)
150 * the last PPS signal. When it reaches 0, indicate that PPS signal is 149 * the last PPS signal. When it reaches 0, indicate that PPS signal is
151 * missing. 150 * missing.
152 * 151 *
153 * Must be called while holding a write on the ntp_lock 152 * Must be called while holding a write on the xtime_lock
154 */ 153 */
155static inline void pps_dec_valid(void) 154static inline void pps_dec_valid(void)
156{ 155{
@@ -234,17 +233,6 @@ static inline void pps_fill_timex(struct timex *txc)
234 233
235#endif /* CONFIG_NTP_PPS */ 234#endif /* CONFIG_NTP_PPS */
236 235
237
238/**
239 * ntp_synced - Returns 1 if the NTP status is not UNSYNC
240 *
241 */
242static inline int ntp_synced(void)
243{
244 return !(time_status & STA_UNSYNC);
245}
246
247
248/* 236/*
249 * NTP methods: 237 * NTP methods:
250 */ 238 */
@@ -287,7 +275,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
287 275
288 time_status |= STA_MODE; 276 time_status |= STA_MODE;
289 277
290 return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); 278 return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
291} 279}
292 280
293static void ntp_update_offset(long offset) 281static void ntp_update_offset(long offset)
@@ -342,13 +330,11 @@ static void ntp_update_offset(long offset)
342 330
343/** 331/**
344 * ntp_clear - Clears the NTP state variables 332 * ntp_clear - Clears the NTP state variables
333 *
334 * Must be called while holding a write on the xtime_lock
345 */ 335 */
346void ntp_clear(void) 336void ntp_clear(void)
347{ 337{
348 unsigned long flags;
349
350 spin_lock_irqsave(&ntp_lock, flags);
351
352 time_adjust = 0; /* stop active adjtime() */ 338 time_adjust = 0; /* stop active adjtime() */
353 time_status |= STA_UNSYNC; 339 time_status |= STA_UNSYNC;
354 time_maxerror = NTP_PHASE_LIMIT; 340 time_maxerror = NTP_PHASE_LIMIT;
@@ -361,85 +347,63 @@ void ntp_clear(void)
361 347
362 /* Clear PPS state variables */ 348 /* Clear PPS state variables */
363 pps_clear(); 349 pps_clear();
364 spin_unlock_irqrestore(&ntp_lock, flags);
365
366}
367
368
369u64 ntp_tick_length(void)
370{
371 unsigned long flags;
372 s64 ret;
373
374 spin_lock_irqsave(&ntp_lock, flags);
375 ret = tick_length;
376 spin_unlock_irqrestore(&ntp_lock, flags);
377 return ret;
378} 350}
379 351
380
381/* 352/*
382 * this routine handles the overflow of the microsecond field 353 * Leap second processing. If in leap-insert state at the end of the
383 * 354 * day, the system clock is set back one second; if in leap-delete
384 * The tricky bits of code to handle the accurate clock support 355 * state, the system clock is set ahead one second.
385 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
386 * They were originally developed for SUN and DEC kernels.
387 * All the kudos should go to Dave for this stuff.
388 *
389 * Also handles leap second processing, and returns leap offset
390 */ 356 */
391int second_overflow(unsigned long secs) 357static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
392{ 358{
393 s64 delta; 359 enum hrtimer_restart res = HRTIMER_NORESTART;
394 int leap = 0;
395 unsigned long flags;
396 360
397 spin_lock_irqsave(&ntp_lock, flags); 361 write_seqlock(&xtime_lock);
398 362
399 /*
400 * Leap second processing. If in leap-insert state at the end of the
401 * day, the system clock is set back one second; if in leap-delete
402 * state, the system clock is set ahead one second.
403 */
404 switch (time_state) { 363 switch (time_state) {
405 case TIME_OK: 364 case TIME_OK:
406 if (time_status & STA_INS)
407 time_state = TIME_INS;
408 else if (time_status & STA_DEL)
409 time_state = TIME_DEL;
410 break; 365 break;
411 case TIME_INS: 366 case TIME_INS:
412 if (!(time_status & STA_INS)) 367 timekeeping_leap_insert(-1);
413 time_state = TIME_OK; 368 time_state = TIME_OOP;
414 else if (secs % 86400 == 0) { 369 printk(KERN_NOTICE
415 leap = -1; 370 "Clock: inserting leap second 23:59:60 UTC\n");
416 time_state = TIME_OOP; 371 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
417 time_tai++; 372 res = HRTIMER_RESTART;
418 printk(KERN_NOTICE
419 "Clock: inserting leap second 23:59:60 UTC\n");
420 }
421 break; 373 break;
422 case TIME_DEL: 374 case TIME_DEL:
423 if (!(time_status & STA_DEL)) 375 timekeeping_leap_insert(1);
424 time_state = TIME_OK; 376 time_tai--;
425 else if ((secs + 1) % 86400 == 0) { 377 time_state = TIME_WAIT;
426 leap = 1; 378 printk(KERN_NOTICE
427 time_tai--; 379 "Clock: deleting leap second 23:59:59 UTC\n");
428 time_state = TIME_WAIT;
429 printk(KERN_NOTICE
430 "Clock: deleting leap second 23:59:59 UTC\n");
431 }
432 break; 380 break;
433 case TIME_OOP: 381 case TIME_OOP:
382 time_tai++;
434 time_state = TIME_WAIT; 383 time_state = TIME_WAIT;
435 break; 384 /* fall through */
436
437 case TIME_WAIT: 385 case TIME_WAIT:
438 if (!(time_status & (STA_INS | STA_DEL))) 386 if (!(time_status & (STA_INS | STA_DEL)))
439 time_state = TIME_OK; 387 time_state = TIME_OK;
440 break; 388 break;
441 } 389 }
442 390
391 write_sequnlock(&xtime_lock);
392
393 return res;
394}
395
396/*
397 * this routine handles the overflow of the microsecond field
398 *
399 * The tricky bits of code to handle the accurate clock support
400 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
401 * They were originally developed for SUN and DEC kernels.
402 * All the kudos should go to Dave for this stuff.
403 */
404void second_overflow(void)
405{
406 s64 delta;
443 407
444 /* Bump the maxerror field */ 408 /* Bump the maxerror field */
445 time_maxerror += MAXFREQ / NSEC_PER_USEC; 409 time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -459,32 +423,30 @@ int second_overflow(unsigned long secs)
459 pps_dec_valid(); 423 pps_dec_valid();
460 424
461 if (!time_adjust) 425 if (!time_adjust)
462 goto out; 426 return;
463 427
464 if (time_adjust > MAX_TICKADJ) { 428 if (time_adjust > MAX_TICKADJ) {
465 time_adjust -= MAX_TICKADJ; 429 time_adjust -= MAX_TICKADJ;
466 tick_length += MAX_TICKADJ_SCALED; 430 tick_length += MAX_TICKADJ_SCALED;
467 goto out; 431 return;
468 } 432 }
469 433
470 if (time_adjust < -MAX_TICKADJ) { 434 if (time_adjust < -MAX_TICKADJ) {
471 time_adjust += MAX_TICKADJ; 435 time_adjust += MAX_TICKADJ;
472 tick_length -= MAX_TICKADJ_SCALED; 436 tick_length -= MAX_TICKADJ_SCALED;
473 goto out; 437 return;
474 } 438 }
475 439
476 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) 440 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
477 << NTP_SCALE_SHIFT; 441 << NTP_SCALE_SHIFT;
478 time_adjust = 0; 442 time_adjust = 0;
479
480out:
481 spin_unlock_irqrestore(&ntp_lock, flags);
482
483 return leap;
484} 443}
485 444
486#ifdef CONFIG_GENERIC_CMOS_UPDATE 445#ifdef CONFIG_GENERIC_CMOS_UPDATE
487 446
447/* Disable the cmos update - used by virtualization and embedded */
448int no_sync_cmos_clock __read_mostly;
449
488static void sync_cmos_clock(struct work_struct *work); 450static void sync_cmos_clock(struct work_struct *work);
489 451
490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 452static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -531,13 +493,35 @@ static void sync_cmos_clock(struct work_struct *work)
531 493
532static void notify_cmos_timer(void) 494static void notify_cmos_timer(void)
533{ 495{
534 schedule_delayed_work(&sync_cmos_work, 0); 496 if (!no_sync_cmos_clock)
497 schedule_delayed_work(&sync_cmos_work, 0);
535} 498}
536 499
537#else 500#else
538static inline void notify_cmos_timer(void) { } 501static inline void notify_cmos_timer(void) { }
539#endif 502#endif
540 503
504/*
505 * Start the leap seconds timer:
506 */
507static inline void ntp_start_leap_timer(struct timespec *ts)
508{
509 long now = ts->tv_sec;
510
511 if (time_status & STA_INS) {
512 time_state = TIME_INS;
513 now += 86400 - now % 86400;
514 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
515
516 return;
517 }
518
519 if (time_status & STA_DEL) {
520 time_state = TIME_DEL;
521 now += 86400 - (now + 1) % 86400;
522 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
523 }
524}
541 525
542/* 526/*
543 * Propagate a new txc->status value into the NTP state: 527 * Propagate a new txc->status value into the NTP state:
@@ -561,10 +545,26 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
561 /* only set allowed bits */ 545 /* only set allowed bits */
562 time_status &= STA_RONLY; 546 time_status &= STA_RONLY;
563 time_status |= txc->status & ~STA_RONLY; 547 time_status |= txc->status & ~STA_RONLY;
564}
565 548
549 switch (time_state) {
550 case TIME_OK:
551 ntp_start_leap_timer(ts);
552 break;
553 case TIME_INS:
554 case TIME_DEL:
555 time_state = TIME_OK;
556 ntp_start_leap_timer(ts);
557 case TIME_WAIT:
558 if (!(time_status & (STA_INS | STA_DEL)))
559 time_state = TIME_OK;
560 break;
561 case TIME_OOP:
562 hrtimer_restart(&leap_timer);
563 break;
564 }
565}
566/* 566/*
567 * Called with ntp_lock held, so we can access and modify 567 * Called with the xtime lock held, so we can access and modify
568 * all the global NTP state: 568 * all the global NTP state:
569 */ 569 */
570static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) 570static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
@@ -643,6 +643,9 @@ int do_adjtimex(struct timex *txc)
643 (txc->tick < 900000/USER_HZ || 643 (txc->tick < 900000/USER_HZ ||
644 txc->tick > 1100000/USER_HZ)) 644 txc->tick > 1100000/USER_HZ))
645 return -EINVAL; 645 return -EINVAL;
646
647 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
648 hrtimer_cancel(&leap_timer);
646 } 649 }
647 650
648 if (txc->modes & ADJ_SETOFFSET) { 651 if (txc->modes & ADJ_SETOFFSET) {
@@ -660,7 +663,7 @@ int do_adjtimex(struct timex *txc)
660 663
661 getnstimeofday(&ts); 664 getnstimeofday(&ts);
662 665
663 spin_lock_irq(&ntp_lock); 666 write_seqlock_irq(&xtime_lock);
664 667
665 if (txc->modes & ADJ_ADJTIME) { 668 if (txc->modes & ADJ_ADJTIME) {
666 long save_adjust = time_adjust; 669 long save_adjust = time_adjust;
@@ -702,7 +705,7 @@ int do_adjtimex(struct timex *txc)
702 /* fill PPS status fields */ 705 /* fill PPS status fields */
703 pps_fill_timex(txc); 706 pps_fill_timex(txc);
704 707
705 spin_unlock_irq(&ntp_lock); 708 write_sequnlock_irq(&xtime_lock);
706 709
707 txc->time.tv_sec = ts.tv_sec; 710 txc->time.tv_sec = ts.tv_sec;
708 txc->time.tv_usec = ts.tv_nsec; 711 txc->time.tv_usec = ts.tv_nsec;
@@ -900,7 +903,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
900 903
901 pts_norm = pps_normalize_ts(*phase_ts); 904 pts_norm = pps_normalize_ts(*phase_ts);
902 905
903 spin_lock_irqsave(&ntp_lock, flags); 906 write_seqlock_irqsave(&xtime_lock, flags);
904 907
905 /* clear the error bits, they will be set again if needed */ 908 /* clear the error bits, they will be set again if needed */
906 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 909 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -913,7 +916,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
913 * just start the frequency interval */ 916 * just start the frequency interval */
914 if (unlikely(pps_fbase.tv_sec == 0)) { 917 if (unlikely(pps_fbase.tv_sec == 0)) {
915 pps_fbase = *raw_ts; 918 pps_fbase = *raw_ts;
916 spin_unlock_irqrestore(&ntp_lock, flags); 919 write_sequnlock_irqrestore(&xtime_lock, flags);
917 return; 920 return;
918 } 921 }
919 922
@@ -928,7 +931,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
928 time_status |= STA_PPSJITTER; 931 time_status |= STA_PPSJITTER;
929 /* restart the frequency calibration interval */ 932 /* restart the frequency calibration interval */
930 pps_fbase = *raw_ts; 933 pps_fbase = *raw_ts;
931 spin_unlock_irqrestore(&ntp_lock, flags); 934 write_sequnlock_irqrestore(&xtime_lock, flags);
932 pr_err("hardpps: PPSJITTER: bad pulse\n"); 935 pr_err("hardpps: PPSJITTER: bad pulse\n");
933 return; 936 return;
934 } 937 }
@@ -945,7 +948,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
945 948
946 hardpps_update_phase(pts_norm.nsec); 949 hardpps_update_phase(pts_norm.nsec);
947 950
948 spin_unlock_irqrestore(&ntp_lock, flags); 951 write_sequnlock_irqrestore(&xtime_lock, flags);
949} 952}
950EXPORT_SYMBOL(hardpps); 953EXPORT_SYMBOL(hardpps);
951 954
@@ -964,4 +967,6 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
964void __init ntp_init(void) 967void __init ntp_init(void)
965{ 968{
966 ntp_clear(); 969 ntp_clear();
970 hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
971 leap_timer.function = ntp_leap_second;
967} 972}
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index ce033c7aa2e..c340ca658f3 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -18,7 +18,6 @@
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/export.h>
22#include <linux/file.h> 21#include <linux/file.h>
23#include <linux/posix-clock.h> 22#include <linux/posix-clock.h>
24#include <linux/slab.h> 23#include <linux/slab.h>
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f113755695e..7a90d021b79 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -194,7 +194,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
194 for (next = dev->next_event; ;) { 194 for (next = dev->next_event; ;) {
195 next = ktime_add(next, tick_period); 195 next = ktime_add(next, tick_period);
196 196
197 if (!clockevents_program_event(dev, next, false)) 197 if (!clockevents_program_event(dev, next, ktime_get()))
198 return; 198 return;
199 tick_do_periodic_broadcast(); 199 tick_do_periodic_broadcast();
200 } 200 }
@@ -346,8 +346,7 @@ int tick_resume_broadcast(void)
346 tick_get_broadcast_mask()); 346 tick_get_broadcast_mask());
347 break; 347 break;
348 case TICKDEV_MODE_ONESHOT: 348 case TICKDEV_MODE_ONESHOT:
349 if (!cpumask_empty(tick_get_broadcast_mask())) 349 broadcast = tick_resume_broadcast_oneshot(bc);
350 broadcast = tick_resume_broadcast_oneshot(bc);
351 break; 350 break;
352 } 351 }
353 } 352 }
@@ -374,10 +373,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
374{ 373{
375 struct clock_event_device *bc = tick_broadcast_device.evtdev; 374 struct clock_event_device *bc = tick_broadcast_device.evtdev;
376 375
377 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) 376 return tick_dev_program_event(bc, expires, force);
378 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
379
380 return clockevents_program_event(bc, expires, force);
381} 377}
382 378
383int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 379int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -535,6 +531,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
535 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 531 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
536 532
537 bc->event_handler = tick_handle_oneshot_broadcast; 533 bc->event_handler = tick_handle_oneshot_broadcast;
534 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
538 535
539 /* Take the do_timer update */ 536 /* Take the do_timer update */
540 tick_do_timer_cpu = cpu; 537 tick_do_timer_cpu = cpu;
@@ -552,7 +549,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
552 to_cpumask(tmpmask)); 549 to_cpumask(tmpmask));
553 550
554 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { 551 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
555 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
556 tick_broadcast_init_next_event(to_cpumask(tmpmask), 552 tick_broadcast_init_next_event(to_cpumask(tmpmask),
557 tick_next_period); 553 tick_next_period);
558 tick_broadcast_set_event(tick_next_period, 1); 554 tick_broadcast_set_event(tick_next_period, 1);
@@ -584,7 +580,6 @@ void tick_broadcast_switch_to_oneshot(void)
584 bc = tick_broadcast_device.evtdev; 580 bc = tick_broadcast_device.evtdev;
585 if (bc) 581 if (bc)
586 tick_broadcast_setup_oneshot(bc); 582 tick_broadcast_setup_oneshot(bc);
587
588 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 583 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
589} 584}
590 585
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b1600a6973f..119528de823 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -63,13 +63,13 @@ int tick_is_oneshot_available(void)
63static void tick_periodic(int cpu) 63static void tick_periodic(int cpu)
64{ 64{
65 if (tick_do_timer_cpu == cpu) { 65 if (tick_do_timer_cpu == cpu) {
66 write_seqlock(&jiffies_lock); 66 write_seqlock(&xtime_lock);
67 67
68 /* Keep track of the next tick event */ 68 /* Keep track of the next tick event */
69 tick_next_period = ktime_add(tick_next_period, tick_period); 69 tick_next_period = ktime_add(tick_next_period, tick_period);
70 70
71 do_timer(1); 71 do_timer(1);
72 write_sequnlock(&jiffies_lock); 72 write_sequnlock(&xtime_lock);
73 } 73 }
74 74
75 update_process_times(user_mode(get_irq_regs())); 75 update_process_times(user_mode(get_irq_regs()));
@@ -94,7 +94,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
94 */ 94 */
95 next = ktime_add(dev->next_event, tick_period); 95 next = ktime_add(dev->next_event, tick_period);
96 for (;;) { 96 for (;;) {
97 if (!clockevents_program_event(dev, next, false)) 97 if (!clockevents_program_event(dev, next, ktime_get()))
98 return; 98 return;
99 /* 99 /*
100 * Have to be careful here. If we're in oneshot mode, 100 * Have to be careful here. If we're in oneshot mode,
@@ -130,14 +130,14 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
130 ktime_t next; 130 ktime_t next;
131 131
132 do { 132 do {
133 seq = read_seqbegin(&jiffies_lock); 133 seq = read_seqbegin(&xtime_lock);
134 next = tick_next_period; 134 next = tick_next_period;
135 } while (read_seqretry(&jiffies_lock, seq)); 135 } while (read_seqretry(&xtime_lock, seq));
136 136
137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
138 138
139 for (;;) { 139 for (;;) {
140 if (!clockevents_program_event(dev, next, false)) 140 if (!clockevents_program_event(dev, next, ktime_get()))
141 return; 141 return;
142 next = ktime_add(next, tick_period); 142 next = ktime_add(next, tick_period);
143 } 143 }
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index cf3e59ed6dc..1009b06d6f8 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -26,6 +26,8 @@ extern void clockevents_shutdown(struct clock_event_device *dev);
26extern void tick_setup_oneshot(struct clock_event_device *newdev, 26extern void tick_setup_oneshot(struct clock_event_device *newdev,
27 void (*handler)(struct clock_event_device *), 27 void (*handler)(struct clock_event_device *),
28 ktime_t nextevt); 28 ktime_t nextevt);
29extern int tick_dev_program_event(struct clock_event_device *dev,
30 ktime_t expires, int force);
29extern int tick_program_event(ktime_t expires, int force); 31extern int tick_program_event(ktime_t expires, int force);
30extern void tick_oneshot_notify(void); 32extern void tick_oneshot_notify(void);
31extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 33extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
@@ -141,3 +143,4 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
141#endif 143#endif
142 144
143extern void do_timer(unsigned long ticks); 145extern void do_timer(unsigned long ticks);
146extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 824109060a3..2d04411a5f0 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -21,6 +21,74 @@
21 21
22#include "tick-internal.h" 22#include "tick-internal.h"
23 23
24/* Limit min_delta to a jiffie */
25#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
26
27static int tick_increase_min_delta(struct clock_event_device *dev)
28{
29 /* Nothing to do if we already reached the limit */
30 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
31 return -ETIME;
32
33 if (dev->min_delta_ns < 5000)
34 dev->min_delta_ns = 5000;
35 else
36 dev->min_delta_ns += dev->min_delta_ns >> 1;
37
38 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
39 dev->min_delta_ns = MIN_DELTA_LIMIT;
40
41 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
42 dev->name ? dev->name : "?",
43 (unsigned long long) dev->min_delta_ns);
44 return 0;
45}
46
47/**
48 * tick_program_event internal worker function
49 */
50int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
51 int force)
52{
53 ktime_t now = ktime_get();
54 int i;
55
56 for (i = 0;;) {
57 int ret = clockevents_program_event(dev, expires, now);
58
59 if (!ret || !force)
60 return ret;
61
62 dev->retries++;
63 /*
64 * We tried 3 times to program the device with the given
65 * min_delta_ns. If that's not working then we increase it
66 * and emit a warning.
67 */
68 if (++i > 2) {
69 /* Increase the min. delta and try again */
70 if (tick_increase_min_delta(dev)) {
71 /*
72 * Get out of the loop if min_delta_ns
73 * hit the limit already. That's
74 * better than staying here forever.
75 *
76 * We clear next_event so we have a
77 * chance that the box survives.
78 */
79 printk(KERN_WARNING
80 "CE: Reprogramming failure. Giving up\n");
81 dev->next_event.tv64 = KTIME_MAX;
82 return -ETIME;
83 }
84 i = 0;
85 }
86
87 now = ktime_get();
88 expires = ktime_add_ns(now, dev->min_delta_ns);
89 }
90}
91
24/** 92/**
25 * tick_program_event 93 * tick_program_event
26 */ 94 */
@@ -28,7 +96,7 @@ int tick_program_event(ktime_t expires, int force)
28{ 96{
29 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 97 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
30 98
31 return clockevents_program_event(dev, expires, force); 99 return tick_dev_program_event(dev, expires, force);
32} 100}
33 101
34/** 102/**
@@ -36,10 +104,11 @@ int tick_program_event(ktime_t expires, int force)
36 */ 104 */
37void tick_resume_oneshot(void) 105void tick_resume_oneshot(void)
38{ 106{
39 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 107 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
108 struct clock_event_device *dev = td->evtdev;
40 109
41 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 110 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
42 clockevents_program_event(dev, ktime_get(), true); 111 tick_program_event(ktime_get(), 1);
43} 112}
44 113
45/** 114/**
@@ -51,7 +120,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
51{ 120{
52 newdev->event_handler = handler; 121 newdev->event_handler = handler;
53 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 122 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
54 clockevents_program_event(newdev, next_event, true); 123 tick_dev_program_event(newdev, next_event, 1);
55} 124}
56 125
57/** 126/**
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d58e552d9fd..d5097c44b40 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -31,7 +31,7 @@
31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
32 32
33/* 33/*
34 * The time, when the last jiffy update happened. Protected by jiffies_lock. 34 * The time, when the last jiffy update happened. Protected by xtime_lock.
35 */ 35 */
36static ktime_t last_jiffies_update; 36static ktime_t last_jiffies_update;
37 37
@@ -49,14 +49,14 @@ static void tick_do_update_jiffies64(ktime_t now)
49 ktime_t delta; 49 ktime_t delta;
50 50
51 /* 51 /*
52 * Do a quick check without holding jiffies_lock: 52 * Do a quick check without holding xtime_lock:
53 */ 53 */
54 delta = ktime_sub(now, last_jiffies_update); 54 delta = ktime_sub(now, last_jiffies_update);
55 if (delta.tv64 < tick_period.tv64) 55 if (delta.tv64 < tick_period.tv64)
56 return; 56 return;
57 57
 58 /* Reevaluate with jiffies_lock held */ 58 /* Reevaluate with xtime_lock held */
59 write_seqlock(&jiffies_lock); 59 write_seqlock(&xtime_lock);
60 60
61 delta = ktime_sub(now, last_jiffies_update); 61 delta = ktime_sub(now, last_jiffies_update);
62 if (delta.tv64 >= tick_period.tv64) { 62 if (delta.tv64 >= tick_period.tv64) {
@@ -79,7 +79,7 @@ static void tick_do_update_jiffies64(ktime_t now)
79 /* Keep the tick_next_period variable up to date */ 79 /* Keep the tick_next_period variable up to date */
80 tick_next_period = ktime_add(last_jiffies_update, tick_period); 80 tick_next_period = ktime_add(last_jiffies_update, tick_period);
81 } 81 }
82 write_sequnlock(&jiffies_lock); 82 write_sequnlock(&xtime_lock);
83} 83}
84 84
85/* 85/*
@@ -89,58 +89,15 @@ static ktime_t tick_init_jiffy_update(void)
89{ 89{
90 ktime_t period; 90 ktime_t period;
91 91
92 write_seqlock(&jiffies_lock); 92 write_seqlock(&xtime_lock);
93 /* Did we start the jiffies update yet ? */ 93 /* Did we start the jiffies update yet ? */
94 if (last_jiffies_update.tv64 == 0) 94 if (last_jiffies_update.tv64 == 0)
95 last_jiffies_update = tick_next_period; 95 last_jiffies_update = tick_next_period;
96 period = last_jiffies_update; 96 period = last_jiffies_update;
97 write_sequnlock(&jiffies_lock); 97 write_sequnlock(&xtime_lock);
98 return period; 98 return period;
99} 99}
100 100
101
102static void tick_sched_do_timer(ktime_t now)
103{
104 int cpu = smp_processor_id();
105
106#ifdef CONFIG_NO_HZ
107 /*
108 * Check if the do_timer duty was dropped. We don't care about
109 * concurrency: This happens only when the cpu in charge went
 110 * into a long sleep. If two cpus happen to assign themselves to
111 * this duty, then the jiffies update is still serialized by
112 * jiffies_lock.
113 */
114 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
115 tick_do_timer_cpu = cpu;
116#endif
117
118 /* Check, if the jiffies need an update */
119 if (tick_do_timer_cpu == cpu)
120 tick_do_update_jiffies64(now);
121}
122
123static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
124{
125#ifdef CONFIG_NO_HZ
126 /*
127 * When we are idle and the tick is stopped, we have to touch
128 * the watchdog as we might not schedule for a really long
129 * time. This happens on complete idle SMP systems while
130 * waiting on the login prompt. We also increment the "start of
131 * idle" jiffy stamp so the idle accounting adjustment we do
 132 * when we go busy again does not account too many ticks.
133 */
134 if (ts->tick_stopped) {
135 touch_softlockup_watchdog();
136 if (is_idle_task(current))
137 ts->idle_jiffies++;
138 }
139#endif
140 update_process_times(user_mode(regs));
141 profile_tick(CPU_PROFILING);
142}
143
144/* 101/*
145 * NOHZ - aka dynamic tick functionality 102 * NOHZ - aka dynamic tick functionality
146 */ 103 */
@@ -148,7 +105,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
148/* 105/*
149 * NO HZ enabled ? 106 * NO HZ enabled ?
150 */ 107 */
151int tick_nohz_enabled __read_mostly = 1; 108static int tick_nohz_enabled __read_mostly = 1;
152 109
153/* 110/*
154 * Enable / Disable tickless mode 111 * Enable / Disable tickless mode
@@ -182,6 +139,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
182 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
183 unsigned long flags; 140 unsigned long flags;
184 141
142 cpumask_clear_cpu(cpu, nohz_cpu_mask);
185 ts->idle_waketime = now; 143 ts->idle_waketime = now;
186 144
187 local_irq_save(flags); 145 local_irq_save(flags);
@@ -201,10 +159,9 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
201 159
202 if (ts->idle_active) { 160 if (ts->idle_active) {
203 delta = ktime_sub(now, ts->idle_entrytime); 161 delta = ktime_sub(now, ts->idle_entrytime);
162 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
204 if (nr_iowait_cpu(cpu) > 0) 163 if (nr_iowait_cpu(cpu) > 0)
205 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); 164 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
206 else
207 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
208 ts->idle_entrytime = now; 165 ts->idle_entrytime = now;
209 } 166 }
210 167
@@ -225,7 +182,11 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
225 182
226static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) 183static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
227{ 184{
228 ktime_t now = ktime_get(); 185 ktime_t now;
186
187 now = ktime_get();
188
189 update_ts_time_stats(cpu, ts, now, NULL);
229 190
230 ts->idle_entrytime = now; 191 ts->idle_entrytime = now;
231 ts->idle_active = 1; 192 ts->idle_active = 1;
@@ -236,11 +197,11 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
236/** 197/**
237 * get_cpu_idle_time_us - get the total idle time of a cpu 198 * get_cpu_idle_time_us - get the total idle time of a cpu
238 * @cpu: CPU number to query 199 * @cpu: CPU number to query
239 * @last_update_time: variable to store update time in. Do not update 200 * @last_update_time: variable to store update time in
240 * counters if NULL.
241 * 201 *
 242 * Return the cumulative idle time (since boot) for a given 202 * Return the cumulative idle time (since boot) for a given
243 * CPU, in microseconds. 203 * CPU, in microseconds. The idle time returned includes
204 * the iowait time (unlike what "top" and co report).
244 * 205 *
245 * This time is measured via accounting rather than sampling, 206 * This time is measured via accounting rather than sampling,
246 * and is as accurate as ktime_get() is. 207 * and is as accurate as ktime_get() is.
@@ -250,35 +211,20 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
250u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 211u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
251{ 212{
252 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 213 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
253 ktime_t now, idle;
254 214
255 if (!tick_nohz_enabled) 215 if (!tick_nohz_enabled)
256 return -1; 216 return -1;
257 217
258 now = ktime_get(); 218 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
259 if (last_update_time) {
260 update_ts_time_stats(cpu, ts, now, last_update_time);
261 idle = ts->idle_sleeptime;
262 } else {
263 if (ts->idle_active && !nr_iowait_cpu(cpu)) {
264 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
265
266 idle = ktime_add(ts->idle_sleeptime, delta);
267 } else {
268 idle = ts->idle_sleeptime;
269 }
270 }
271
272 return ktime_to_us(idle);
273 219
220 return ktime_to_us(ts->idle_sleeptime);
274} 221}
275EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 222EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
276 223
277/** 224/*
278 * get_cpu_iowait_time_us - get the total iowait time of a cpu 225 * get_cpu_iowait_time_us - get the total iowait time of a cpu
279 * @cpu: CPU number to query 226 * @cpu: CPU number to query
280 * @last_update_time: variable to store update time in. Do not update 227 * @last_update_time: variable to store update time in
281 * counters if NULL.
282 * 228 *
 283 * Return the cumulative iowait time (since boot) for a given 229 * Return the cumulative iowait time (since boot) for a given
284 * CPU, in microseconds. 230 * CPU, in microseconds.
@@ -291,47 +237,93 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
291u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) 237u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
292{ 238{
293 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 239 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
294 ktime_t now, iowait;
295 240
296 if (!tick_nohz_enabled) 241 if (!tick_nohz_enabled)
297 return -1; 242 return -1;
298 243
299 now = ktime_get(); 244 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
300 if (last_update_time) {
301 update_ts_time_stats(cpu, ts, now, last_update_time);
302 iowait = ts->iowait_sleeptime;
303 } else {
304 if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
305 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
306
307 iowait = ktime_add(ts->iowait_sleeptime, delta);
308 } else {
309 iowait = ts->iowait_sleeptime;
310 }
311 }
312 245
313 return ktime_to_us(iowait); 246 return ktime_to_us(ts->iowait_sleeptime);
314} 247}
315EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 248EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
316 249
317static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, 250/**
318 ktime_t now, int cpu) 251 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
252 *
253 * When the next event is more than a tick into the future, stop the idle tick
254 * Called either from the idle loop or from irq_exit() when an idle period was
255 * just interrupted by an interrupt which did not cause a reschedule.
256 */
257void tick_nohz_stop_sched_tick(int inidle)
319{ 258{
320 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 259 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
321 ktime_t last_update, expires, ret = { .tv64 = 0 }; 260 struct tick_sched *ts;
322 unsigned long rcu_delta_jiffies; 261 ktime_t last_update, expires, now;
323 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 262 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
324 u64 time_delta; 263 u64 time_delta;
264 int cpu;
265
266 local_irq_save(flags);
267
268 cpu = smp_processor_id();
269 ts = &per_cpu(tick_cpu_sched, cpu);
270
271 /*
272 * Call to tick_nohz_start_idle stops the last_update_time from being
273 * updated. Thus, it must not be called in the event we are called from
274 * irq_exit() with the prior state different than idle.
275 */
276 if (!inidle && !ts->inidle)
277 goto end;
278
279 /*
280 * Set ts->inidle unconditionally. Even if the system did not
281 * switch to NOHZ mode, the cpu frequency governors rely on the
282 * update of the idle time accounting in tick_nohz_start_idle().
283 */
284 ts->inidle = 1;
325 285
286 now = tick_nohz_start_idle(cpu, ts);
287
288 /*
289 * If this cpu is offline and it is the one which updates
290 * jiffies, then give up the assignment and let it be taken by
291 * the cpu which runs the tick timer next. If we don't drop
292 * this here the jiffies might be stale and do_timer() is never
293 * invoked.
294 */
295 if (unlikely(!cpu_online(cpu))) {
296 if (cpu == tick_do_timer_cpu)
297 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
298 }
299
300 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
301 goto end;
302
303 if (need_resched())
304 goto end;
305
306 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
307 static int ratelimit;
308
309 if (ratelimit < 10) {
310 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
311 (unsigned int) local_softirq_pending());
312 ratelimit++;
313 }
314 goto end;
315 }
316
317 ts->idle_calls++;
326 /* Read jiffies and the time when jiffies were updated last */ 318 /* Read jiffies and the time when jiffies were updated last */
327 do { 319 do {
328 seq = read_seqbegin(&jiffies_lock); 320 seq = read_seqbegin(&xtime_lock);
329 last_update = last_jiffies_update; 321 last_update = last_jiffies_update;
330 last_jiffies = jiffies; 322 last_jiffies = jiffies;
331 time_delta = timekeeping_max_deferment(); 323 time_delta = timekeeping_max_deferment();
332 } while (read_seqretry(&jiffies_lock, seq)); 324 } while (read_seqretry(&xtime_lock, seq));
333 325
334 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || 326 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
335 arch_needs_cpu(cpu)) { 327 arch_needs_cpu(cpu)) {
336 next_jiffies = last_jiffies + 1; 328 next_jiffies = last_jiffies + 1;
337 delta_jiffies = 1; 329 delta_jiffies = 1;
@@ -339,10 +331,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
339 /* Get the next timer wheel timer */ 331 /* Get the next timer wheel timer */
340 next_jiffies = get_next_timer_interrupt(last_jiffies); 332 next_jiffies = get_next_timer_interrupt(last_jiffies);
341 delta_jiffies = next_jiffies - last_jiffies; 333 delta_jiffies = next_jiffies - last_jiffies;
342 if (rcu_delta_jiffies < delta_jiffies) {
343 next_jiffies = last_jiffies + rcu_delta_jiffies;
344 delta_jiffies = rcu_delta_jiffies;
345 }
346 } 334 }
347 /* 335 /*
348 * Do not stop the tick, if we are only one off 336 * Do not stop the tick, if we are only one off
@@ -401,12 +389,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
401 else 389 else
402 expires.tv64 = KTIME_MAX; 390 expires.tv64 = KTIME_MAX;
403 391
392 if (delta_jiffies > 1)
393 cpumask_set_cpu(cpu, nohz_cpu_mask);
394
404 /* Skip reprogram of event if its not changed */ 395 /* Skip reprogram of event if its not changed */
405 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 396 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
406 goto out; 397 goto out;
407 398
408 ret = expires;
409
410 /* 399 /*
411 * nohz_stop_sched_tick can be called several times before 400 * nohz_stop_sched_tick can be called several times before
412 * the nohz_restart_sched_tick is called. This happens when 401 * the nohz_restart_sched_tick is called. This happens when
@@ -415,13 +404,19 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
415 * the scheduler tick in nohz_restart_sched_tick. 404 * the scheduler tick in nohz_restart_sched_tick.
416 */ 405 */
417 if (!ts->tick_stopped) { 406 if (!ts->tick_stopped) {
418 nohz_balance_enter_idle(cpu); 407 select_nohz_load_balancer(1);
419 calc_load_enter_idle();
420 408
421 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
422 ts->tick_stopped = 1; 410 ts->tick_stopped = 1;
411 ts->idle_jiffies = last_jiffies;
412 rcu_enter_nohz();
423 } 413 }
424 414
415 ts->idle_sleeps++;
416
417 /* Mark expires */
418 ts->idle_expires = expires;
419
425 /* 420 /*
426 * If the expiration time == KTIME_MAX, then 421 * If the expiration time == KTIME_MAX, then
427 * in this case we simply stop the tick timer. 422 * in this case we simply stop the tick timer.
@@ -446,132 +441,15 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
446 * softirq. 441 * softirq.
447 */ 442 */
448 tick_do_update_jiffies64(ktime_get()); 443 tick_do_update_jiffies64(ktime_get());
444 cpumask_clear_cpu(cpu, nohz_cpu_mask);
449 } 445 }
450 raise_softirq_irqoff(TIMER_SOFTIRQ); 446 raise_softirq_irqoff(TIMER_SOFTIRQ);
451out: 447out:
452 ts->next_jiffies = next_jiffies; 448 ts->next_jiffies = next_jiffies;
453 ts->last_jiffies = last_jiffies; 449 ts->last_jiffies = last_jiffies;
454 ts->sleep_length = ktime_sub(dev->next_event, now); 450 ts->sleep_length = ktime_sub(dev->next_event, now);
455 451end:
456 return ret; 452 local_irq_restore(flags);
457}
458
459static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
460{
461 /*
462 * If this cpu is offline and it is the one which updates
463 * jiffies, then give up the assignment and let it be taken by
464 * the cpu which runs the tick timer next. If we don't drop
465 * this here the jiffies might be stale and do_timer() is never
466 * invoked.
467 */
468 if (unlikely(!cpu_online(cpu))) {
469 if (cpu == tick_do_timer_cpu)
470 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
471 }
472
473 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
474 return false;
475
476 if (need_resched())
477 return false;
478
479 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
480 static int ratelimit;
481
482 if (ratelimit < 10 &&
483 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
484 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
485 (unsigned int) local_softirq_pending());
486 ratelimit++;
487 }
488 return false;
489 }
490
491 return true;
492}
493
494static void __tick_nohz_idle_enter(struct tick_sched *ts)
495{
496 ktime_t now, expires;
497 int cpu = smp_processor_id();
498
499 now = tick_nohz_start_idle(cpu, ts);
500
501 if (can_stop_idle_tick(cpu, ts)) {
502 int was_stopped = ts->tick_stopped;
503
504 ts->idle_calls++;
505
506 expires = tick_nohz_stop_sched_tick(ts, now, cpu);
507 if (expires.tv64 > 0LL) {
508 ts->idle_sleeps++;
509 ts->idle_expires = expires;
510 }
511
512 if (!was_stopped && ts->tick_stopped)
513 ts->idle_jiffies = ts->last_jiffies;
514 }
515}
516
517/**
518 * tick_nohz_idle_enter - stop the idle tick from the idle task
519 *
520 * When the next event is more than a tick into the future, stop the idle tick
521 * Called when we start the idle loop.
522 *
523 * The arch is responsible for calling:
524 *
525 * - rcu_idle_enter() after its last use of RCU before the CPU is put
526 * to sleep.
527 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
528 */
529void tick_nohz_idle_enter(void)
530{
531 struct tick_sched *ts;
532
533 WARN_ON_ONCE(irqs_disabled());
534
535 /*
536 * Update the idle state in the scheduler domain hierarchy
537 * when tick_nohz_stop_sched_tick() is called from the idle loop.
538 * State will be updated to busy during the first busy tick after
539 * exiting idle.
540 */
541 set_cpu_sd_state_idle();
542
543 local_irq_disable();
544
545 ts = &__get_cpu_var(tick_cpu_sched);
546 /*
547 * set ts->inidle unconditionally. even if the system did not
548 * switch to nohz mode the cpu frequency governers rely on the
549 * update of the idle time accounting in tick_nohz_start_idle().
550 */
551 ts->inidle = 1;
552 __tick_nohz_idle_enter(ts);
553
554 local_irq_enable();
555}
556
557/**
558 * tick_nohz_irq_exit - update next tick event from interrupt exit
559 *
560 * When an interrupt fires while we are idle and it doesn't cause
561 * a reschedule, it may still add, modify or delete a timer, enqueue
562 * an RCU callback, etc...
563 * So we need to re-calculate and reprogram the next tick event.
564 */
565void tick_nohz_irq_exit(void)
566{
567 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
568
569 if (!ts->inidle)
570 return;
571
572 /* Cancel the timer because the CPU has already woken up from the C-states */
573 menu_hrtimer_cancel();
574 __tick_nohz_idle_enter(ts);
575} 453}
576 454
577/** 455/**
@@ -589,7 +467,7 @@ ktime_t tick_nohz_get_sleep_length(void)
589static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) 467static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
590{ 468{
591 hrtimer_cancel(&ts->sched_timer); 469 hrtimer_cancel(&ts->sched_timer);
592 hrtimer_set_expires(&ts->sched_timer, ts->last_tick); 470 hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
593 471
594 while (1) { 472 while (1) {
595 /* Forward the time to expire in the future */ 473 /* Forward the time to expire in the future */
@@ -606,33 +484,49 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
606 hrtimer_get_expires(&ts->sched_timer), 0)) 484 hrtimer_get_expires(&ts->sched_timer), 0))
607 break; 485 break;
608 } 486 }
609 /* Reread time and update jiffies */ 487 /* Update jiffies and reread time */
610 now = ktime_get();
611 tick_do_update_jiffies64(now); 488 tick_do_update_jiffies64(now);
489 now = ktime_get();
612 } 490 }
613} 491}
614 492
615static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) 493/**
494 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
495 *
496 * Restart the idle tick when the CPU is woken up from idle
497 */
498void tick_nohz_restart_sched_tick(void)
616{ 499{
617 /* Update jiffies first */ 500 int cpu = smp_processor_id();
618 tick_do_update_jiffies64(now); 501 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
619 update_cpu_load_nohz(); 502#ifndef CONFIG_VIRT_CPU_ACCOUNTING
503 unsigned long ticks;
504#endif
505 ktime_t now;
620 506
621 calc_load_exit_idle(); 507 local_irq_disable();
622 touch_softlockup_watchdog(); 508 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
623 /* 509 now = ktime_get();
624 * Cancel the scheduled timer and restore the tick
625 */
626 ts->tick_stopped = 0;
627 ts->idle_exittime = now;
628 510
629 tick_nohz_restart(ts, now); 511 if (ts->idle_active)
630} 512 tick_nohz_stop_idle(cpu, now);
513
514 if (!ts->inidle || !ts->tick_stopped) {
515 ts->inidle = 0;
516 local_irq_enable();
517 return;
518 }
519
520 ts->inidle = 0;
521
522 rcu_exit_nohz();
523
524 /* Update jiffies first */
525 select_nohz_load_balancer(0);
526 tick_do_update_jiffies64(now);
527 cpumask_clear_cpu(cpu, nohz_cpu_mask);
631 528
632static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
633{
634#ifndef CONFIG_VIRT_CPU_ACCOUNTING 529#ifndef CONFIG_VIRT_CPU_ACCOUNTING
635 unsigned long ticks;
636 /* 530 /*
637 * We stopped the tick in idle. Update process times would miss the 531 * We stopped the tick in idle. Update process times would miss the
638 * time we slept as update_process_times does only a 1 tick 532 * time we slept as update_process_times does only a 1 tick
@@ -645,39 +539,15 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
645 if (ticks && ticks < LONG_MAX) 539 if (ticks && ticks < LONG_MAX)
646 account_idle_ticks(ticks); 540 account_idle_ticks(ticks);
647#endif 541#endif
648}
649 542
650/** 543 touch_softlockup_watchdog();
651 * tick_nohz_idle_exit - restart the idle tick from the idle task 544 /*
652 * 545 * Cancel the scheduled timer and restore the tick
653 * Restart the idle tick when the CPU is woken up from idle 546 */
654 * This also exit the RCU extended quiescent state. The CPU 547 ts->tick_stopped = 0;
655 * can use RCU again after this function is called. 548 ts->idle_exittime = now;
656 */
657void tick_nohz_idle_exit(void)
658{
659 int cpu = smp_processor_id();
660 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
661 ktime_t now;
662
663 local_irq_disable();
664
665 WARN_ON_ONCE(!ts->inidle);
666
667 ts->inidle = 0;
668
669 /* Cancel the timer because the CPU has already woken up from the C-states */
670 menu_hrtimer_cancel();
671 if (ts->idle_active || ts->tick_stopped)
672 now = ktime_get();
673
674 if (ts->idle_active)
675 tick_nohz_stop_idle(cpu, now);
676 549
677 if (ts->tick_stopped) { 550 tick_nohz_restart(ts, now);
678 tick_nohz_restart_sched_tick(ts, now);
679 tick_nohz_account_idle_ticks(ts);
680 }
681 551
682 local_irq_enable(); 552 local_irq_enable();
683} 553}
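On the right-hand side of these hunks, the stop/restart pair is the public interface used by the idle loop. A rough sketch of how an architecture's idle loop brackets its low-power wait with it is shown below; cpu_relax() stands in for the real low-power instruction, demo_cpu_idle is an illustrative name, and preemption handling is omitted.

#include <linux/sched.h>
#include <linux/tick.h>

/* Sketch only: simplified arch idle loop using the old NOHZ interface. */
void demo_cpu_idle(void)
{
	while (1) {
		tick_nohz_stop_sched_tick(1);	/* inidle == 1: called from the idle loop */

		while (!need_resched())
			cpu_relax();		/* placeholder for the real low-power wait */

		tick_nohz_restart_sched_tick();
		schedule();			/* real code wraps this in preempt handling */
	}
}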
@@ -695,12 +565,40 @@ static void tick_nohz_handler(struct clock_event_device *dev)
695{ 565{
696 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 566 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
697 struct pt_regs *regs = get_irq_regs(); 567 struct pt_regs *regs = get_irq_regs();
568 int cpu = smp_processor_id();
698 ktime_t now = ktime_get(); 569 ktime_t now = ktime_get();
699 570
700 dev->next_event.tv64 = KTIME_MAX; 571 dev->next_event.tv64 = KTIME_MAX;
701 572
702 tick_sched_do_timer(now); 573 /*
703 tick_sched_handle(ts, regs); 574 * Check if the do_timer duty was dropped. We don't care about
575 * concurrency: This happens only when the cpu in charge went
576 * into a long sleep. If two cpus happen to assign themselves to
577 * this duty, then the jiffies update is still serialized by
578 * xtime_lock.
579 */
580 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
581 tick_do_timer_cpu = cpu;
582
583 /* Check, if the jiffies need an update */
584 if (tick_do_timer_cpu == cpu)
585 tick_do_update_jiffies64(now);
586
587 /*
588 * When we are idle and the tick is stopped, we have to touch
589 * the watchdog as we might not schedule for a really long
590 * time. This happens on complete idle SMP systems while
591 * waiting on the login prompt. We also increment the "start
592 * of idle" jiffy stamp so the idle accounting adjustment we
593 * do when we go busy again does not account for too many ticks.
594 */
595 if (ts->tick_stopped) {
596 touch_softlockup_watchdog();
597 ts->idle_jiffies++;
598 }
599
600 update_process_times(user_mode(regs));
601 profile_tick(CPU_PROFILING);
704 602
705 while (tick_nohz_reprogram(ts, now)) { 603 while (tick_nohz_reprogram(ts, now)) {
706 now = ktime_get(); 604 now = ktime_get();
@@ -742,6 +640,8 @@ static void tick_nohz_switch_to_nohz(void)
742 next = ktime_add(next, tick_period); 640 next = ktime_add(next, tick_period);
743 } 641 }
744 local_irq_enable(); 642 local_irq_enable();
643
644 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
745} 645}
746 646
747/* 647/*
@@ -813,7 +713,7 @@ void tick_check_idle(int cpu)
813#ifdef CONFIG_HIGH_RES_TIMERS 713#ifdef CONFIG_HIGH_RES_TIMERS
814/* 714/*
815 * We rearm the timer until we get disabled by the idle code. 715 * We rearm the timer until we get disabled by the idle code.
816 * Called with interrupts disabled. 716 * Called with interrupts disabled and timer->base->cpu_base->lock held.
817 */ 717 */
818static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 718static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
819{ 719{
@@ -821,31 +721,50 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
821 container_of(timer, struct tick_sched, sched_timer); 721 container_of(timer, struct tick_sched, sched_timer);
822 struct pt_regs *regs = get_irq_regs(); 722 struct pt_regs *regs = get_irq_regs();
823 ktime_t now = ktime_get(); 723 ktime_t now = ktime_get();
724 int cpu = smp_processor_id();
824 725
825 tick_sched_do_timer(now); 726#ifdef CONFIG_NO_HZ
727 /*
728 * Check if the do_timer duty was dropped. We don't care about
729 * concurrency: This happens only when the cpu in charge went
730 * into a long sleep. If two cpus happen to assign themselves to
731 * this duty, then the jiffies update is still serialized by
732 * xtime_lock.
733 */
734 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
735 tick_do_timer_cpu = cpu;
736#endif
737
738 /* Check, if the jiffies need an update */
739 if (tick_do_timer_cpu == cpu)
740 tick_do_update_jiffies64(now);
826 741
827 /* 742 /*
828 * Do not call, when we are not in irq context and have 743 * Do not call, when we are not in irq context and have
829 * no valid regs pointer 744 * no valid regs pointer
830 */ 745 */
831 if (regs) 746 if (regs) {
832 tick_sched_handle(ts, regs); 747 /*
748 * When we are idle and the tick is stopped, we have to touch
749 * the watchdog as we might not schedule for a really long
750 * time. This happens on complete idle SMP systems while
751 * waiting on the login prompt. We also increment the "start of
752 * idle" jiffy stamp so the idle accounting adjustment we do
753 * when we go busy again does not account for too many ticks.
754 */
755 if (ts->tick_stopped) {
756 touch_softlockup_watchdog();
757 ts->idle_jiffies++;
758 }
759 update_process_times(user_mode(regs));
760 profile_tick(CPU_PROFILING);
761 }
833 762
834 hrtimer_forward(timer, now, tick_period); 763 hrtimer_forward(timer, now, tick_period);
835 764
836 return HRTIMER_RESTART; 765 return HRTIMER_RESTART;
837} 766}
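The tick emulation above relies on the standard self-rearming hrtimer pattern: forward the timer by one period from its callback and return HRTIMER_RESTART. A stripped-down, hypothetical version of that pattern looks like this (demo_* names are illustrative, not part of this patch):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;
static ktime_t demo_period;

/* Re-arm ourselves one period into the future and keep running. */
static enum hrtimer_restart demo_timer_fn(struct hrtimer *timer)
{
	hrtimer_forward(timer, hrtimer_cb_get_time(timer), demo_period);
	return HRTIMER_RESTART;
}

static void demo_timer_start(void)
{
	demo_period = ktime_set(0, NSEC_PER_SEC / HZ);	/* one tick */
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	hrtimer_start(&demo_timer, demo_period, HRTIMER_MODE_REL);
}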
838 767
839static int sched_skew_tick;
840
841static int __init skew_tick(char *str)
842{
843 get_option(&str, &sched_skew_tick);
844
845 return 0;
846}
847early_param("skew_tick", skew_tick);
848
849/** 768/**
850 * tick_setup_sched_timer - setup the tick emulation timer 769 * tick_setup_sched_timer - setup the tick emulation timer
851 */ 770 */
@@ -863,14 +782,6 @@ void tick_setup_sched_timer(void)
863 /* Get the next period (per cpu) */ 782 /* Get the next period (per cpu) */
864 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 783 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
865 784
866 /* Offset the tick to avert jiffies_lock contention. */
867 if (sched_skew_tick) {
868 u64 offset = ktime_to_ns(tick_period) >> 1;
869 do_div(offset, num_possible_cpus());
870 offset *= smp_processor_id();
871 hrtimer_add_expires_ns(&ts->sched_timer, offset);
872 }
873
874 for (;;) { 785 for (;;) {
875 hrtimer_forward(&ts->sched_timer, now, tick_period); 786 hrtimer_forward(&ts->sched_timer, now, tick_period);
876 hrtimer_start_expires(&ts->sched_timer, 787 hrtimer_start_expires(&ts->sched_timer,
@@ -882,8 +793,10 @@ void tick_setup_sched_timer(void)
882 } 793 }
883 794
884#ifdef CONFIG_NO_HZ 795#ifdef CONFIG_NO_HZ
885 if (tick_nohz_enabled) 796 if (tick_nohz_enabled) {
886 ts->nohz_mode = NOHZ_MODE_HIGHRES; 797 ts->nohz_mode = NOHZ_MODE_HIGHRES;
798 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
799 }
887#endif 800#endif
888} 801}
889#endif /* HIGH_RES_TIMERS */ 802#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cbc6acb0db3..6f9798bf240 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -8,7 +8,6 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/timekeeper_internal.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/interrupt.h> 12#include <linux/interrupt.h>
14#include <linux/percpu.h> 13#include <linux/percpu.h>
@@ -21,60 +20,37 @@
21#include <linux/time.h> 20#include <linux/time.h>
22#include <linux/tick.h> 21#include <linux/tick.h>
23#include <linux/stop_machine.h> 22#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h>
25 23
24/* Structure holding internal timekeeping values. */
25struct timekeeper {
26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock;
28 /* The shift value of the current clocksource. */
29 int shift;
30
31 /* Number of clock cycles in one NTP interval. */
32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval;
35 /* shifted nano seconds left over when rounding cycle_interval */
36 s64 xtime_remainder;
37 /* Raw nano seconds accumulated per NTP interval. */
38 u32 raw_interval;
39
40 /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
41 u64 xtime_nsec;
42 /* Difference between accumulated time and NTP time in ntp
43 * shifted nano seconds. */
44 s64 ntp_error;
45 /* Shift conversion between clock shifted nano seconds and
46 * ntp shifted nano seconds. */
47 int ntp_error_shift;
48 /* NTP adjusted clock multiplier */
49 u32 mult;
50};
26 51
27static struct timekeeper timekeeper; 52static struct timekeeper timekeeper;
28 53
29/* flag for if timekeeping is suspended */
30int __read_mostly timekeeping_suspended;
31
32static inline void tk_normalize_xtime(struct timekeeper *tk)
33{
34 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
35 tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift;
36 tk->xtime_sec++;
37 }
38}
39
40static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
41{
42 tk->xtime_sec = ts->tv_sec;
43 tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
44}
45
46static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
47{
48 tk->xtime_sec += ts->tv_sec;
49 tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
50 tk_normalize_xtime(tk);
51}
52
53static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
54{
55 struct timespec tmp;
56
57 /*
58 * Verify consistency of: offset_real = -wall_to_monotonic
59 * before modifying anything
60 */
61 set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec,
62 -tk->wall_to_monotonic.tv_nsec);
63 WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64);
64 tk->wall_to_monotonic = wtm;
65 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
66 tk->offs_real = timespec_to_ktime(tmp);
67}
68
69static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
70{
71 /* Verify consistency before modifying */
72 WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
73
74 tk->total_sleep_time = t;
75 tk->offs_boot = timespec_to_ktime(t);
76}
77
78/** 54/**
79 * timekeeper_setup_internals - Set up internals to use clocksource clock. 55 * timekeeper_setup_internals - Set up internals to use clocksource clock.
80 * 56 *
@@ -85,14 +61,12 @@ static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
85 * 61 *
86 * Unless you're the timekeeping code, you should not be using this! 62 * Unless you're the timekeeping code, you should not be using this!
87 */ 63 */
88static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) 64static void timekeeper_setup_internals(struct clocksource *clock)
89{ 65{
90 cycle_t interval; 66 cycle_t interval;
91 u64 tmp, ntpinterval; 67 u64 tmp, ntpinterval;
92 struct clocksource *old_clock;
93 68
94 old_clock = tk->clock; 69 timekeeper.clock = clock;
95 tk->clock = clock;
96 clock->cycle_last = clock->read(clock); 70 clock->cycle_last = clock->read(clock);
97 71
98 /* Do the ns -> cycle conversion first, using original mult */ 72 /* Do the ns -> cycle conversion first, using original mult */
@@ -105,133 +79,103 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
105 tmp = 1; 79 tmp = 1;
106 80
107 interval = (cycle_t) tmp; 81 interval = (cycle_t) tmp;
108 tk->cycle_interval = interval; 82 timekeeper.cycle_interval = interval;
109 83
110 /* Go back from cycles -> shifted ns */ 84 /* Go back from cycles -> shifted ns */
111 tk->xtime_interval = (u64) interval * clock->mult; 85 timekeeper.xtime_interval = (u64) interval * clock->mult;
112 tk->xtime_remainder = ntpinterval - tk->xtime_interval; 86 timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
113 tk->raw_interval = 87 timekeeper.raw_interval =
114 ((u64) interval * clock->mult) >> clock->shift; 88 ((u64) interval * clock->mult) >> clock->shift;
115 89
116 /* if changing clocks, convert xtime_nsec shift units */ 90 timekeeper.xtime_nsec = 0;
117 if (old_clock) { 91 timekeeper.shift = clock->shift;
118 int shift_change = clock->shift - old_clock->shift;
119 if (shift_change < 0)
120 tk->xtime_nsec >>= -shift_change;
121 else
122 tk->xtime_nsec <<= shift_change;
123 }
124 tk->shift = clock->shift;
125 92
126 tk->ntp_error = 0; 93 timekeeper.ntp_error = 0;
127 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; 94 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
128 95
129 /* 96 /*
130 * The timekeeper keeps its own mult values for the currently 97 * The timekeeper keeps its own mult values for the currently
131 * active clocksource. These values will be adjusted via NTP 98 * active clocksource. These values will be adjusted via NTP
132 * to counteract clock drifting. 99 * to counteract clock drifting.
133 */ 100 */
134 tk->mult = clock->mult; 101 timekeeper.mult = clock->mult;
135} 102}
136 103
137/* Timekeeper helper functions. */ 104/* Timekeeper helper functions. */
138static inline s64 timekeeping_get_ns(struct timekeeper *tk) 105static inline s64 timekeeping_get_ns(void)
139{ 106{
140 cycle_t cycle_now, cycle_delta; 107 cycle_t cycle_now, cycle_delta;
141 struct clocksource *clock; 108 struct clocksource *clock;
142 s64 nsec;
143 109
144 /* read clocksource: */ 110 /* read clocksource: */
145 clock = tk->clock; 111 clock = timekeeper.clock;
146 cycle_now = clock->read(clock); 112 cycle_now = clock->read(clock);
147 113
148 /* calculate the delta since the last update_wall_time: */ 114 /* calculate the delta since the last update_wall_time: */
149 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 115 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
150 116
151 nsec = cycle_delta * tk->mult + tk->xtime_nsec; 117 /* return delta converted to nanoseconds using ntp adjusted mult. */
152 nsec >>= tk->shift; 118 return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
153 119 timekeeper.shift);
154 /* If arch requires, add in gettimeoffset() */
155 return nsec + arch_gettimeoffset();
156} 120}
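For reference, the conversion used throughout these helpers is the fixed-point form behind clocksource_cyc2ns(). A condensed restatement follows; the example numbers are illustrative only.

#include <linux/clocksource.h>

/* What clocksource_cyc2ns() boils down to: fixed-point multiply, shift down. */
static inline s64 demo_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
{
	return ((u64) cycles * mult) >> shift;
}

/*
 * Illustrative numbers: a 1 MHz clocksource (1000 ns per cycle) with
 * shift = 22 would use mult ~= 1000 << 22, so demo_cyc2ns(5, mult, 22)
 * yields 5000 ns.
 */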
157 121
158static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) 122static inline s64 timekeeping_get_ns_raw(void)
159{ 123{
160 cycle_t cycle_now, cycle_delta; 124 cycle_t cycle_now, cycle_delta;
161 struct clocksource *clock; 125 struct clocksource *clock;
162 s64 nsec;
163 126
164 /* read clocksource: */ 127 /* read clocksource: */
165 clock = tk->clock; 128 clock = timekeeper.clock;
166 cycle_now = clock->read(clock); 129 cycle_now = clock->read(clock);
167 130
168 /* calculate the delta since the last update_wall_time: */ 131 /* calculate the delta since the last update_wall_time: */
169 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
170 133
171 /* convert delta to nanoseconds. */ 134 /* return delta converted to nanoseconds using the raw clock mult and shift. */
172 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
173
174 /* If arch requires, add in gettimeoffset() */
175 return nsec + arch_gettimeoffset();
176}
177
178static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
179
180static void update_pvclock_gtod(struct timekeeper *tk)
181{
182 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk);
183} 136}
184 137
185/** 138/*
186 * pvclock_gtod_register_notifier - register a pvclock timedata update listener 139 * This read-write spinlock protects us from races in SMP while
187 * 140 * playing with xtime.
188 * Must hold write on timekeeper.lock
189 */ 141 */
190int pvclock_gtod_register_notifier(struct notifier_block *nb) 142__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
191{
192 struct timekeeper *tk = &timekeeper;
193 unsigned long flags;
194 int ret;
195
196 write_seqlock_irqsave(&tk->lock, flags);
197 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
198 /* update timekeeping data */
199 update_pvclock_gtod(tk);
200 write_sequnlock_irqrestore(&tk->lock, flags);
201 143
202 return ret;
203}
204EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
205 144
206/** 145/*
207 * pvclock_gtod_unregister_notifier - unregister a pvclock 146 * The current time
208 * timedata update listener 147 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
148 * for sub-jiffy times) to get to monotonic time. Monotonic is pegged
149 * at zero at system boot time, so wall_to_monotonic will be negative,
150 * however, we will ALWAYS keep the tv_nsec part positive so we can use
151 * the usual normalization.
209 * 152 *
210 * Must hold write on timekeeper.lock 153 * wall_to_monotonic is moved after resume from suspend for the monotonic
154 * time not to jump. We need to add total_sleep_time to wall_to_monotonic
155 * to get the real boot based time offset.
156 *
157 * - wall_to_monotonic is no longer the boot time, getboottime must be
158 * used instead.
211 */ 159 */
212int pvclock_gtod_unregister_notifier(struct notifier_block *nb) 160static struct timespec xtime __attribute__ ((aligned (16)));
213{ 161static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
214 struct timekeeper *tk = &timekeeper; 162static struct timespec total_sleep_time;
215 unsigned long flags;
216 int ret;
217 163
218 write_seqlock_irqsave(&tk->lock, flags); 164/*
219 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); 165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
220 write_sequnlock_irqrestore(&tk->lock, flags); 166 */
167static struct timespec raw_time;
221 168
222 return ret; 169/* flag for if timekeeping is suspended */
223} 170int __read_mostly timekeeping_suspended;
224EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
225 171
226/* must hold write on timekeeper.lock */ 172/* must hold xtime_lock */
227static void timekeeping_update(struct timekeeper *tk, bool clearntp) 173void timekeeping_leap_insert(int leapsecond)
228{ 174{
229 if (clearntp) { 175 xtime.tv_sec += leapsecond;
230 tk->ntp_error = 0; 176 wall_to_monotonic.tv_sec -= leapsecond;
231 ntp_clear(); 177 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
232 } 178 timekeeper.mult);
233 update_vsyscall(tk);
234 update_pvclock_gtod(tk);
235} 179}
236 180
237/** 181/**
@@ -241,26 +185,27 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
241 * update_wall_time(). This is useful before significant clock changes, 185 * update_wall_time(). This is useful before significant clock changes,
242 * as it avoids having to deal with this time offset explicitly. 186 * as it avoids having to deal with this time offset explicitly.
243 */ 187 */
244static void timekeeping_forward_now(struct timekeeper *tk) 188static void timekeeping_forward_now(void)
245{ 189{
246 cycle_t cycle_now, cycle_delta; 190 cycle_t cycle_now, cycle_delta;
247 struct clocksource *clock; 191 struct clocksource *clock;
248 s64 nsec; 192 s64 nsec;
249 193
250 clock = tk->clock; 194 clock = timekeeper.clock;
251 cycle_now = clock->read(clock); 195 cycle_now = clock->read(clock);
252 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 196 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
253 clock->cycle_last = cycle_now; 197 clock->cycle_last = cycle_now;
254 198
255 tk->xtime_nsec += cycle_delta * tk->mult; 199 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
200 timekeeper.shift);
256 201
257 /* If arch requires, add in gettimeoffset() */ 202 /* If arch requires, add in gettimeoffset() */
258 tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; 203 nsec += arch_gettimeoffset();
259 204
260 tk_normalize_xtime(tk); 205 timespec_add_ns(&xtime, nsec);
261 206
262 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 207 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
263 timespec_add_ns(&tk->raw_time, nsec); 208 timespec_add_ns(&raw_time, nsec);
264} 209}
265 210
266/** 211/**
@@ -271,39 +216,43 @@ static void timekeeping_forward_now(struct timekeeper *tk)
271 */ 216 */
272void getnstimeofday(struct timespec *ts) 217void getnstimeofday(struct timespec *ts)
273{ 218{
274 struct timekeeper *tk = &timekeeper;
275 unsigned long seq; 219 unsigned long seq;
276 s64 nsecs = 0; 220 s64 nsecs;
277 221
278 WARN_ON(timekeeping_suspended); 222 WARN_ON(timekeeping_suspended);
279 223
280 do { 224 do {
281 seq = read_seqbegin(&tk->lock); 225 seq = read_seqbegin(&xtime_lock);
282 226
283 ts->tv_sec = tk->xtime_sec; 227 *ts = xtime;
284 nsecs = timekeeping_get_ns(tk); 228 nsecs = timekeeping_get_ns();
285 229
286 } while (read_seqretry(&tk->lock, seq)); 230 /* If arch requires, add in gettimeoffset() */
231 nsecs += arch_gettimeoffset();
232
233 } while (read_seqretry(&xtime_lock, seq));
287 234
288 ts->tv_nsec = 0;
289 timespec_add_ns(ts, nsecs); 235 timespec_add_ns(ts, nsecs);
290} 236}
237
291EXPORT_SYMBOL(getnstimeofday); 238EXPORT_SYMBOL(getnstimeofday);
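A minimal caller sketch for the exported accessor; demo_stamp_event is a hypothetical driver helper, not part of this patch.

#include <linux/kernel.h>
#include <linux/time.h>

/* Hypothetical caller: stamp an event with the current wall-clock time. */
static void demo_stamp_event(struct timespec *stamp)
{
	getnstimeofday(stamp);
	pr_debug("event at %ld.%09ld\n",
		 (long) stamp->tv_sec, (long) stamp->tv_nsec);
}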
292 239
293ktime_t ktime_get(void) 240ktime_t ktime_get(void)
294{ 241{
295 struct timekeeper *tk = &timekeeper;
296 unsigned int seq; 242 unsigned int seq;
297 s64 secs, nsecs; 243 s64 secs, nsecs;
298 244
299 WARN_ON(timekeeping_suspended); 245 WARN_ON(timekeeping_suspended);
300 246
301 do { 247 do {
302 seq = read_seqbegin(&tk->lock); 248 seq = read_seqbegin(&xtime_lock);
303 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
304 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; 250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
305 251 nsecs += timekeeping_get_ns();
306 } while (read_seqretry(&tk->lock, seq)); 252 /* If arch requires, add in gettimeoffset() */
253 nsecs += arch_gettimeoffset();
254
255 } while (read_seqretry(&xtime_lock, seq));
307 /* 256 /*
308 * Use ktime_set/ktime_add_ns to create a proper ktime on 257 * Use ktime_set/ktime_add_ns to create a proper ktime on
309 * 32-bit architectures without CONFIG_KTIME_SCALAR. 258 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -322,24 +271,24 @@ EXPORT_SYMBOL_GPL(ktime_get);
322 */ 271 */
323void ktime_get_ts(struct timespec *ts) 272void ktime_get_ts(struct timespec *ts)
324{ 273{
325 struct timekeeper *tk = &timekeeper;
326 struct timespec tomono; 274 struct timespec tomono;
327 s64 nsec;
328 unsigned int seq; 275 unsigned int seq;
276 s64 nsecs;
329 277
330 WARN_ON(timekeeping_suspended); 278 WARN_ON(timekeeping_suspended);
331 279
332 do { 280 do {
333 seq = read_seqbegin(&tk->lock); 281 seq = read_seqbegin(&xtime_lock);
334 ts->tv_sec = tk->xtime_sec; 282 *ts = xtime;
335 nsec = timekeeping_get_ns(tk); 283 tomono = wall_to_monotonic;
336 tomono = tk->wall_to_monotonic; 284 nsecs = timekeeping_get_ns();
285 /* If arch requires, add in gettimeoffset() */
286 nsecs += arch_gettimeoffset();
337 287
338 } while (read_seqretry(&tk->lock, seq)); 288 } while (read_seqretry(&xtime_lock, seq));
339 289
340 ts->tv_sec += tomono.tv_sec; 290 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
341 ts->tv_nsec = 0; 291 ts->tv_nsec + tomono.tv_nsec + nsecs);
342 timespec_add_ns(ts, nsec + tomono.tv_nsec);
343} 292}
344EXPORT_SYMBOL_GPL(ktime_get_ts); 293EXPORT_SYMBOL_GPL(ktime_get_ts);
345 294
@@ -356,23 +305,28 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
356 */ 305 */
357void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 306void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
358{ 307{
359 struct timekeeper *tk = &timekeeper;
360 unsigned long seq; 308 unsigned long seq;
361 s64 nsecs_raw, nsecs_real; 309 s64 nsecs_raw, nsecs_real;
362 310
363 WARN_ON_ONCE(timekeeping_suspended); 311 WARN_ON_ONCE(timekeeping_suspended);
364 312
365 do { 313 do {
366 seq = read_seqbegin(&tk->lock); 314 u32 arch_offset;
367 315
368 *ts_raw = tk->raw_time; 316 seq = read_seqbegin(&xtime_lock);
369 ts_real->tv_sec = tk->xtime_sec;
370 ts_real->tv_nsec = 0;
371 317
372 nsecs_raw = timekeeping_get_ns_raw(tk); 318 *ts_raw = raw_time;
373 nsecs_real = timekeeping_get_ns(tk); 319 *ts_real = xtime;
374 320
375 } while (read_seqretry(&tk->lock, seq)); 321 nsecs_raw = timekeeping_get_ns_raw();
322 nsecs_real = timekeeping_get_ns();
323
324 /* If arch requires, add in gettimeoffset() */
325 arch_offset = arch_gettimeoffset();
326 nsecs_raw += arch_offset;
327 nsecs_real += arch_offset;
328
329 } while (read_seqretry(&xtime_lock, seq));
376 330
377 timespec_add_ns(ts_raw, nsecs_raw); 331 timespec_add_ns(ts_raw, nsecs_raw);
378 timespec_add_ns(ts_real, nsecs_real); 332 timespec_add_ns(ts_real, nsecs_real);
@@ -395,8 +349,8 @@ void do_gettimeofday(struct timeval *tv)
395 tv->tv_sec = now.tv_sec; 349 tv->tv_sec = now.tv_sec;
396 tv->tv_usec = now.tv_nsec/1000; 350 tv->tv_usec = now.tv_nsec/1000;
397} 351}
398EXPORT_SYMBOL(do_gettimeofday);
399 352
353EXPORT_SYMBOL(do_gettimeofday);
400/** 354/**
401 * do_settimeofday - Sets the time of day 355 * do_settimeofday - Sets the time of day
402 * @tv: pointer to the timespec variable containing the new time 356 * @tv: pointer to the timespec variable containing the new time
@@ -405,36 +359,39 @@ EXPORT_SYMBOL(do_gettimeofday);
405 */ 359 */
406int do_settimeofday(const struct timespec *tv) 360int do_settimeofday(const struct timespec *tv)
407{ 361{
408 struct timekeeper *tk = &timekeeper; 362 struct timespec ts_delta;
409 struct timespec ts_delta, xt;
410 unsigned long flags; 363 unsigned long flags;
411 364
412 if (!timespec_valid_strict(tv)) 365 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
413 return -EINVAL; 366 return -EINVAL;
414 367
415 write_seqlock_irqsave(&tk->lock, flags); 368 write_seqlock_irqsave(&xtime_lock, flags);
416 369
417 timekeeping_forward_now(tk); 370 timekeeping_forward_now();
418 371
419 xt = tk_xtime(tk); 372 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
420 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 373 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
421 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 374 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
422 375
423 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); 376 xtime = *tv;
424 377
425 tk_set_xtime(tk, tv); 378 timekeeper.ntp_error = 0;
379 ntp_clear();
426 380
427 timekeeping_update(tk, true); 381 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
382 timekeeper.mult);
428 383
429 write_sequnlock_irqrestore(&tk->lock, flags); 384 write_sequnlock_irqrestore(&xtime_lock, flags);
430 385
431 /* signal hrtimers about time change */ 386 /* signal hrtimers about time change */
432 clock_was_set(); 387 clock_was_set();
433 388
434 return 0; 389 return 0;
435} 390}
391
436EXPORT_SYMBOL(do_settimeofday); 392EXPORT_SYMBOL(do_settimeofday);
437 393
394
438/** 395/**
439 * timekeeping_inject_offset - Adds or subtracts from the current time. 396 * timekeeping_inject_offset - Adds or subtracts from the current time.
440 * @tv: pointer to the timespec variable containing the offset 397 * @tv: pointer to the timespec variable containing the offset
@@ -443,37 +400,30 @@ EXPORT_SYMBOL(do_settimeofday);
443 */ 400 */
444int timekeeping_inject_offset(struct timespec *ts) 401int timekeeping_inject_offset(struct timespec *ts)
445{ 402{
446 struct timekeeper *tk = &timekeeper;
447 unsigned long flags; 403 unsigned long flags;
448 struct timespec tmp;
449 int ret = 0;
450 404
451 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 405 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
452 return -EINVAL; 406 return -EINVAL;
453 407
454 write_seqlock_irqsave(&tk->lock, flags); 408 write_seqlock_irqsave(&xtime_lock, flags);
455 409
456 timekeeping_forward_now(tk); 410 timekeeping_forward_now();
457 411
458 /* Make sure the proposed value is valid */ 412 xtime = timespec_add(xtime, *ts);
459 tmp = timespec_add(tk_xtime(tk), *ts); 413 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
460 if (!timespec_valid_strict(&tmp)) {
461 ret = -EINVAL;
462 goto error;
463 }
464 414
465 tk_xtime_add(tk, ts); 415 timekeeper.ntp_error = 0;
466 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 416 ntp_clear();
467 417
468error: /* even if we error out, we forwarded the time, so call update */ 418 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
469 timekeeping_update(tk, true); 419 timekeeper.mult);
470 420
471 write_sequnlock_irqrestore(&tk->lock, flags); 421 write_sequnlock_irqrestore(&xtime_lock, flags);
472 422
473 /* signal hrtimers about time change */ 423 /* signal hrtimers about time change */
474 clock_was_set(); 424 clock_was_set();
475 425
476 return ret; 426 return 0;
477} 427}
478EXPORT_SYMBOL(timekeeping_inject_offset); 428EXPORT_SYMBOL(timekeeping_inject_offset);
479 429
@@ -484,25 +434,17 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
484 */ 434 */
485static int change_clocksource(void *data) 435static int change_clocksource(void *data)
486{ 436{
487 struct timekeeper *tk = &timekeeper;
488 struct clocksource *new, *old; 437 struct clocksource *new, *old;
489 unsigned long flags;
490 438
491 new = (struct clocksource *) data; 439 new = (struct clocksource *) data;
492 440
493 write_seqlock_irqsave(&tk->lock, flags); 441 timekeeping_forward_now();
494
495 timekeeping_forward_now(tk);
496 if (!new->enable || new->enable(new) == 0) { 442 if (!new->enable || new->enable(new) == 0) {
497 old = tk->clock; 443 old = timekeeper.clock;
498 tk_setup_internals(tk, new); 444 timekeeper_setup_internals(new);
499 if (old->disable) 445 if (old->disable)
500 old->disable(old); 446 old->disable(old);
501 } 447 }
502 timekeeping_update(tk, true);
503
504 write_sequnlock_irqrestore(&tk->lock, flags);
505
506 return 0; 448 return 0;
507} 449}
508 450
@@ -515,9 +457,7 @@ static int change_clocksource(void *data)
515 */ 457 */
516void timekeeping_notify(struct clocksource *clock) 458void timekeeping_notify(struct clocksource *clock)
517{ 459{
518 struct timekeeper *tk = &timekeeper; 460 if (timekeeper.clock == clock)
519
520 if (tk->clock == clock)
521 return; 461 return;
522 stop_machine(change_clocksource, clock, NULL); 462 stop_machine(change_clocksource, clock, NULL);
523 tick_clock_notify(); 463 tick_clock_notify();
@@ -546,57 +486,48 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
546 */ 486 */
547void getrawmonotonic(struct timespec *ts) 487void getrawmonotonic(struct timespec *ts)
548{ 488{
549 struct timekeeper *tk = &timekeeper;
550 unsigned long seq; 489 unsigned long seq;
551 s64 nsecs; 490 s64 nsecs;
552 491
553 do { 492 do {
554 seq = read_seqbegin(&tk->lock); 493 seq = read_seqbegin(&xtime_lock);
555 nsecs = timekeeping_get_ns_raw(tk); 494 nsecs = timekeeping_get_ns_raw();
556 *ts = tk->raw_time; 495 *ts = raw_time;
557 496
558 } while (read_seqretry(&tk->lock, seq)); 497 } while (read_seqretry(&xtime_lock, seq));
559 498
560 timespec_add_ns(ts, nsecs); 499 timespec_add_ns(ts, nsecs);
561} 500}
562EXPORT_SYMBOL(getrawmonotonic); 501EXPORT_SYMBOL(getrawmonotonic);
563 502
503
564/** 504/**
565 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 505 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
566 */ 506 */
567int timekeeping_valid_for_hres(void) 507int timekeeping_valid_for_hres(void)
568{ 508{
569 struct timekeeper *tk = &timekeeper;
570 unsigned long seq; 509 unsigned long seq;
571 int ret; 510 int ret;
572 511
573 do { 512 do {
574 seq = read_seqbegin(&tk->lock); 513 seq = read_seqbegin(&xtime_lock);
575 514
576 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 515 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
577 516
578 } while (read_seqretry(&tk->lock, seq)); 517 } while (read_seqretry(&xtime_lock, seq));
579 518
580 return ret; 519 return ret;
581} 520}
582 521
583/** 522/**
584 * timekeeping_max_deferment - Returns max time the clocksource can be deferred 523 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
524 *
525 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
526 * ensure that the clocksource does not change!
585 */ 527 */
586u64 timekeeping_max_deferment(void) 528u64 timekeeping_max_deferment(void)
587{ 529{
588 struct timekeeper *tk = &timekeeper; 530 return timekeeper.clock->max_idle_ns;
589 unsigned long seq;
590 u64 ret;
591
592 do {
593 seq = read_seqbegin(&tk->lock);
594
595 ret = tk->clock->max_idle_ns;
596
597 } while (read_seqretry(&tk->lock, seq));
598
599 return ret;
600} 531}
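As the comment notes for the right-hand version, callers must sample this value under the xtime_lock read seqlock, which is exactly what tick_nohz_stop_sched_tick() does earlier in this patch. Condensed into a standalone sketch (demo_read_max_deferment is an illustrative name):

#include <linux/seqlock.h>
#include <linux/time.h>

/* Sketch of a reader honouring the locking comment above. */
static u64 demo_read_max_deferment(void)
{
	unsigned long seq;
	u64 max_defer_ns;

	do {
		seq = read_seqbegin(&xtime_lock);
		max_defer_ns = timekeeping_max_deferment();
	} while (read_seqretry(&xtime_lock, seq));

	return max_defer_ns;
}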
601 532
602/** 533/**
@@ -634,51 +565,35 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts)
634 */ 565 */
635void __init timekeeping_init(void) 566void __init timekeeping_init(void)
636{ 567{
637 struct timekeeper *tk = &timekeeper;
638 struct clocksource *clock; 568 struct clocksource *clock;
639 unsigned long flags; 569 unsigned long flags;
640 struct timespec now, boot, tmp; 570 struct timespec now, boot;
641 571
642 read_persistent_clock(&now); 572 read_persistent_clock(&now);
643 if (!timespec_valid_strict(&now)) {
644 pr_warn("WARNING: Persistent clock returned invalid value!\n"
645 " Check your CMOS/BIOS settings.\n");
646 now.tv_sec = 0;
647 now.tv_nsec = 0;
648 }
649
650 read_boot_clock(&boot); 573 read_boot_clock(&boot);
651 if (!timespec_valid_strict(&boot)) {
652 pr_warn("WARNING: Boot clock returned invalid value!\n"
653 " Check your CMOS/BIOS settings.\n");
654 boot.tv_sec = 0;
655 boot.tv_nsec = 0;
656 }
657 574
658 seqlock_init(&tk->lock); 575 write_seqlock_irqsave(&xtime_lock, flags);
659 576
660 ntp_init(); 577 ntp_init();
661 578
662 write_seqlock_irqsave(&tk->lock, flags);
663 clock = clocksource_default_clock(); 579 clock = clocksource_default_clock();
664 if (clock->enable) 580 if (clock->enable)
665 clock->enable(clock); 581 clock->enable(clock);
666 tk_setup_internals(tk, clock); 582 timekeeper_setup_internals(clock);
667 583
668 tk_set_xtime(tk, &now); 584 xtime.tv_sec = now.tv_sec;
669 tk->raw_time.tv_sec = 0; 585 xtime.tv_nsec = now.tv_nsec;
670 tk->raw_time.tv_nsec = 0; 586 raw_time.tv_sec = 0;
671 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 587 raw_time.tv_nsec = 0;
672 boot = tk_xtime(tk); 588 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
673 589 boot.tv_sec = xtime.tv_sec;
674 set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); 590 boot.tv_nsec = xtime.tv_nsec;
675 tk_set_wall_to_mono(tk, tmp); 591 }
676 592 set_normalized_timespec(&wall_to_monotonic,
677 tmp.tv_sec = 0; 593 -boot.tv_sec, -boot.tv_nsec);
678 tmp.tv_nsec = 0; 594 total_sleep_time.tv_sec = 0;
679 tk_set_sleep_time(tk, tmp); 595 total_sleep_time.tv_nsec = 0;
680 596 write_sequnlock_irqrestore(&xtime_lock, flags);
681 write_sequnlock_irqrestore(&tk->lock, flags);
682} 597}
683 598
684/* time in seconds when suspend began */ 599/* time in seconds when suspend began */
@@ -691,19 +606,20 @@ static struct timespec timekeeping_suspend_time;
691 * Takes a timespec offset measuring a suspend interval and properly 606 * Takes a timespec offset measuring a suspend interval and properly
692 * adds the sleep offset to the timekeeping variables. 607 * adds the sleep offset to the timekeeping variables.
693 */ 608 */
694static void __timekeeping_inject_sleeptime(struct timekeeper *tk, 609static void __timekeeping_inject_sleeptime(struct timespec *delta)
695 struct timespec *delta)
696{ 610{
697 if (!timespec_valid_strict(delta)) { 611 if (!timespec_valid(delta)) {
698 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 612 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
699 "sleep delta value!\n"); 613 "sleep delta value!\n");
700 return; 614 return;
701 } 615 }
702 tk_xtime_add(tk, delta); 616
703 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); 617 xtime = timespec_add(xtime, *delta);
704 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); 618 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
619 total_sleep_time = timespec_add(total_sleep_time, *delta);
705} 620}
706 621
622
707/** 623/**
708 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values 624 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
709 * @delta: pointer to a timespec delta value 625 * @delta: pointer to a timespec delta value
@@ -716,7 +632,6 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
716 */ 632 */
717void timekeeping_inject_sleeptime(struct timespec *delta) 633void timekeeping_inject_sleeptime(struct timespec *delta)
718{ 634{
719 struct timekeeper *tk = &timekeeper;
720 unsigned long flags; 635 unsigned long flags;
721 struct timespec ts; 636 struct timespec ts;
722 637
@@ -725,20 +640,23 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
725 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 640 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
726 return; 641 return;
727 642
728 write_seqlock_irqsave(&tk->lock, flags); 643 write_seqlock_irqsave(&xtime_lock, flags);
729 644 timekeeping_forward_now();
730 timekeeping_forward_now(tk);
731 645
732 __timekeeping_inject_sleeptime(tk, delta); 646 __timekeeping_inject_sleeptime(delta);
733 647
734 timekeeping_update(tk, true); 648 timekeeper.ntp_error = 0;
649 ntp_clear();
650 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
651 timekeeper.mult);
735 652
736 write_sequnlock_irqrestore(&tk->lock, flags); 653 write_sequnlock_irqrestore(&xtime_lock, flags);
737 654
738 /* signal hrtimers about time change */ 655 /* signal hrtimers about time change */
739 clock_was_set(); 656 clock_was_set();
740} 657}
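Roughly what the usual caller, the RTC resume path, does with this function: compute how long the system was suspended and credit it to timekeeping. The sketch below is a simplification; demo_rtc_resume and the variable names are illustrative.

#include <linux/time.h>

static void demo_rtc_resume(struct timespec old_rtc, struct timespec new_rtc)
{
	struct timespec sleep_time = timespec_sub(new_rtc, old_rtc);

	/* Only credit a positive, sane sleep interval. */
	if (sleep_time.tv_sec > 0 ||
	    (sleep_time.tv_sec == 0 && sleep_time.tv_nsec > 0))
		timekeeping_inject_sleeptime(&sleep_time);
}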
741 658
659
742/** 660/**
743 * timekeeping_resume - Resumes the generic timekeeping subsystem. 661 * timekeeping_resume - Resumes the generic timekeeping subsystem.
744 * 662 *
@@ -748,27 +666,24 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
748 */ 666 */
749static void timekeeping_resume(void) 667static void timekeeping_resume(void)
750{ 668{
751 struct timekeeper *tk = &timekeeper;
752 unsigned long flags; 669 unsigned long flags;
753 struct timespec ts; 670 struct timespec ts;
754 671
755 read_persistent_clock(&ts); 672 read_persistent_clock(&ts);
756 673
757 clockevents_resume();
758 clocksource_resume(); 674 clocksource_resume();
759 675
760 write_seqlock_irqsave(&tk->lock, flags); 676 write_seqlock_irqsave(&xtime_lock, flags);
761 677
762 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 678 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
763 ts = timespec_sub(ts, timekeeping_suspend_time); 679 ts = timespec_sub(ts, timekeeping_suspend_time);
764 __timekeeping_inject_sleeptime(tk, &ts); 680 __timekeeping_inject_sleeptime(&ts);
765 } 681 }
766 /* re-base the last cycle value */ 682 /* re-base the last cycle value */
767 tk->clock->cycle_last = tk->clock->read(tk->clock); 683 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
768 tk->ntp_error = 0; 684 timekeeper.ntp_error = 0;
769 timekeeping_suspended = 0; 685 timekeeping_suspended = 0;
770 timekeeping_update(tk, false); 686 write_sequnlock_irqrestore(&xtime_lock, flags);
771 write_sequnlock_irqrestore(&tk->lock, flags);
772 687
773 touch_softlockup_watchdog(); 688 touch_softlockup_watchdog();
774 689
@@ -780,15 +695,14 @@ static void timekeeping_resume(void)
780 695
781static int timekeeping_suspend(void) 696static int timekeeping_suspend(void)
782{ 697{
783 struct timekeeper *tk = &timekeeper;
784 unsigned long flags; 698 unsigned long flags;
785 struct timespec delta, delta_delta; 699 struct timespec delta, delta_delta;
786 static struct timespec old_delta; 700 static struct timespec old_delta;
787 701
788 read_persistent_clock(&timekeeping_suspend_time); 702 read_persistent_clock(&timekeeping_suspend_time);
789 703
790 write_seqlock_irqsave(&tk->lock, flags); 704 write_seqlock_irqsave(&xtime_lock, flags);
791 timekeeping_forward_now(tk); 705 timekeeping_forward_now();
792 timekeeping_suspended = 1; 706 timekeeping_suspended = 1;
793 707
794 /* 708 /*
@@ -797,7 +711,7 @@ static int timekeeping_suspend(void)
797 * try to compensate so the difference in system time 711 * try to compensate so the difference in system time
798 * and persistent_clock time stays close to constant. 712 * and persistent_clock time stays close to constant.
799 */ 713 */
800 delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); 714 delta = timespec_sub(xtime, timekeeping_suspend_time);
801 delta_delta = timespec_sub(delta, old_delta); 715 delta_delta = timespec_sub(delta, old_delta);
802 if (abs(delta_delta.tv_sec) >= 2) { 716 if (abs(delta_delta.tv_sec) >= 2) {
803 /* 717 /*
@@ -810,11 +724,10 @@ static int timekeeping_suspend(void)
810 timekeeping_suspend_time = 724 timekeeping_suspend_time =
811 timespec_add(timekeeping_suspend_time, delta_delta); 725 timespec_add(timekeeping_suspend_time, delta_delta);
812 } 726 }
813 write_sequnlock_irqrestore(&tk->lock, flags); 727 write_sequnlock_irqrestore(&xtime_lock, flags);
814 728
815 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 729 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
816 clocksource_suspend(); 730 clocksource_suspend();
817 clockevents_suspend();
818 731
819 return 0; 732 return 0;
820} 733}
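These two handlers are wired up through the syscore mechanism, as the next hunk header (device_initcall(timekeeping_init_ops)) indicates. A sketch of that registration pattern; the demo_ prefix is only to avoid clashing with the real definitions.

#include <linux/init.h>
#include <linux/syscore_ops.h>

static struct syscore_ops demo_timekeeping_syscore_ops = {
	.resume		= timekeeping_resume,
	.suspend	= timekeeping_suspend,
};

static int __init demo_timekeeping_init_ops(void)
{
	register_syscore_ops(&demo_timekeeping_syscore_ops);
	return 0;
}
device_initcall(demo_timekeeping_init_ops);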
@@ -837,8 +750,7 @@ device_initcall(timekeeping_init_ops);
837 * If the error is already larger, we look ahead even further 750 * If the error is already larger, we look ahead even further
838 * to compensate for late or lost adjustments. 751 * to compensate for late or lost adjustments.
839 */ 752 */
840static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, 753static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
841 s64 error, s64 *interval,
842 s64 *offset) 754 s64 *offset)
843{ 755{
844 s64 tick_error, i; 756 s64 tick_error, i;
@@ -854,7 +766,7 @@ static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
854 * here. This is tuned so that an error of about 1 msec is adjusted 766 * here. This is tuned so that an error of about 1 msec is adjusted
855 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 767 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
856 */ 768 */
857 error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 769 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
858 error2 = abs(error2); 770 error2 = abs(error2);
859 for (look_ahead = 0; error2 > 0; look_ahead++) 771 for (look_ahead = 0; error2 > 0; look_ahead++)
860 error2 >>= 2; 772 error2 >>= 2;
@@ -863,8 +775,8 @@ static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
863 * Now calculate the error in (1 << look_ahead) ticks, but first 775 * Now calculate the error in (1 << look_ahead) ticks, but first
864 * remove the single look ahead already included in the error. 776 * remove the single look ahead already included in the error.
865 */ 777 */
866 tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); 778 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
867 tick_error -= tk->xtime_interval >> 1; 779 tick_error -= timekeeper.xtime_interval >> 1;
868 error = ((error - tick_error) >> look_ahead) + tick_error; 780 error = ((error - tick_error) >> look_ahead) + tick_error;
869 781
870 /* Finally calculate the adjustment shift value. */ 782 /* Finally calculate the adjustment shift value. */
@@ -889,181 +801,43 @@ static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
889 * this is optimized for the most common adjustments of -1,0,1, 801 * this is optimized for the most common adjustments of -1,0,1,
890 * for other values we can do a bit more work. 802 * for other values we can do a bit more work.
891 */ 803 */
892static void timekeeping_adjust(struct timekeeper *tk, s64 offset) 804static void timekeeping_adjust(s64 offset)
893{ 805{
894 s64 error, interval = tk->cycle_interval; 806 s64 error, interval = timekeeper.cycle_interval;
895 int adj; 807 int adj;
896 808
897 /* 809 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
898 * The point of this is to check if the error is greater than half
899 * an interval.
900 *
901 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
902 *
903 * Note we subtract one in the shift, so that error is really error*2.
904 * This "saves" dividing(shifting) interval twice, but keeps the
905 * (error > interval) comparison as still measuring if error is
906 * larger than half an interval.
907 *
908 * Note: It does not "save" on aggravation when reading the code.
909 */
910 error = tk->ntp_error >> (tk->ntp_error_shift - 1);
911 if (error > interval) { 810 if (error > interval) {
912 /*
913 * We now divide error by 4(via shift), which checks if
914 * the error is greater than twice the interval.
915 * If it is greater, we need a bigadjust, if its smaller,
916 * we can adjust by 1.
917 */
918 error >>= 2; 811 error >>= 2;
919 /*
920 * XXX - In update_wall_time, we round up to the next
921 * nanosecond, and store the amount rounded up into
922 * the error. This causes the likely below to be unlikely.
923 *
924 * The proper fix is to avoid rounding up by using
925 * the high precision tk->xtime_nsec instead of
926 * xtime.tv_nsec everywhere. Fixing this will take some
927 * time.
928 */
929 if (likely(error <= interval)) 812 if (likely(error <= interval))
930 adj = 1; 813 adj = 1;
931 else 814 else
932 adj = timekeeping_bigadjust(tk, error, &interval, &offset); 815 adj = timekeeping_bigadjust(error, &interval, &offset);
933 } else { 816 } else if (error < -interval) {
934 if (error < -interval) { 817 error >>= 2;
935 /* See comment above, this is just switched for the negative */ 818 if (likely(error >= -interval)) {
936 error >>= 2; 819 adj = -1;
937 if (likely(error >= -interval)) { 820 interval = -interval;
938 adj = -1; 821 offset = -offset;
939 interval = -interval; 822 } else
940 offset = -offset; 823 adj = timekeeping_bigadjust(error, &interval, &offset);
941 } else { 824 } else
942 adj = timekeeping_bigadjust(tk, error, &interval, &offset); 825 return;
943 }
944 } else {
945 goto out_adjust;
946 }
947 }
948
949 if (unlikely(tk->clock->maxadj &&
950 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
951 printk_once(KERN_WARNING
952 "Adjusting %s more than 11%% (%ld vs %ld)\n",
953 tk->clock->name, (long)tk->mult + adj,
954 (long)tk->clock->mult + tk->clock->maxadj);
955 }
956 /*
957 * So the following can be confusing.
958 *
959 * To keep things simple, lets assume adj == 1 for now.
960 *
961 * When adj != 1, remember that the interval and offset values
962 * have been appropriately scaled so the math is the same.
963 *
964 * The basic idea here is that we're increasing the multiplier
965 * by one, this causes the xtime_interval to be incremented by
966 * one cycle_interval. This is because:
967 * xtime_interval = cycle_interval * mult
968 * So if mult is being incremented by one:
969 * xtime_interval = cycle_interval * (mult + 1)
 970 * It's the same as:
971 * xtime_interval = (cycle_interval * mult) + cycle_interval
972 * Which can be shortened to:
973 * xtime_interval += cycle_interval
974 *
975 * So offset stores the non-accumulated cycles. Thus the current
976 * time (in shifted nanoseconds) is:
977 * now = (offset * adj) + xtime_nsec
978 * Now, even though we're adjusting the clock frequency, we have
979 * to keep time consistent. In other words, we can't jump back
980 * in time, and we also want to avoid jumping forward in time.
981 *
982 * So given the same offset value, we need the time to be the same
983 * both before and after the freq adjustment.
984 * now = (offset * adj_1) + xtime_nsec_1
985 * now = (offset * adj_2) + xtime_nsec_2
986 * So:
987 * (offset * adj_1) + xtime_nsec_1 =
988 * (offset * adj_2) + xtime_nsec_2
989 * And we know:
990 * adj_2 = adj_1 + 1
991 * So:
992 * (offset * adj_1) + xtime_nsec_1 =
993 * (offset * (adj_1+1)) + xtime_nsec_2
994 * (offset * adj_1) + xtime_nsec_1 =
995 * (offset * adj_1) + offset + xtime_nsec_2
996 * Canceling the sides:
997 * xtime_nsec_1 = offset + xtime_nsec_2
998 * Which gives us:
999 * xtime_nsec_2 = xtime_nsec_1 - offset
1000 * Which simplifies to:
1001 * xtime_nsec -= offset
1002 *
1003 * XXX - TODO: Doc ntp_error calculation.
1004 */
1005 tk->mult += adj;
1006 tk->xtime_interval += interval;
1007 tk->xtime_nsec -= offset;
1008 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
1009
1010out_adjust:
1011 /*
1012 * It may be possible that when we entered this function, xtime_nsec
1013 * was very small. Further, if we're slightly speeding the clocksource
1014 * in the code above, its possible the required corrective factor to
1015 * xtime_nsec could cause it to underflow.
1016 *
1017 * Now, since we already accumulated the second, we cannot simply roll
1018 * the accumulated second back, since the NTP subsystem has been
1019 * notified via second_overflow. So instead we push xtime_nsec forward
1020 * by the amount we underflowed, and add that amount into the error.
1021 *
1022 * We'll correct this error next time through this function, when
1023 * xtime_nsec is not as small.
1024 */
1025 if (unlikely((s64)tk->xtime_nsec < 0)) {
1026 s64 neg = -(s64)tk->xtime_nsec;
1027 tk->xtime_nsec = 0;
1028 tk->ntp_error += neg << tk->ntp_error_shift;
1029 }
1030 826
827 WARN_ONCE(timekeeper.clock->maxadj &&
828 (timekeeper.mult + adj > timekeeper.clock->mult +
829 timekeeper.clock->maxadj),
 830 "Adjusting %s more than 11%% (%ld vs %ld)\n",
831 timekeeper.clock->name, (long)timekeeper.mult + adj,
832 (long)timekeeper.clock->mult +
833 timekeeper.clock->maxadj);
834 timekeeper.mult += adj;
835 timekeeper.xtime_interval += interval;
836 timekeeper.xtime_nsec -= offset;
837 timekeeper.ntp_error -= (interval - offset) <<
838 timekeeper.ntp_error_shift;
1031} 839}
1032 840
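The long comment removed from timekeeping_adjust() above derives why a one-step change of mult has to be paired with xtime_nsec -= offset so that "now = offset * mult + xtime_nsec" does not jump. A minimal standalone C sketch of that invariant (made-up numbers, not kernel code; the names only mirror the timekeeper fields for readability):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t offset = 1000;         /* unaccumulated cycles (invented)        */
    uint64_t mult = 5;              /* clocksource multiplier (invented)      */
    uint64_t xtime_nsec = 123456;   /* shifted nanoseconds accumulated so far */

    uint64_t now_before = offset * mult + xtime_nsec;

    /* Speed the clock up by one step: mult += 1 adds one extra "offset"
     * worth of shifted nanoseconds, so xtime_nsec must hand it back. */
    mult += 1;
    xtime_nsec -= offset;

    uint64_t now_after = offset * mult + xtime_nsec;

    printf("now before=%llu after=%llu\n",
           (unsigned long long)now_before, (unsigned long long)now_after);
    return 0;
}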
1033/**
1034 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
1035 *
1036 * Helper function that accumulates the nsecs greater than a second
1037 * from the xtime_nsec field to the xtime_secs field.
1038 * It also calls into the NTP code to handle leapsecond processing.
1039 *
1040 */
1041static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1042{
1043 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
1044
1045 while (tk->xtime_nsec >= nsecps) {
1046 int leap;
1047
1048 tk->xtime_nsec -= nsecps;
1049 tk->xtime_sec++;
1050
1051 /* Figure out if its a leap sec and apply if needed */
1052 leap = second_overflow(tk->xtime_sec);
1053 if (unlikely(leap)) {
1054 struct timespec ts;
1055
1056 tk->xtime_sec += leap;
1057
1058 ts.tv_sec = leap;
1059 ts.tv_nsec = 0;
1060 tk_set_wall_to_mono(tk,
1061 timespec_sub(tk->wall_to_monotonic, ts));
1062
1063 clock_was_set_delayed();
1064 }
1065 }
1066}
1067 841
1068/** 842/**
1069 * logarithmic_accumulation - shifted accumulation of cycles 843 * logarithmic_accumulation - shifted accumulation of cycles
@@ -1074,136 +848,137 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1074 * 848 *
1075 * Returns the unconsumed cycles. 849 * Returns the unconsumed cycles.
1076 */ 850 */
1077static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, 851static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
1078 u32 shift)
1079{ 852{
853 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
1080 u64 raw_nsecs; 854 u64 raw_nsecs;
1081 855
1082 /* If the offset is smaller than a shifted interval, do nothing */ 856 /* If the offset is smaller than a shifted interval, do nothing */
1083 if (offset < tk->cycle_interval<<shift) 857 if (offset < timekeeper.cycle_interval<<shift)
1084 return offset; 858 return offset;
1085 859
1086 /* Accumulate one shifted interval */ 860 /* Accumulate one shifted interval */
1087 offset -= tk->cycle_interval << shift; 861 offset -= timekeeper.cycle_interval << shift;
1088 tk->clock->cycle_last += tk->cycle_interval << shift; 862 timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
1089 863
1090 tk->xtime_nsec += tk->xtime_interval << shift; 864 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
1091 accumulate_nsecs_to_secs(tk); 865 while (timekeeper.xtime_nsec >= nsecps) {
866 timekeeper.xtime_nsec -= nsecps;
867 xtime.tv_sec++;
868 second_overflow();
869 }
1092 870
1093 /* Accumulate raw time */ 871 /* Accumulate raw time */
1094 raw_nsecs = (u64)tk->raw_interval << shift; 872 raw_nsecs = timekeeper.raw_interval << shift;
1095 raw_nsecs += tk->raw_time.tv_nsec; 873 raw_nsecs += raw_time.tv_nsec;
1096 if (raw_nsecs >= NSEC_PER_SEC) { 874 if (raw_nsecs >= NSEC_PER_SEC) {
1097 u64 raw_secs = raw_nsecs; 875 u64 raw_secs = raw_nsecs;
1098 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); 876 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
1099 tk->raw_time.tv_sec += raw_secs; 877 raw_time.tv_sec += raw_secs;
1100 } 878 }
1101 tk->raw_time.tv_nsec = raw_nsecs; 879 raw_time.tv_nsec = raw_nsecs;
1102 880
1103 /* Accumulate error between NTP and clock interval */ 881 /* Accumulate error between NTP and clock interval */
1104 tk->ntp_error += ntp_tick_length() << shift; 882 timekeeper.ntp_error += tick_length << shift;
1105 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << 883 timekeeper.ntp_error -=
1106 (tk->ntp_error_shift + shift); 884 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
885 (timekeeper.ntp_error_shift + shift);
1107 886
1108 return offset; 887 return offset;
1109} 888}
1110 889
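logarithmic_accumulation() and its caller consume a large backlog of cycles in power-of-two multiples of cycle_interval, halving the chunk size as the backlog shrinks. A compressed userspace sketch of that control flow, with invented numbers (not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t cycle_interval = 100;
    uint64_t offset = 13 * cycle_interval + 7;   /* pending cycles           */
    int shift = 3;                               /* start with 2^3 intervals */

    while (offset >= cycle_interval) {
        if (offset >= (cycle_interval << shift)) {
            offset -= cycle_interval << shift;   /* accumulate one big chunk */
            printf("accumulated %llu cycles, %llu left\n",
                   (unsigned long long)(cycle_interval << shift),
                   (unsigned long long)offset);
        } else {
            shift--;                             /* chunk too big, halve it  */
        }
    }
    printf("leftover (less than one interval): %llu\n",
           (unsigned long long)offset);
    return 0;
}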
1111#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
1112static inline void old_vsyscall_fixup(struct timekeeper *tk)
1113{
1114 s64 remainder;
1115
1116 /*
1117 * Store only full nanoseconds into xtime_nsec after rounding
1118 * it up and add the remainder to the error difference.
1119 * XXX - This is necessary to avoid small 1ns inconsistencies caused
1120 * by truncating the remainder in vsyscalls. However, it causes
1121 * additional work to be done in timekeeping_adjust(). Once
1122 * the vsyscall implementations are converted to use xtime_nsec
1123 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
1124 * users are removed, this can be killed.
1125 */
1126 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1127 tk->xtime_nsec -= remainder;
1128 tk->xtime_nsec += 1ULL << tk->shift;
1129 tk->ntp_error += remainder << tk->ntp_error_shift;
1130
1131}
1132#else
1133#define old_vsyscall_fixup(tk)
1134#endif
1135
1136
1137 890
1138/** 891/**
1139 * update_wall_time - Uses the current clocksource to increment the wall time 892 * update_wall_time - Uses the current clocksource to increment the wall time
1140 * 893 *
894 * Called from the timer interrupt, must hold a write on xtime_lock.
1141 */ 895 */
1142static void update_wall_time(void) 896static void update_wall_time(void)
1143{ 897{
1144 struct clocksource *clock; 898 struct clocksource *clock;
1145 struct timekeeper *tk = &timekeeper;
1146 cycle_t offset; 899 cycle_t offset;
1147 int shift = 0, maxshift; 900 int shift = 0, maxshift;
1148 unsigned long flags;
1149
1150 write_seqlock_irqsave(&tk->lock, flags);
1151 901
1152 /* Make sure we're fully resumed: */ 902 /* Make sure we're fully resumed: */
1153 if (unlikely(timekeeping_suspended)) 903 if (unlikely(timekeeping_suspended))
1154 goto out; 904 return;
1155 905
1156 clock = tk->clock; 906 clock = timekeeper.clock;
1157 907
1158#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 908#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1159 offset = tk->cycle_interval; 909 offset = timekeeper.cycle_interval;
1160#else 910#else
1161 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 911 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1162#endif 912#endif
1163 913 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
1164 /* Check if there's really nothing to do */
1165 if (offset < tk->cycle_interval)
1166 goto out;
1167 914
1168 /* 915 /*
1169 * With NO_HZ we may have to accumulate many cycle_intervals 916 * With NO_HZ we may have to accumulate many cycle_intervals
1170 * (think "ticks") worth of time at once. To do this efficiently, 917 * (think "ticks") worth of time at once. To do this efficiently,
1171 * we calculate the largest doubling multiple of cycle_intervals 918 * we calculate the largest doubling multiple of cycle_intervals
1172 * that is smaller than the offset. We then accumulate that 919 * that is smaller than the offset. We then accumulate that
1173 * chunk in one go, and then try to consume the next smaller 920 * chunk in one go, and then try to consume the next smaller
1174 * doubled multiple. 921 * doubled multiple.
1175 */ 922 */
1176 shift = ilog2(offset) - ilog2(tk->cycle_interval); 923 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
1177 shift = max(0, shift); 924 shift = max(0, shift);
1178 /* Bound shift to one less than what overflows tick_length */ 925 /* Bound shift to one less than what overflows tick_length */
1179 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 926 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
1180 shift = min(shift, maxshift); 927 shift = min(shift, maxshift);
1181 while (offset >= tk->cycle_interval) { 928 while (offset >= timekeeper.cycle_interval) {
1182 offset = logarithmic_accumulation(tk, offset, shift); 929 offset = logarithmic_accumulation(offset, shift);
1183 if (offset < tk->cycle_interval<<shift) 930 if (offset < timekeeper.cycle_interval<<shift)
1184 shift--; 931 shift--;
1185 } 932 }
1186 933
1187 /* correct the clock when NTP error is too big */ 934 /* correct the clock when NTP error is too big */
1188 timekeeping_adjust(tk, offset); 935 timekeeping_adjust(offset);
1189 936
1190 /* 937 /*
1191 * XXX This can be killed once everyone converts 938 * Since in the loop above, we accumulate any amount of time
1192 * to the new update_vsyscall. 939 * in xtime_nsec over a second into xtime.tv_sec, its possible for
940 * xtime_nsec to be fairly small after the loop. Further, if we're
941 * slightly speeding the clocksource up in timekeeping_adjust(),
942 * its possible the required corrective factor to xtime_nsec could
943 * cause it to underflow.
944 *
945 * Now, we cannot simply roll the accumulated second back, since
946 * the NTP subsystem has been notified via second_overflow. So
947 * instead we push xtime_nsec forward by the amount we underflowed,
948 * and add that amount into the error.
949 *
950 * We'll correct this error next time through this function, when
951 * xtime_nsec is not as small.
1193 */ 952 */
1194 old_vsyscall_fixup(tk); 953 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
954 s64 neg = -(s64)timekeeper.xtime_nsec;
955 timekeeper.xtime_nsec = 0;
956 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
957 }
958
1195 959
1196 /* 960 /*
1197 * Finally, make sure that after the rounding 961 * Store full nanoseconds into xtime after rounding it up and
1198 * xtime_nsec isn't larger than NSEC_PER_SEC 962 * add the remainder to the error difference.
1199 */ 963 */
1200 accumulate_nsecs_to_secs(tk); 964 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
965 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
966 timekeeper.ntp_error += timekeeper.xtime_nsec <<
967 timekeeper.ntp_error_shift;
1201 968
1202 timekeeping_update(tk, false); 969 /*
1203 970 * Finally, make sure that after the rounding
1204out: 971 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
1205 write_sequnlock_irqrestore(&tk->lock, flags); 972 */
973 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
974 xtime.tv_nsec -= NSEC_PER_SEC;
975 xtime.tv_sec++;
976 second_overflow();
977 }
1206 978
979 /* check to see if there is a new clocksource to use */
980 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
981 timekeeper.mult);
1207} 982}
1208 983
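The maxshift bound in update_wall_time() above caps the accumulation shift so that tick_length << shift cannot overflow. A hedged standalone sketch of the same arithmetic, assuming a 64-bit tick_length and an invented value:

#include <stdio.h>
#include <stdint.h>

static int ilog2_u64(uint64_t v)        /* index of the highest set bit */
{
    int r = -1;
    while (v) {
        v >>= 1;
        r++;
    }
    return r;
}

int main(void)
{
    uint64_t tick_length = 10000000ULL << 32;   /* NTP-scaled tick, made up */
    int maxshift = (64 - (ilog2_u64(tick_length) + 1)) - 1;

    printf("ilog2(tick_length) = %d, maxshift = %d\n",
           ilog2_u64(tick_length), maxshift);
    /* Shifting by maxshift still leaves headroom below bit 63 ... */
    printf("shifted = %llu\n", (unsigned long long)(tick_length << maxshift));
    /* ... while one more shift would already reach bit 63, the sign bit
     * of the signed 64-bit values the error accounting works with. */
    return 0;
}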
1209/** 984/**
@@ -1219,18 +994,16 @@ out:
1219 */ 994 */
1220void getboottime(struct timespec *ts) 995void getboottime(struct timespec *ts)
1221{ 996{
1222 struct timekeeper *tk = &timekeeper;
1223 struct timespec boottime = { 997 struct timespec boottime = {
1224 .tv_sec = tk->wall_to_monotonic.tv_sec + 998 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
1225 tk->total_sleep_time.tv_sec, 999 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
1226 .tv_nsec = tk->wall_to_monotonic.tv_nsec +
1227 tk->total_sleep_time.tv_nsec
1228 }; 1000 };
1229 1001
1230 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 1002 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
1231} 1003}
1232EXPORT_SYMBOL_GPL(getboottime); 1004EXPORT_SYMBOL_GPL(getboottime);
1233 1005
1006
1234/** 1007/**
1235 * get_monotonic_boottime - Returns monotonic time since boot 1008 * get_monotonic_boottime - Returns monotonic time since boot
1236 * @ts: pointer to the timespec to be set 1009 * @ts: pointer to the timespec to be set
@@ -1242,25 +1015,23 @@ EXPORT_SYMBOL_GPL(getboottime);
1242 */ 1015 */
1243void get_monotonic_boottime(struct timespec *ts) 1016void get_monotonic_boottime(struct timespec *ts)
1244{ 1017{
1245 struct timekeeper *tk = &timekeeper;
1246 struct timespec tomono, sleep; 1018 struct timespec tomono, sleep;
1247 s64 nsec;
1248 unsigned int seq; 1019 unsigned int seq;
1020 s64 nsecs;
1249 1021
1250 WARN_ON(timekeeping_suspended); 1022 WARN_ON(timekeeping_suspended);
1251 1023
1252 do { 1024 do {
1253 seq = read_seqbegin(&tk->lock); 1025 seq = read_seqbegin(&xtime_lock);
1254 ts->tv_sec = tk->xtime_sec; 1026 *ts = xtime;
1255 nsec = timekeeping_get_ns(tk); 1027 tomono = wall_to_monotonic;
1256 tomono = tk->wall_to_monotonic; 1028 sleep = total_sleep_time;
1257 sleep = tk->total_sleep_time; 1029 nsecs = timekeeping_get_ns();
1258 1030
1259 } while (read_seqretry(&tk->lock, seq)); 1031 } while (read_seqretry(&xtime_lock, seq));
1260 1032
1261 ts->tv_sec += tomono.tv_sec + sleep.tv_sec; 1033 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
1262 ts->tv_nsec = 0; 1034 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
1263 timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
1264} 1035}
1265EXPORT_SYMBOL_GPL(get_monotonic_boottime); 1036EXPORT_SYMBOL_GPL(get_monotonic_boottime);
1266 1037
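The readers above all follow the same read_seqbegin()/read_seqretry() pattern: sample the sequence count, copy the protected values, and retry if a writer intervened. A single-threaded userspace mock of that control flow (not the kernel seqlock implementation; the _mock suffix marks everything here as illustrative):

#include <stdio.h>

static unsigned seq;                 /* even = no writer active  */
static long xtime_sec_mock = 100;    /* protected data (made up) */

static unsigned read_seqbegin_mock(void) { return seq & ~1u; }
static int read_seqretry_mock(unsigned start) { return seq != start; }

static void writer_mock(void)        /* what a concurrent writer would do */
{
    seq++;                           /* odd: update in progress */
    xtime_sec_mock++;
    seq++;                           /* even again: update done */
}

int main(void)
{
    long snapshot;
    unsigned start;
    int tries = 0;

    do {
        start = read_seqbegin_mock();
        snapshot = xtime_sec_mock;
        if (tries++ == 0)
            writer_mock();           /* simulate a race on the first pass */
    } while (read_seqretry_mock(start));

    printf("consistent snapshot: %ld after %d tries\n", snapshot, tries);
    return 0;
}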
@@ -1287,38 +1058,31 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
1287 */ 1058 */
1288void monotonic_to_bootbased(struct timespec *ts) 1059void monotonic_to_bootbased(struct timespec *ts)
1289{ 1060{
1290 struct timekeeper *tk = &timekeeper; 1061 *ts = timespec_add(*ts, total_sleep_time);
1291
1292 *ts = timespec_add(*ts, tk->total_sleep_time);
1293} 1062}
1294EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1063EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1295 1064
1296unsigned long get_seconds(void) 1065unsigned long get_seconds(void)
1297{ 1066{
1298 struct timekeeper *tk = &timekeeper; 1067 return xtime.tv_sec;
1299
1300 return tk->xtime_sec;
1301} 1068}
1302EXPORT_SYMBOL(get_seconds); 1069EXPORT_SYMBOL(get_seconds);
1303 1070
1304struct timespec __current_kernel_time(void) 1071struct timespec __current_kernel_time(void)
1305{ 1072{
1306 struct timekeeper *tk = &timekeeper; 1073 return xtime;
1307
1308 return tk_xtime(tk);
1309} 1074}
1310 1075
1311struct timespec current_kernel_time(void) 1076struct timespec current_kernel_time(void)
1312{ 1077{
1313 struct timekeeper *tk = &timekeeper;
1314 struct timespec now; 1078 struct timespec now;
1315 unsigned long seq; 1079 unsigned long seq;
1316 1080
1317 do { 1081 do {
1318 seq = read_seqbegin(&tk->lock); 1082 seq = read_seqbegin(&xtime_lock);
1319 1083
1320 now = tk_xtime(tk); 1084 now = xtime;
1321 } while (read_seqretry(&tk->lock, seq)); 1085 } while (read_seqretry(&xtime_lock, seq));
1322 1086
1323 return now; 1087 return now;
1324} 1088}
@@ -1326,16 +1090,15 @@ EXPORT_SYMBOL(current_kernel_time);
1326 1090
1327struct timespec get_monotonic_coarse(void) 1091struct timespec get_monotonic_coarse(void)
1328{ 1092{
1329 struct timekeeper *tk = &timekeeper;
1330 struct timespec now, mono; 1093 struct timespec now, mono;
1331 unsigned long seq; 1094 unsigned long seq;
1332 1095
1333 do { 1096 do {
1334 seq = read_seqbegin(&tk->lock); 1097 seq = read_seqbegin(&xtime_lock);
1335 1098
1336 now = tk_xtime(tk); 1099 now = xtime;
1337 mono = tk->wall_to_monotonic; 1100 mono = wall_to_monotonic;
1338 } while (read_seqretry(&tk->lock, seq)); 1101 } while (read_seqretry(&xtime_lock, seq));
1339 1102
1340 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1103 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1341 now.tv_nsec + mono.tv_nsec); 1104 now.tv_nsec + mono.tv_nsec);
@@ -1343,7 +1106,9 @@ struct timespec get_monotonic_coarse(void)
1343} 1106}
1344 1107
1345/* 1108/*
1346 * Must hold jiffies_lock 1109 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1110 * without sampling the sequence number in xtime_lock.
1111 * jiffies is defined in the linker script...
1347 */ 1112 */
1348void do_timer(unsigned long ticks) 1113void do_timer(unsigned long ticks)
1349{ 1114{
@@ -1362,66 +1127,30 @@ void do_timer(unsigned long ticks)
1362void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, 1127void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1363 struct timespec *wtom, struct timespec *sleep) 1128 struct timespec *wtom, struct timespec *sleep)
1364{ 1129{
1365 struct timekeeper *tk = &timekeeper;
1366 unsigned long seq; 1130 unsigned long seq;
1367 1131
1368 do { 1132 do {
1369 seq = read_seqbegin(&tk->lock); 1133 seq = read_seqbegin(&xtime_lock);
1370 *xtim = tk_xtime(tk); 1134 *xtim = xtime;
1371 *wtom = tk->wall_to_monotonic; 1135 *wtom = wall_to_monotonic;
1372 *sleep = tk->total_sleep_time; 1136 *sleep = total_sleep_time;
1373 } while (read_seqretry(&tk->lock, seq)); 1137 } while (read_seqretry(&xtime_lock, seq));
1374}
1375
1376#ifdef CONFIG_HIGH_RES_TIMERS
1377/**
1378 * ktime_get_update_offsets - hrtimer helper
1379 * @offs_real: pointer to storage for monotonic -> realtime offset
1380 * @offs_boot: pointer to storage for monotonic -> boottime offset
1381 *
1382 * Returns current monotonic time and updates the offsets
1383 * Called from hrtimer_interrupt() or retrigger_next_event()
1384 */
1385ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1386{
1387 struct timekeeper *tk = &timekeeper;
1388 ktime_t now;
1389 unsigned int seq;
1390 u64 secs, nsecs;
1391
1392 do {
1393 seq = read_seqbegin(&tk->lock);
1394
1395 secs = tk->xtime_sec;
1396 nsecs = timekeeping_get_ns(tk);
1397
1398 *offs_real = tk->offs_real;
1399 *offs_boot = tk->offs_boot;
1400 } while (read_seqretry(&tk->lock, seq));
1401
1402 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1403 now = ktime_sub(now, *offs_real);
1404 return now;
1405} 1138}
1406#endif
1407 1139
1408/** 1140/**
1409 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format 1141 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1410 */ 1142 */
1411ktime_t ktime_get_monotonic_offset(void) 1143ktime_t ktime_get_monotonic_offset(void)
1412{ 1144{
1413 struct timekeeper *tk = &timekeeper;
1414 unsigned long seq; 1145 unsigned long seq;
1415 struct timespec wtom; 1146 struct timespec wtom;
1416 1147
1417 do { 1148 do {
1418 seq = read_seqbegin(&tk->lock); 1149 seq = read_seqbegin(&xtime_lock);
1419 wtom = tk->wall_to_monotonic; 1150 wtom = wall_to_monotonic;
1420 } while (read_seqretry(&tk->lock, seq)); 1151 } while (read_seqretry(&xtime_lock, seq));
1421
1422 return timespec_to_ktime(wtom); 1152 return timespec_to_ktime(wtom);
1423} 1153}
1424EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1425 1154
1426/** 1155/**
1427 * xtime_update() - advances the timekeeping infrastructure 1156 * xtime_update() - advances the timekeeping infrastructure
@@ -1431,7 +1160,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1431 */ 1160 */
1432void xtime_update(unsigned long ticks) 1161void xtime_update(unsigned long ticks)
1433{ 1162{
1434 write_seqlock(&jiffies_lock); 1163 write_seqlock(&xtime_lock);
1435 do_timer(ticks); 1164 do_timer(ticks);
1436 write_sequnlock(&jiffies_lock); 1165 write_sequnlock(&xtime_lock);
1437} 1166}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index af5a7e9f164..3258455549f 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
167 { 167 {
168 struct tick_sched *ts = tick_get_tick_sched(cpu); 168 struct tick_sched *ts = tick_get_tick_sched(cpu);
169 P(nohz_mode); 169 P(nohz_mode);
170 P_ns(last_tick); 170 P_ns(idle_tick);
171 P(tick_stopped); 171 P(tick_stopped);
172 P(idle_jiffies); 172 P(idle_jiffies);
173 P(idle_calls); 173 P(idle_calls);
@@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v)
259 u64 now = ktime_to_ns(ktime_get()); 259 u64 now = ktime_to_ns(ktime_get());
260 int cpu; 260 int cpu;
261 261
262 SEQ_printf(m, "Timer List Version: v0.7\n"); 262 SEQ_printf(m, "Timer List Version: v0.6\n");
263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
265 265
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 0b537f27b55..a5d0a3a85dd 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -81,7 +81,7 @@ struct entry {
81/* 81/*
82 * Spinlock protecting the tables - not taken during lookup: 82 * Spinlock protecting the tables - not taken during lookup:
83 */ 83 */
84static DEFINE_RAW_SPINLOCK(table_lock); 84static DEFINE_SPINLOCK(table_lock);
85 85
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
@@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
188 prev = NULL; 188 prev = NULL;
189 curr = *head; 189 curr = *head;
190 190
191 raw_spin_lock(&table_lock); 191 spin_lock(&table_lock);
192 /* 192 /*
193 * Make sure we have not raced with another CPU: 193 * Make sure we have not raced with another CPU:
194 */ 194 */
@@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
215 *head = curr; 215 *head = curr;
216 } 216 }
217 out_unlock: 217 out_unlock:
218 raw_spin_unlock(&table_lock); 218 spin_unlock(&table_lock);
219 219
220 return curr; 220 return curr;
221} 221}
diff --git a/kernel/timer.c b/kernel/timer.c
index 367d0085848..8cff36119e4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel_stat.h> 22#include <linux/kernel_stat.h>
23#include <linux/export.h> 23#include <linux/module.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/init.h> 26#include <linux/init.h>
@@ -63,7 +63,6 @@ EXPORT_SYMBOL(jiffies_64);
63#define TVR_SIZE (1 << TVR_BITS) 63#define TVR_SIZE (1 << TVR_BITS)
64#define TVN_MASK (TVN_SIZE - 1) 64#define TVN_MASK (TVN_SIZE - 1)
65#define TVR_MASK (TVR_SIZE - 1) 65#define TVR_MASK (TVR_SIZE - 1)
66#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
67 66
68struct tvec { 67struct tvec {
69 struct list_head vec[TVN_SIZE]; 68 struct list_head vec[TVN_SIZE];
@@ -78,7 +77,6 @@ struct tvec_base {
78 struct timer_list *running_timer; 77 struct timer_list *running_timer;
79 unsigned long timer_jiffies; 78 unsigned long timer_jiffies;
80 unsigned long next_timer; 79 unsigned long next_timer;
81 unsigned long active_timers;
82 struct tvec_root tv1; 80 struct tvec_root tv1;
83 struct tvec tv2; 81 struct tvec tv2;
84 struct tvec tv3; 82 struct tvec tv3;
@@ -93,25 +91,24 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
93/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
94static inline unsigned int tbase_get_deferrable(struct tvec_base *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
95{ 93{
96 return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); 94 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
97} 95}
98 96
99static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) 97static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
100{ 98{
101 return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); 99 return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
102} 100}
103 101
104static inline struct tvec_base *tbase_get_base(struct tvec_base *base) 102static inline void timer_set_deferrable(struct timer_list *timer)
105{ 103{
106 return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); 104 timer->base = TBASE_MAKE_DEFERRED(timer->base);
107} 105}
108 106
109static inline void 107static inline void
110timer_set_base(struct timer_list *timer, struct tvec_base *new_base) 108timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
111{ 109{
112 unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; 110 timer->base = (struct tvec_base *)((unsigned long)(new_base) |
113 111 tbase_get_deferrable(timer->base));
114 timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
115} 112}
116 113
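tbase_get_deferrable(), tbase_get_base() and timer_set_base() above rely on the low bit of an aligned tvec_base pointer being free, so the deferrable flag can ride along in the pointer itself. A userspace sketch of that tagging trick (illustrative struct and flag value, not the kernel definitions):

#include <stdio.h>
#include <stdint.h>

#define DEFERRABLE_FLAG 0x1UL       /* plays the role of TBASE_DEFERRABLE_FLAG */

struct base { int dummy; } __attribute__((aligned(4)));

int main(void)
{
    static struct base real_base;

    /* Tag the pointer with the flag ... */
    struct base *tagged =
        (struct base *)((uintptr_t)&real_base | DEFERRABLE_FLAG);

    /* ... and recover both pieces later. */
    unsigned long flag = (unsigned long)((uintptr_t)tagged & DEFERRABLE_FLAG);
    struct base *clean = (struct base *)((uintptr_t)tagged & ~DEFERRABLE_FLAG);

    printf("deferrable=%lu, base recovered: %s\n",
           flag, clean == &real_base ? "yes" : "no");
    return 0;
}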
117static unsigned long round_jiffies_common(unsigned long j, int cpu, 114static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -333,8 +330,7 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
333} 330}
334EXPORT_SYMBOL_GPL(set_timer_slack); 331EXPORT_SYMBOL_GPL(set_timer_slack);
335 332
336static void 333static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
337__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
338{ 334{
339 unsigned long expires = timer->expires; 335 unsigned long expires = timer->expires;
340 unsigned long idx = expires - base->timer_jiffies; 336 unsigned long idx = expires - base->timer_jiffies;
@@ -360,12 +356,11 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
360 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); 356 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
361 } else { 357 } else {
362 int i; 358 int i;
363 /* If the timeout is larger than MAX_TVAL (on 64-bit 359 /* If the timeout is larger than 0xffffffff on 64-bit
364 * architectures or with CONFIG_BASE_SMALL=1) then we 360 * architectures then we use the maximum timeout:
365 * use the maximum timeout.
366 */ 361 */
367 if (idx > MAX_TVAL) { 362 if (idx > 0xffffffffUL) {
368 idx = MAX_TVAL; 363 idx = 0xffffffffUL;
369 expires = idx + base->timer_jiffies; 364 expires = idx + base->timer_jiffies;
370 } 365 }
371 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; 366 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
@@ -377,19 +372,6 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
377 list_add_tail(&timer->entry, vec); 372 list_add_tail(&timer->entry, vec);
378} 373}
379 374
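internal_add_timer() above buckets a timeout by how far in the future it expires: the low TVR_BITS of the delta select a tv1 slot, and each further TVN_BITS selects a slot in the next cascade level. A standalone sketch with the usual non-CONFIG_BASE_SMALL constants hard-coded and an invented expiry:

#include <stdio.h>

#define TVN_BITS 6
#define TVR_BITS 8
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)

int main(void)
{
    unsigned long timer_jiffies = 1000000;         /* wheel's current time */
    unsigned long expires = timer_jiffies + 70000; /* made-up expiry       */
    unsigned long idx = expires - timer_jiffies;

    if (idx < TVR_SIZE)
        printf("tv1 slot %lu\n", expires & TVR_MASK);
    else if (idx < 1UL << (TVR_BITS + TVN_BITS))
        printf("tv2 slot %lu\n", (expires >> TVR_BITS) & TVN_MASK);
    else if (idx < 1UL << (TVR_BITS + 2 * TVN_BITS))
        printf("tv3 slot %lu\n", (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK);
    else
        printf("tv4 or beyond\n");
    return 0;
}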
380static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
381{
382 __internal_add_timer(base, timer);
383 /*
384 * Update base->active_timers and base->next_timer
385 */
386 if (!tbase_get_deferrable(timer->base)) {
387 if (time_before(timer->expires, base->next_timer))
388 base->next_timer = timer->expires;
389 base->active_timers++;
390 }
391}
392
393#ifdef CONFIG_TIMER_STATS 375#ifdef CONFIG_TIMER_STATS
394void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) 376void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
395{ 377{
@@ -445,12 +427,6 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
445 } 427 }
446} 428}
447 429
448/* Stub timer callback for improperly used timers. */
449static void stub_timer(unsigned long data)
450{
451 WARN_ON(1);
452}
453
454/* 430/*
455 * fixup_activate is called when: 431 * fixup_activate is called when:
456 * - an active object is activated 432 * - an active object is activated
@@ -474,8 +450,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
474 debug_object_activate(timer, &timer_debug_descr); 450 debug_object_activate(timer, &timer_debug_descr);
475 return 0; 451 return 0;
476 } else { 452 } else {
477 setup_timer(timer, stub_timer, 0); 453 WARN_ON_ONCE(1);
478 return 1;
479 } 454 }
480 return 0; 455 return 0;
481 456
@@ -505,40 +480,12 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
505 } 480 }
506} 481}
507 482
508/*
509 * fixup_assert_init is called when:
510 * - an untracked/uninit-ed object is found
511 */
512static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
513{
514 struct timer_list *timer = addr;
515
516 switch (state) {
517 case ODEBUG_STATE_NOTAVAILABLE:
518 if (timer->entry.prev == TIMER_ENTRY_STATIC) {
519 /*
520 * This is not really a fixup. The timer was
521 * statically initialized. We just make sure that it
522 * is tracked in the object tracker.
523 */
524 debug_object_init(timer, &timer_debug_descr);
525 return 0;
526 } else {
527 setup_timer(timer, stub_timer, 0);
528 return 1;
529 }
530 default:
531 return 0;
532 }
533}
534
535static struct debug_obj_descr timer_debug_descr = { 483static struct debug_obj_descr timer_debug_descr = {
536 .name = "timer_list", 484 .name = "timer_list",
537 .debug_hint = timer_debug_hint, 485 .debug_hint = timer_debug_hint,
538 .fixup_init = timer_fixup_init, 486 .fixup_init = timer_fixup_init,
539 .fixup_activate = timer_fixup_activate, 487 .fixup_activate = timer_fixup_activate,
540 .fixup_free = timer_fixup_free, 488 .fixup_free = timer_fixup_free,
541 .fixup_assert_init = timer_fixup_assert_init,
542}; 489};
543 490
544static inline void debug_timer_init(struct timer_list *timer) 491static inline void debug_timer_init(struct timer_list *timer)
@@ -561,19 +508,16 @@ static inline void debug_timer_free(struct timer_list *timer)
561 debug_object_free(timer, &timer_debug_descr); 508 debug_object_free(timer, &timer_debug_descr);
562} 509}
563 510
564static inline void debug_timer_assert_init(struct timer_list *timer) 511static void __init_timer(struct timer_list *timer,
565{ 512 const char *name,
566 debug_object_assert_init(timer, &timer_debug_descr); 513 struct lock_class_key *key);
567}
568
569static void do_init_timer(struct timer_list *timer, unsigned int flags,
570 const char *name, struct lock_class_key *key);
571 514
572void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, 515void init_timer_on_stack_key(struct timer_list *timer,
573 const char *name, struct lock_class_key *key) 516 const char *name,
517 struct lock_class_key *key)
574{ 518{
575 debug_object_init_on_stack(timer, &timer_debug_descr); 519 debug_object_init_on_stack(timer, &timer_debug_descr);
576 do_init_timer(timer, flags, name, key); 520 __init_timer(timer, name, key);
577} 521}
578EXPORT_SYMBOL_GPL(init_timer_on_stack_key); 522EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
579 523
@@ -587,7 +531,6 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
587static inline void debug_timer_init(struct timer_list *timer) { } 531static inline void debug_timer_init(struct timer_list *timer) { }
588static inline void debug_timer_activate(struct timer_list *timer) { } 532static inline void debug_timer_activate(struct timer_list *timer) { }
589static inline void debug_timer_deactivate(struct timer_list *timer) { } 533static inline void debug_timer_deactivate(struct timer_list *timer) { }
590static inline void debug_timer_assert_init(struct timer_list *timer) { }
591#endif 534#endif
592 535
593static inline void debug_init(struct timer_list *timer) 536static inline void debug_init(struct timer_list *timer)
@@ -609,18 +552,12 @@ static inline void debug_deactivate(struct timer_list *timer)
609 trace_timer_cancel(timer); 552 trace_timer_cancel(timer);
610} 553}
611 554
612static inline void debug_assert_init(struct timer_list *timer) 555static void __init_timer(struct timer_list *timer,
613{ 556 const char *name,
614 debug_timer_assert_init(timer); 557 struct lock_class_key *key)
615}
616
617static void do_init_timer(struct timer_list *timer, unsigned int flags,
618 const char *name, struct lock_class_key *key)
619{ 558{
620 struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
621
622 timer->entry.next = NULL; 559 timer->entry.next = NULL;
623 timer->base = (void *)((unsigned long)base | flags); 560 timer->base = __raw_get_cpu_var(tvec_bases);
624 timer->slack = -1; 561 timer->slack = -1;
625#ifdef CONFIG_TIMER_STATS 562#ifdef CONFIG_TIMER_STATS
626 timer->start_site = NULL; 563 timer->start_site = NULL;
@@ -630,10 +567,22 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
630 lockdep_init_map(&timer->lockdep_map, name, key, 0); 567 lockdep_init_map(&timer->lockdep_map, name, key, 0);
631} 568}
632 569
570void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
571 const char *name,
572 struct lock_class_key *key,
573 void (*function)(unsigned long),
574 unsigned long data)
575{
576 timer->function = function;
577 timer->data = data;
578 init_timer_on_stack_key(timer, name, key);
579 timer_set_deferrable(timer);
580}
581EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
582
633/** 583/**
634 * init_timer_key - initialize a timer 584 * init_timer_key - initialize a timer
635 * @timer: the timer to be initialized 585 * @timer: the timer to be initialized
636 * @flags: timer flags
637 * @name: name of the timer 586 * @name: name of the timer
638 * @key: lockdep class key of the fake lock used for tracking timer 587 * @key: lockdep class key of the fake lock used for tracking timer
639 * sync lock dependencies 588 * sync lock dependencies
@@ -641,15 +590,26 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
641 * init_timer_key() must be done to a timer prior to calling *any* of the 590 * init_timer_key() must be done to a timer prior to calling *any* of the
642 * other timer functions. 591 * other timer functions.
643 */ 592 */
644void init_timer_key(struct timer_list *timer, unsigned int flags, 593void init_timer_key(struct timer_list *timer,
645 const char *name, struct lock_class_key *key) 594 const char *name,
595 struct lock_class_key *key)
646{ 596{
647 debug_init(timer); 597 debug_init(timer);
648 do_init_timer(timer, flags, name, key); 598 __init_timer(timer, name, key);
649} 599}
650EXPORT_SYMBOL(init_timer_key); 600EXPORT_SYMBOL(init_timer_key);
651 601
652static inline void detach_timer(struct timer_list *timer, bool clear_pending) 602void init_timer_deferrable_key(struct timer_list *timer,
603 const char *name,
604 struct lock_class_key *key)
605{
606 init_timer_key(timer, name, key);
607 timer_set_deferrable(timer);
608}
609EXPORT_SYMBOL(init_timer_deferrable_key);
610
611static inline void detach_timer(struct timer_list *timer,
612 int clear_pending)
653{ 613{
654 struct list_head *entry = &timer->entry; 614 struct list_head *entry = &timer->entry;
655 615
@@ -661,29 +621,6 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending)
661 entry->prev = LIST_POISON2; 621 entry->prev = LIST_POISON2;
662} 622}
663 623
664static inline void
665detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
666{
667 detach_timer(timer, true);
668 if (!tbase_get_deferrable(timer->base))
669 base->active_timers--;
670}
671
672static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
673 bool clear_pending)
674{
675 if (!timer_pending(timer))
676 return 0;
677
678 detach_timer(timer, clear_pending);
679 if (!tbase_get_deferrable(timer->base)) {
680 base->active_timers--;
681 if (timer->expires == base->next_timer)
682 base->next_timer = base->timer_jiffies;
683 }
684 return 1;
685}
686
687/* 624/*
688 * We are using hashed locking: holding per_cpu(tvec_bases).lock 625 * We are using hashed locking: holding per_cpu(tvec_bases).lock
689 * means that all timers which are tied to this base via timer->base are 626 * means that all timers which are tied to this base via timer->base are
@@ -729,9 +666,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
729 666
730 base = lock_timer_base(timer, &flags); 667 base = lock_timer_base(timer, &flags);
731 668
732 ret = detach_if_pending(timer, base, false); 669 if (timer_pending(timer)) {
733 if (!ret && pending_only) 670 detach_timer(timer, 0);
734 goto out_unlock; 671 if (timer->expires == base->next_timer &&
672 !tbase_get_deferrable(timer->base))
673 base->next_timer = base->timer_jiffies;
674 ret = 1;
675 } else {
676 if (pending_only)
677 goto out_unlock;
678 }
735 679
736 debug_activate(timer, expires); 680 debug_activate(timer, expires);
737 681
@@ -762,6 +706,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
762 } 706 }
763 707
764 timer->expires = expires; 708 timer->expires = expires;
709 if (time_before(timer->expires, base->next_timer) &&
710 !tbase_get_deferrable(timer->base))
711 base->next_timer = timer->expires;
765 internal_add_timer(base, timer); 712 internal_add_timer(base, timer);
766 713
767out_unlock: 714out_unlock:
@@ -868,13 +815,7 @@ EXPORT_SYMBOL(mod_timer);
868 * 815 *
869 * mod_timer_pinned() is a way to update the expire field of an 816 * mod_timer_pinned() is a way to update the expire field of an
870 * active timer (if the timer is inactive it will be activated) 817 * active timer (if the timer is inactive it will be activated)
871 * and to ensure that the timer is scheduled on the current CPU. 818 * and not allow the timer to be migrated to a different CPU.
872 *
873 * Note that this does not prevent the timer from being migrated
874 * when the current CPU goes offline. If this is a problem for
875 * you, use CPU-hotplug notifiers to handle it correctly, for
876 * example, cancelling the timer when the corresponding CPU goes
877 * offline.
878 * 819 *
879 * mod_timer_pinned(timer, expires) is equivalent to: 820 * mod_timer_pinned(timer, expires) is equivalent to:
880 * 821 *
@@ -927,6 +868,9 @@ void add_timer_on(struct timer_list *timer, int cpu)
927 spin_lock_irqsave(&base->lock, flags); 868 spin_lock_irqsave(&base->lock, flags);
928 timer_set_base(timer, base); 869 timer_set_base(timer, base);
929 debug_activate(timer, timer->expires); 870 debug_activate(timer, timer->expires);
871 if (time_before(timer->expires, base->next_timer) &&
872 !tbase_get_deferrable(timer->base))
873 base->next_timer = timer->expires;
930 internal_add_timer(base, timer); 874 internal_add_timer(base, timer);
931 /* 875 /*
932 * Check whether the other CPU is idle and needs to be 876 * Check whether the other CPU is idle and needs to be
@@ -958,12 +902,16 @@ int del_timer(struct timer_list *timer)
958 unsigned long flags; 902 unsigned long flags;
959 int ret = 0; 903 int ret = 0;
960 904
961 debug_assert_init(timer);
962
963 timer_stats_timer_clear_start_info(timer); 905 timer_stats_timer_clear_start_info(timer);
964 if (timer_pending(timer)) { 906 if (timer_pending(timer)) {
965 base = lock_timer_base(timer, &flags); 907 base = lock_timer_base(timer, &flags);
966 ret = detach_if_pending(timer, base, true); 908 if (timer_pending(timer)) {
909 detach_timer(timer, 1);
910 if (timer->expires == base->next_timer &&
911 !tbase_get_deferrable(timer->base))
912 base->next_timer = base->timer_jiffies;
913 ret = 1;
914 }
967 spin_unlock_irqrestore(&base->lock, flags); 915 spin_unlock_irqrestore(&base->lock, flags);
968 } 916 }
969 917
@@ -984,14 +932,21 @@ int try_to_del_timer_sync(struct timer_list *timer)
984 unsigned long flags; 932 unsigned long flags;
985 int ret = -1; 933 int ret = -1;
986 934
987 debug_assert_init(timer);
988
989 base = lock_timer_base(timer, &flags); 935 base = lock_timer_base(timer, &flags);
990 936
991 if (base->running_timer != timer) { 937 if (base->running_timer == timer)
992 timer_stats_timer_clear_start_info(timer); 938 goto out;
993 ret = detach_if_pending(timer, base, true); 939
940 timer_stats_timer_clear_start_info(timer);
941 ret = 0;
942 if (timer_pending(timer)) {
943 detach_timer(timer, 1);
944 if (timer->expires == base->next_timer &&
945 !tbase_get_deferrable(timer->base))
946 base->next_timer = base->timer_jiffies;
947 ret = 1;
994 } 948 }
949out:
995 spin_unlock_irqrestore(&base->lock, flags); 950 spin_unlock_irqrestore(&base->lock, flags);
996 951
997 return ret; 952 return ret;
@@ -1009,14 +964,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
1009 * 964 *
1010 * Synchronization rules: Callers must prevent restarting of the timer, 965 * Synchronization rules: Callers must prevent restarting of the timer,
1011 * otherwise this function is meaningless. It must not be called from 966 * otherwise this function is meaningless. It must not be called from
1012 * interrupt contexts unless the timer is an irqsafe one. The caller must 967 * interrupt contexts. The caller must not hold locks which would prevent
1013 * not hold locks which would prevent completion of the timer's 968 * completion of the timer's handler. The timer's handler must not call
1014 * handler. The timer's handler must not call add_timer_on(). Upon exit the 969 * add_timer_on(). Upon exit the timer is not queued and the handler is
1015 * timer is not queued and the handler is not running on any CPU. 970 * not running on any CPU.
1016 * 971 *
1017 * Note: For !irqsafe timers, you must not hold locks that are held in 972 * Note: You must not hold locks that are held in interrupt context
1018 * interrupt context while calling this function. Even if the lock has 973 * while calling this function. Even if the lock has nothing to do
1019 * nothing to do with the timer in question. Here's why: 974 * with the timer in question. Here's why:
1020 * 975 *
1021 * CPU0 CPU1 976 * CPU0 CPU1
1022 * ---- ---- 977 * ---- ----
@@ -1053,7 +1008,7 @@ int del_timer_sync(struct timer_list *timer)
1053 * don't use it in hardirq context, because it 1008 * don't use it in hardirq context, because it
1054 * could lead to deadlock. 1009 * could lead to deadlock.
1055 */ 1010 */
1056 WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); 1011 WARN_ON(in_irq());
1057 for (;;) { 1012 for (;;) {
1058 int ret = try_to_del_timer_sync(timer); 1013 int ret = try_to_del_timer_sync(timer);
1059 if (ret >= 0) 1014 if (ret >= 0)
@@ -1078,8 +1033,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1078 */ 1033 */
1079 list_for_each_entry_safe(timer, tmp, &tv_list, entry) { 1034 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
1080 BUG_ON(tbase_get_base(timer->base) != base); 1035 BUG_ON(tbase_get_base(timer->base) != base);
1081 /* No accounting, while moving them */ 1036 internal_add_timer(base, timer);
1082 __internal_add_timer(base, timer);
1083 } 1037 }
1084 1038
1085 return index; 1039 return index;
@@ -1098,9 +1052,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1098 * warnings as well as problems when looking into 1052 * warnings as well as problems when looking into
1099 * timer->lockdep_map, make a copy and use that here. 1053 * timer->lockdep_map, make a copy and use that here.
1100 */ 1054 */
1101 struct lockdep_map lockdep_map; 1055 struct lockdep_map lockdep_map = timer->lockdep_map;
1102
1103 lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
1104#endif 1056#endif
1105 /* 1057 /*
1106 * Couple the lock chain with the lock chain at 1058 * Couple the lock chain with the lock chain at
@@ -1160,27 +1112,19 @@ static inline void __run_timers(struct tvec_base *base)
1160 while (!list_empty(head)) { 1112 while (!list_empty(head)) {
1161 void (*fn)(unsigned long); 1113 void (*fn)(unsigned long);
1162 unsigned long data; 1114 unsigned long data;
1163 bool irqsafe;
1164 1115
1165 timer = list_first_entry(head, struct timer_list,entry); 1116 timer = list_first_entry(head, struct timer_list,entry);
1166 fn = timer->function; 1117 fn = timer->function;
1167 data = timer->data; 1118 data = timer->data;
1168 irqsafe = tbase_get_irqsafe(timer->base);
1169 1119
1170 timer_stats_account_timer(timer); 1120 timer_stats_account_timer(timer);
1171 1121
1172 base->running_timer = timer; 1122 base->running_timer = timer;
1173 detach_expired_timer(timer, base); 1123 detach_timer(timer, 1);
1174 1124
1175 if (irqsafe) { 1125 spin_unlock_irq(&base->lock);
1176 spin_unlock(&base->lock); 1126 call_timer_fn(timer, fn, data);
1177 call_timer_fn(timer, fn, data); 1127 spin_lock_irq(&base->lock);
1178 spin_lock(&base->lock);
1179 } else {
1180 spin_unlock_irq(&base->lock);
1181 call_timer_fn(timer, fn, data);
1182 spin_lock_irq(&base->lock);
1183 }
1184 } 1128 }
1185 } 1129 }
1186 base->running_timer = NULL; 1130 base->running_timer = NULL;
@@ -1314,21 +1258,18 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
1314unsigned long get_next_timer_interrupt(unsigned long now) 1258unsigned long get_next_timer_interrupt(unsigned long now)
1315{ 1259{
1316 struct tvec_base *base = __this_cpu_read(tvec_bases); 1260 struct tvec_base *base = __this_cpu_read(tvec_bases);
1317 unsigned long expires = now + NEXT_TIMER_MAX_DELTA; 1261 unsigned long expires;
1318 1262
1319 /* 1263 /*
1320 * Pretend that there is no timer pending if the cpu is offline. 1264 * Pretend that there is no timer pending if the cpu is offline.
1321 * Possible pending timers will be migrated later to an active cpu. 1265 * Possible pending timers will be migrated later to an active cpu.
1322 */ 1266 */
1323 if (cpu_is_offline(smp_processor_id())) 1267 if (cpu_is_offline(smp_processor_id()))
1324 return expires; 1268 return now + NEXT_TIMER_MAX_DELTA;
1325
1326 spin_lock(&base->lock); 1269 spin_lock(&base->lock);
1327 if (base->active_timers) { 1270 if (time_before_eq(base->next_timer, base->timer_jiffies))
1328 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1271 base->next_timer = __next_timer_interrupt(base);
1329 base->next_timer = __next_timer_interrupt(base); 1272 expires = base->next_timer;
1330 expires = base->next_timer;
1331 }
1332 spin_unlock(&base->lock); 1273 spin_unlock(&base->lock);
1333 1274
1334 if (time_before_eq(expires, now)) 1275 if (time_before_eq(expires, now))
@@ -1395,6 +1336,13 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1395 1336
1396#endif 1337#endif
1397 1338
1339#ifndef __alpha__
1340
1341/*
1342 * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
1343 * should be moved into arch/i386 instead?
1344 */
1345
1398/** 1346/**
1399 * sys_getpid - return the thread group id of the current process 1347 * sys_getpid - return the thread group id of the current process
1400 * 1348 *
@@ -1420,7 +1368,7 @@ SYSCALL_DEFINE0(getppid)
1420 int pid; 1368 int pid;
1421 1369
1422 rcu_read_lock(); 1370 rcu_read_lock();
1423 pid = task_tgid_vnr(rcu_dereference(current->real_parent)); 1371 pid = task_tgid_vnr(current->real_parent);
1424 rcu_read_unlock(); 1372 rcu_read_unlock();
1425 1373
1426 return pid; 1374 return pid;
@@ -1429,27 +1377,29 @@ SYSCALL_DEFINE0(getppid)
1429SYSCALL_DEFINE0(getuid) 1377SYSCALL_DEFINE0(getuid)
1430{ 1378{
1431 /* Only we change this so SMP safe */ 1379 /* Only we change this so SMP safe */
1432 return from_kuid_munged(current_user_ns(), current_uid()); 1380 return current_uid();
1433} 1381}
1434 1382
1435SYSCALL_DEFINE0(geteuid) 1383SYSCALL_DEFINE0(geteuid)
1436{ 1384{
1437 /* Only we change this so SMP safe */ 1385 /* Only we change this so SMP safe */
1438 return from_kuid_munged(current_user_ns(), current_euid()); 1386 return current_euid();
1439} 1387}
1440 1388
1441SYSCALL_DEFINE0(getgid) 1389SYSCALL_DEFINE0(getgid)
1442{ 1390{
1443 /* Only we change this so SMP safe */ 1391 /* Only we change this so SMP safe */
1444 return from_kgid_munged(current_user_ns(), current_gid()); 1392 return current_gid();
1445} 1393}
1446 1394
1447SYSCALL_DEFINE0(getegid) 1395SYSCALL_DEFINE0(getegid)
1448{ 1396{
1449 /* Only we change this so SMP safe */ 1397 /* Only we change this so SMP safe */
1450 return from_kgid_munged(current_user_ns(), current_egid()); 1398 return current_egid();
1451} 1399}
1452 1400
1401#endif
1402
1453static void process_timeout(unsigned long __data) 1403static void process_timeout(unsigned long __data)
1454{ 1404{
1455 wake_up_process((struct task_struct *)__data); 1405 wake_up_process((struct task_struct *)__data);
@@ -1696,7 +1646,6 @@ static int __cpuinit init_timers_cpu(int cpu)
1696 1646
1697 base->timer_jiffies = jiffies; 1647 base->timer_jiffies = jiffies;
1698 base->next_timer = base->timer_jiffies; 1648 base->next_timer = base->timer_jiffies;
1699 base->active_timers = 0;
1700 return 0; 1649 return 0;
1701} 1650}
1702 1651
@@ -1707,9 +1656,11 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1707 1656
1708 while (!list_empty(head)) { 1657 while (!list_empty(head)) {
1709 timer = list_first_entry(head, struct timer_list, entry); 1658 timer = list_first_entry(head, struct timer_list, entry);
1710 /* We ignore the accounting on the dying cpu */ 1659 detach_timer(timer, 0);
1711 detach_timer(timer, false);
1712 timer_set_base(timer, new_base); 1660 timer_set_base(timer, new_base);
1661 if (time_before(timer->expires, new_base->next_timer) &&
1662 !tbase_get_deferrable(timer->base))
1663 new_base->next_timer = timer->expires;
1713 internal_add_timer(new_base, timer); 1664 internal_add_timer(new_base, timer);
1714 } 1665 }
1715} 1666}
@@ -1779,13 +1730,9 @@ static struct notifier_block __cpuinitdata timers_nb = {
1779 1730
1780void __init init_timers(void) 1731void __init init_timers(void)
1781{ 1732{
1782 int err; 1733 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1783 1734 (void *)(long)smp_processor_id());
1784 /* ensure there are enough low bits for flags in timer->base pointer */
1785 BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
1786 1735
1787 err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1788 (void *)(long)smp_processor_id());
1789 init_timer_stats(); 1736 init_timer_stats();
1790 1737
1791 BUG_ON(err != NOTIFY_OK); 1738 BUG_ON(err != NOTIFY_OK);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d89335a485..93168c0f991 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,11 +49,6 @@ config HAVE_SYSCALL_TRACEPOINTS
49 help 49 help
50 See Documentation/trace/ftrace-design.txt 50 See Documentation/trace/ftrace-design.txt
51 51
52config HAVE_FENTRY
53 bool
54 help
55 Arch supports the gcc options -pg with -mfentry
56
57config HAVE_C_RECORDMCOUNT 52config HAVE_C_RECORDMCOUNT
58 bool 53 bool
59 help 54 help
@@ -62,12 +57,8 @@ config HAVE_C_RECORDMCOUNT
62config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
63 bool 58 bool
64 59
65config TRACE_CLOCK
66 bool
67
68config RING_BUFFER 60config RING_BUFFER
69 bool 61 bool
70 select TRACE_CLOCK
71 62
72config FTRACE_NMI_ENTER 63config FTRACE_NMI_ENTER
73 bool 64 bool
@@ -118,8 +109,6 @@ config TRACING
118 select NOP_TRACER 109 select NOP_TRACER
119 select BINARY_PRINTF 110 select BINARY_PRINTF
120 select EVENT_TRACING 111 select EVENT_TRACING
121 select TRACE_CLOCK
122 select IRQ_WORK
123 112
124config GENERIC_TRACER 113config GENERIC_TRACER
125 bool 114 bool
@@ -152,6 +141,7 @@ if FTRACE
152config FUNCTION_TRACER 141config FUNCTION_TRACER
153 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
154 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
155 select KALLSYMS 145 select KALLSYMS
156 select GENERIC_TRACER 146 select GENERIC_TRACER
157 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
@@ -282,7 +272,7 @@ config PROFILE_ANNOTATED_BRANCHES
282 bool "Trace likely/unlikely profiler" 272 bool "Trace likely/unlikely profiler"
283 select TRACE_BRANCH_PROFILING 273 select TRACE_BRANCH_PROFILING
284 help 274 help
285 This tracer profiles all likely and unlikely macros 275 This tracer profiles all the likely and unlikely macros
286 in the kernel. It will display the results in: 276 in the kernel. It will display the results in:
287 277
288 /sys/kernel/debug/tracing/trace_stat/branch_annotated 278 /sys/kernel/debug/tracing/trace_stat/branch_annotated
@@ -383,7 +373,6 @@ config KPROBE_EVENT
383 depends on HAVE_REGS_AND_STACK_ACCESS_API 373 depends on HAVE_REGS_AND_STACK_ACCESS_API
384 bool "Enable kprobes-based dynamic events" 374 bool "Enable kprobes-based dynamic events"
385 select TRACING 375 select TRACING
386 select PROBE_EVENTS
387 default y 376 default y
388 help 377 help
389 This allows the user to add tracing events (similar to tracepoints) 378 This allows the user to add tracing events (similar to tracepoints)
@@ -396,25 +385,6 @@ config KPROBE_EVENT
396 This option is also required by perf-probe subcommand of perf tools. 385 This option is also required by perf-probe subcommand of perf tools.
397 If you want to use perf tools, this option is strongly recommended. 386 If you want to use perf tools, this option is strongly recommended.
398 387
399config UPROBE_EVENT
400 bool "Enable uprobes-based dynamic events"
401 depends on ARCH_SUPPORTS_UPROBES
402 depends on MMU
403 select UPROBES
404 select PROBE_EVENTS
405 select TRACING
406 default n
407 help
408 This allows the user to add tracing events on top of userspace
409 dynamic events (similar to tracepoints) on the fly via the trace
410 events interface. Those events can be inserted wherever uprobes
411 can probe, and record various registers.
412 This option is required if you plan to use perf-probe subcommand
413 of perf tools on user space applications.
414
415config PROBE_EVENTS
416 def_bool n
417
418config DYNAMIC_FTRACE 388config DYNAMIC_FTRACE
419 bool "enable/disable ftrace tracepoints dynamically" 389 bool "enable/disable ftrace tracepoints dynamically"
420 depends on FUNCTION_TRACER 390 depends on FUNCTION_TRACER
@@ -517,6 +487,39 @@ config RING_BUFFER_BENCHMARK
517 487
518 If unsure, say N. 488 If unsure, say N.
519 489
490config TRACELEVEL
491 bool "Add capability to prioritize traces"
492 depends on EVENT_TRACING
493 help
494 This option allows subsystem programmers to add priorities to trace
495	  events by calling tracelevel_register. Traces of high priority
496	  will automatically be enabled on kernel boot, and users can change
497	  the trace level via a kernel parameter.
498
499config TRACEDUMP
500 bool "Dumping functionality for ftrace"
501 depends on FUNCTION_TRACER
502 help
503	  This option adds functionality to dump tracing data in several forms.
504	  Data can be dumped in ASCII form or as raw pages from the tracing
505	  ring buffers, along with the saved cmdlines; the format is selected
506	  by the module parameter tracedump_ascii. Data will be compressed
507	  using zlib.
508
509config TRACEDUMP_PANIC
510 bool "Tracedump to console on panic"
511 depends on TRACEDUMP
512 help
513 With this option, tracedump will automatically dump to the console
514 on a kernel panic.
515
516config TRACEDUMP_PROCFS
517 bool "Tracedump via proc file"
518 depends on TRACEDUMP
519 help
520 With this option, tracedump can be dumped from user space by reading
521 from /proc/tracedump.
522
520endif # FTRACE 523endif # FTRACE
521 524
522endif # TRACING_SUPPORT 525endif # TRACING_SUPPORT
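
The TRACEDUMP_PROCFS help text above says the dump is exposed through /proc/tracedump and compressed with zlib. Assuming an image built with these options and that exact interface, a user-space sketch that simply copies the compressed stream to a file for later inflation with any zlib-capable tool could look like:

#include <stdio.h>
#include <stdlib.h>

/*
 * Copy the (zlib-compressed) dump exposed by CONFIG_TRACEDUMP_PROCFS to a
 * local file. Assumes the /proc/tracedump node described above exists.
 */
int main(void)
{
	FILE *in = fopen("/proc/tracedump", "rb");
	FILE *out = fopen("tracedump.z", "wb");
	char buf[4096];
	size_t n;

	if (!in || !out) {
		perror("open");
		return EXIT_FAILURE;
	}

	while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
		fwrite(buf, 1, n, out);

	fclose(in);
	fclose(out);
	return 0;
}
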
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d7e2068e4b7..1360a1a90d5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -5,21 +5,21 @@ ifdef CONFIG_FUNCTION_TRACER
5ORIG_CFLAGS := $(KBUILD_CFLAGS) 5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) 6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
7 7
8ifdef CONFIG_FTRACE_SELFTEST
9# selftest needs instrumentation 8# selftest needs instrumentation
10CFLAGS_trace_selftest_dynamic.o = -pg 9CFLAGS_trace_selftest_dynamic.o = -pg
11obj-y += trace_selftest_dynamic.o 10obj-y += trace_selftest_dynamic.o
12endif 11endif
13endif
14 12
15# If unlikely tracing is enabled, do not trace these files 13# If unlikely tracing is enabled, do not trace these files
16ifdef CONFIG_TRACING_BRANCHES 14ifdef CONFIG_TRACING_BRANCHES
17KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
18endif 16endif
19 17
20CFLAGS_trace_events_filter.o := -I$(src) 18#
21 19# Make the trace clocks available generally: it's infrastructure
22obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o 20# relied on by ptrace for example:
21#
22obj-y += trace_clock.o
23 23
24obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 24obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
25obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 25obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
@@ -39,6 +39,7 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
39obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 39obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
40obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 40obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
41obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 41obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
42obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
42obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 43obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
43ifeq ($(CONFIG_BLOCK),y) 44ifeq ($(CONFIG_BLOCK),y)
44obj-$(CONFIG_EVENT_TRACING) += blktrace.o 45obj-$(CONFIG_EVENT_TRACING) += blktrace.o
@@ -52,13 +53,10 @@ endif
52obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
53obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
54obj-$(CONFIG_TRACEPOINTS) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
55ifeq ($(CONFIG_PM_RUNTIME),y)
56obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
57endif
58ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
59obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
60endif 58endif
61obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o 59obj-$(CONFIG_TRACELEVEL) += tracelevel.o
62obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o 60obj-$(CONFIG_TRACEDUMP) += tracedump.o
63 61
64libftrace-y := ftrace.o 62libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c0bd0308741..7c910a5593a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,7 +23,6 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/export.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/uaccess.h> 27#include <linux/uaccess.h>
29 28
@@ -311,6 +310,13 @@ int blk_trace_remove(struct request_queue *q)
311} 310}
312EXPORT_SYMBOL_GPL(blk_trace_remove); 311EXPORT_SYMBOL_GPL(blk_trace_remove);
313 312
313static int blk_dropped_open(struct inode *inode, struct file *filp)
314{
315 filp->private_data = inode->i_private;
316
317 return 0;
318}
319
314static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, 320static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
315 size_t count, loff_t *ppos) 321 size_t count, loff_t *ppos)
316{ 322{
@@ -324,11 +330,18 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
324 330
325static const struct file_operations blk_dropped_fops = { 331static const struct file_operations blk_dropped_fops = {
326 .owner = THIS_MODULE, 332 .owner = THIS_MODULE,
327 .open = simple_open, 333 .open = blk_dropped_open,
328 .read = blk_dropped_read, 334 .read = blk_dropped_read,
329 .llseek = default_llseek, 335 .llseek = default_llseek,
330}; 336};
331 337
338static int blk_msg_open(struct inode *inode, struct file *filp)
339{
340 filp->private_data = inode->i_private;
341
342 return 0;
343}
344
332static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, 345static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
333 size_t count, loff_t *ppos) 346 size_t count, loff_t *ppos)
334{ 347{
@@ -357,7 +370,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
357 370
358static const struct file_operations blk_msg_fops = { 371static const struct file_operations blk_msg_fops = {
359 .owner = THIS_MODULE, 372 .owner = THIS_MODULE,
360 .open = simple_open, 373 .open = blk_msg_open,
361 .write = blk_msg_write, 374 .write = blk_msg_write,
362 .llseek = noop_llseek, 375 .llseek = noop_llseek,
363}; 376};
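
Both blk_dropped_open() and blk_msg_open() reinstate the boilerplate that newer kernels fold into simple_open(): stash the inode's i_private cookie in file->private_data at open time so the read/write handlers can find their per-queue state. A generic kernel-style sketch of that pattern (illustrative names, not the blktrace code):

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/uaccess.h>

struct demo_state {
	int dropped;
};

/* Open: remember which object this debugfs/procfs node belongs to. */
static int demo_open(struct inode *inode, struct file *filp)
{
	filp->private_data = inode->i_private;	/* set at file creation time */
	return 0;
}

/* Read: recover the object and report from it. */
static ssize_t demo_read(struct file *filp, char __user *buf,
			 size_t count, loff_t *ppos)
{
	struct demo_state *st = filp->private_data;
	char tmp[16];
	int len = scnprintf(tmp, sizeof(tmp), "%d\n", st->dropped);

	return simple_read_from_buffer(buf, count, ppos, tmp, len);
}

static const struct file_operations demo_fops = {
	.owner	= THIS_MODULE,
	.open	= demo_open,
	.read	= demo_read,
	.llseek	= default_llseek,
};
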
@@ -388,7 +401,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
388 401
389static struct dentry *blk_create_buf_file_callback(const char *filename, 402static struct dentry *blk_create_buf_file_callback(const char *filename,
390 struct dentry *parent, 403 struct dentry *parent,
391 umode_t mode, 404 int mode,
392 struct rchan_buf *buf, 405 struct rchan_buf *buf,
393 int *is_global) 406 int *is_global)
394{ 407{
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3ffe4c5ad3f..798b16cd40f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -10,7 +10,7 @@
10 * Based on code in the latency_tracer, that is: 10 * Based on code in the latency_tracer, that is:
11 * 11 *
12 * Copyright (C) 2004-2006 Ingo Molnar 12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 Nadia Yvette Chambers 13 * Copyright (C) 2004 William Lee Irwin III
14 */ 14 */
15 15
16#include <linux/stop_machine.h> 16#include <linux/stop_machine.h>
@@ -22,13 +22,10 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/bsearch.h>
26#include <linux/module.h>
27#include <linux/ftrace.h> 25#include <linux/ftrace.h>
28#include <linux/sysctl.h> 26#include <linux/sysctl.h>
29#include <linux/slab.h> 27#include <linux/slab.h>
30#include <linux/ctype.h> 28#include <linux/ctype.h>
31#include <linux/sort.h>
32#include <linux/list.h> 29#include <linux/list.h>
33#include <linux/hash.h> 30#include <linux/hash.h>
34#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
@@ -62,22 +59,12 @@
62#define FTRACE_HASH_DEFAULT_BITS 10 59#define FTRACE_HASH_DEFAULT_BITS 10
63#define FTRACE_HASH_MAX_BITS 12 60#define FTRACE_HASH_MAX_BITS 12
64 61
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
66
67static struct ftrace_ops ftrace_list_end __read_mostly = {
68 .func = ftrace_stub,
69 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
70};
71
72/* ftrace_enabled is a method to turn ftrace on or off */ 62/* ftrace_enabled is a method to turn ftrace on or off */
73int ftrace_enabled __read_mostly; 63int ftrace_enabled __read_mostly;
74static int last_ftrace_enabled; 64static int last_ftrace_enabled;
75 65
76/* Quick disabling of function tracer. */ 66/* Quick disabling of function tracer. */
77int function_trace_stop __read_mostly; 67int function_trace_stop;
78
79/* Current function tracing op */
80struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
81 68
82/* List for set_ftrace_pid's pids. */ 69/* List for set_ftrace_pid's pids. */
83LIST_HEAD(ftrace_pids); 70LIST_HEAD(ftrace_pids);
@@ -94,43 +81,20 @@ static int ftrace_disabled __read_mostly;
94 81
95static DEFINE_MUTEX(ftrace_lock); 82static DEFINE_MUTEX(ftrace_lock);
96 83
84static struct ftrace_ops ftrace_list_end __read_mostly = {
85 .func = ftrace_stub,
86};
87
97static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 88static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
98static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
99static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 89static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
100ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 90ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
91static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
92ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
101ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 93ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
102static struct ftrace_ops global_ops; 94static struct ftrace_ops global_ops;
103static struct ftrace_ops control_ops;
104
105#if ARCH_SUPPORTS_FTRACE_OPS
106static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
107 struct ftrace_ops *op, struct pt_regs *regs);
108#else
109/* See comment below, where ftrace_ops_list_func is defined */
110static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
112#endif
113
114/**
115 * ftrace_nr_registered_ops - return number of ops registered
116 *
117 * Returns the number of ftrace_ops registered and tracing functions
118 */
119int ftrace_nr_registered_ops(void)
120{
121 struct ftrace_ops *ops;
122 int cnt = 0;
123
124 mutex_lock(&ftrace_lock);
125 95
126 for (ops = ftrace_ops_list; 96static void
127 ops != &ftrace_list_end; ops = ops->next) 97ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
128 cnt++;
129
130 mutex_unlock(&ftrace_lock);
131
132 return cnt;
133}
134 98
135/* 99/*
136 * Traverse the ftrace_global_list, invoking all entries. The reason that we 100 * Traverse the ftrace_global_list, invoking all entries. The reason that we
@@ -141,29 +105,29 @@ int ftrace_nr_registered_ops(void)
141 * 105 *
142 * Silly Alpha and silly pointer-speculation compiler optimizations! 106 * Silly Alpha and silly pointer-speculation compiler optimizations!
143 */ 107 */
144static void 108static void ftrace_global_list_func(unsigned long ip,
145ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, 109 unsigned long parent_ip)
146 struct ftrace_ops *op, struct pt_regs *regs)
147{ 110{
111 struct ftrace_ops *op;
112
148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) 113 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
149 return; 114 return;
150 115
151 trace_recursion_set(TRACE_GLOBAL_BIT); 116 trace_recursion_set(TRACE_GLOBAL_BIT);
152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/ 117 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
153 while (op != &ftrace_list_end) { 118 while (op != &ftrace_list_end) {
154 op->func(ip, parent_ip, op, regs); 119 op->func(ip, parent_ip);
155 op = rcu_dereference_raw(op->next); /*see above*/ 120 op = rcu_dereference_raw(op->next); /*see above*/
156 }; 121 };
157 trace_recursion_clear(TRACE_GLOBAL_BIT); 122 trace_recursion_clear(TRACE_GLOBAL_BIT);
158} 123}
159 124
160static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 125static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
161 struct ftrace_ops *op, struct pt_regs *regs)
162{ 126{
163 if (!test_tsk_trace_trace(current)) 127 if (!test_tsk_trace_trace(current))
164 return; 128 return;
165 129
166 ftrace_pid_function(ip, parent_ip, op, regs); 130 ftrace_pid_function(ip, parent_ip);
167} 131}
168 132
169static void set_ftrace_pid_function(ftrace_func_t func) 133static void set_ftrace_pid_function(ftrace_func_t func)
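
The hunk above drops the extra struct ftrace_ops * and struct pt_regs * arguments, so every callback in this tree uses the two-argument ftrace_func_t. A minimal module sketch registering such a callback, assuming a kernel of this vintage (register_ftrace_function() is the public entry point; the callback body is illustrative):

#include <linux/ftrace.h>
#include <linux/module.h>

/* Callback with the two-argument signature used by this tree. */
static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* Keep it cheap: this runs on every traced function entry. */
}

static struct ftrace_ops my_ops = {
	.func = my_trace_func,
};

static int __init demo_init(void)
{
	return register_ftrace_function(&my_ops);
}

static void __exit demo_exit(void)
{
	unregister_ftrace_function(&my_ops);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
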
@@ -182,34 +146,24 @@ static void set_ftrace_pid_function(ftrace_func_t func)
182void clear_ftrace_function(void) 146void clear_ftrace_function(void)
183{ 147{
184 ftrace_trace_function = ftrace_stub; 148 ftrace_trace_function = ftrace_stub;
149 __ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function_delay = ftrace_stub;
185 ftrace_pid_function = ftrace_stub; 151 ftrace_pid_function = ftrace_stub;
186} 152}
187 153
188static void control_ops_disable_all(struct ftrace_ops *ops) 154#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
189{ 155/*
190 int cpu; 156 * For those archs that do not test ftrace_trace_stop in their
191 157 * mcount call site, we need to do it from C.
192 for_each_possible_cpu(cpu) 158 */
193 *per_cpu_ptr(ops->disabled, cpu) = 1; 159static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
194}
195
196static int control_ops_alloc(struct ftrace_ops *ops)
197{ 160{
198 int __percpu *disabled; 161 if (function_trace_stop)
199 162 return;
200 disabled = alloc_percpu(int);
201 if (!disabled)
202 return -ENOMEM;
203 163
204 ops->disabled = disabled; 164 __ftrace_trace_function(ip, parent_ip);
205 control_ops_disable_all(ops);
206 return 0;
207}
208
209static void control_ops_free(struct ftrace_ops *ops)
210{
211 free_percpu(ops->disabled);
212} 165}
166#endif
213 167
214static void update_global_ops(void) 168static void update_global_ops(void)
215{ 169{
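
The removed control_ops_alloc()/control_ops_free() pair managed a per-CPU "disabled" flag with the percpu allocator; control ops do not exist in this tree, so what follows is only a generic sketch of that per-CPU pattern for reference (names are illustrative):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/percpu.h>

struct demo_ops {
	int __percpu *disabled;
};

static int demo_ops_alloc(struct demo_ops *ops)
{
	int cpu;

	ops->disabled = alloc_percpu(int);
	if (!ops->disabled)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(ops->disabled, cpu) = 1;	/* start disabled */

	return 0;
}

static void demo_ops_free(struct demo_ops *ops)
{
	free_percpu(ops->disabled);
}
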
@@ -243,27 +197,27 @@ static void update_ftrace_function(void)
243 197
244 /* 198 /*
245 * If we are at the end of the list and this ops is 199 * If we are at the end of the list and this ops is
246 * recursion safe and not dynamic and the arch supports passing ops, 200 * not dynamic, then have the mcount trampoline call
247 * then have the mcount trampoline call the function directly. 201 * the function directly
248 */ 202 */
249 if (ftrace_ops_list == &ftrace_list_end || 203 if (ftrace_ops_list == &ftrace_list_end ||
250 (ftrace_ops_list->next == &ftrace_list_end && 204 (ftrace_ops_list->next == &ftrace_list_end &&
251 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && 205 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
252 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
253 !FTRACE_FORCE_LIST_FUNC)) {
254 /* Set the ftrace_ops that the arch callback uses */
255 if (ftrace_ops_list == &global_ops)
256 function_trace_op = ftrace_global_list;
257 else
258 function_trace_op = ftrace_ops_list;
259 func = ftrace_ops_list->func; 206 func = ftrace_ops_list->func;
260 } else { 207 else
261 /* Just use the default ftrace_ops */
262 function_trace_op = &ftrace_list_end;
263 func = ftrace_ops_list_func; 208 func = ftrace_ops_list_func;
264 }
265 209
210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
266 ftrace_trace_function = func; 211 ftrace_trace_function = func;
212#else
213#ifdef CONFIG_DYNAMIC_FTRACE
214 /* do not update till all functions have been modified */
215 __ftrace_trace_function_delay = func;
216#else
217 __ftrace_trace_function = func;
218#endif
219 ftrace_trace_function = ftrace_test_stop_func;
220#endif
267} 221}
268 222
269static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 223static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
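
On architectures without CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST, the restored path points ftrace_trace_function at a C wrapper that tests function_trace_stop before chaining to the real callback (which is swapped in later, via __ftrace_trace_function_delay, only once all call sites are patched). A small user-space model of the guarded indirect call:

#include <stdio.h>

static int function_trace_stop;

static void real_callback(unsigned long ip, unsigned long parent_ip)
{
	printf("traced ip=%#lx from %#lx\n", ip, parent_ip);
}

static void (*__trace_function)(unsigned long, unsigned long) = real_callback;

static void trace_test_stop_func(unsigned long ip, unsigned long parent_ip)
{
	if (function_trace_stop)	/* code is being patched: stay out */
		return;
	__trace_function(ip, parent_ip);
}

int main(void)
{
	trace_test_stop_func(0x1000, 0x2000);	/* delivered */
	function_trace_stop = 1;
	trace_test_stop_func(0x1000, 0x2000);	/* suppressed */
	return 0;
}
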
@@ -302,29 +256,9 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
302 return 0; 256 return 0;
303} 257}
304 258
305static void add_ftrace_list_ops(struct ftrace_ops **list,
306 struct ftrace_ops *main_ops,
307 struct ftrace_ops *ops)
308{
309 int first = *list == &ftrace_list_end;
310 add_ftrace_ops(list, ops);
311 if (first)
312 add_ftrace_ops(&ftrace_ops_list, main_ops);
313}
314
315static int remove_ftrace_list_ops(struct ftrace_ops **list,
316 struct ftrace_ops *main_ops,
317 struct ftrace_ops *ops)
318{
319 int ret = remove_ftrace_ops(list, ops);
320 if (!ret && *list == &ftrace_list_end)
321 ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
322 return ret;
323}
324
325static int __register_ftrace_function(struct ftrace_ops *ops) 259static int __register_ftrace_function(struct ftrace_ops *ops)
326{ 260{
327 if (unlikely(ftrace_disabled)) 261 if (ftrace_disabled)
328 return -ENODEV; 262 return -ENODEV;
329 263
330 if (FTRACE_WARN_ON(ops == &global_ops)) 264 if (FTRACE_WARN_ON(ops == &global_ops))
@@ -333,34 +267,15 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
333 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 267 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
334 return -EBUSY; 268 return -EBUSY;
335 269
336 /* We don't support both control and global flags set. */
337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
338 return -EINVAL;
339
340#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS
341 /*
342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
344 * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant.
345 */
346 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS &&
347 !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED))
348 return -EINVAL;
349
350 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)
351 ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
352#endif
353
354 if (!core_kernel_data((unsigned long)ops)) 270 if (!core_kernel_data((unsigned long)ops))
355 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 271 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
356 272
357 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 273 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
358 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); 274 int first = ftrace_global_list == &ftrace_list_end;
275 add_ftrace_ops(&ftrace_global_list, ops);
359 ops->flags |= FTRACE_OPS_FL_ENABLED; 276 ops->flags |= FTRACE_OPS_FL_ENABLED;
360 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { 277 if (first)
361 if (control_ops_alloc(ops)) 278 add_ftrace_ops(&ftrace_ops_list, &global_ops);
362 return -ENOMEM;
363 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
364 } else 279 } else
365 add_ftrace_ops(&ftrace_ops_list, ops); 280 add_ftrace_ops(&ftrace_ops_list, ops);
366 281
@@ -384,23 +299,11 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
384 return -EINVAL; 299 return -EINVAL;
385 300
386 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 301 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
387 ret = remove_ftrace_list_ops(&ftrace_global_list, 302 ret = remove_ftrace_ops(&ftrace_global_list, ops);
388 &global_ops, ops); 303 if (!ret && ftrace_global_list == &ftrace_list_end)
304 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
389 if (!ret) 305 if (!ret)
390 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 306 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
391 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
392 ret = remove_ftrace_list_ops(&ftrace_control_list,
393 &control_ops, ops);
394 if (!ret) {
395 /*
396 * The ftrace_ops is now removed from the list,
397 * so there'll be no new users. We must ensure
398 * all current users are done before we free
399 * the control data.
400 */
401 synchronize_sched();
402 control_ops_free(ops);
403 }
404 } else 307 } else
405 ret = remove_ftrace_ops(&ftrace_ops_list, ops); 308 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
406 309
@@ -799,8 +702,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
799} 702}
800 703
801static void 704static void
802function_profile_call(unsigned long ip, unsigned long parent_ip, 705function_profile_call(unsigned long ip, unsigned long parent_ip)
803 struct ftrace_ops *ops, struct pt_regs *regs)
804{ 706{
805 struct ftrace_profile_stat *stat; 707 struct ftrace_profile_stat *stat;
806 struct ftrace_profile *rec; 708 struct ftrace_profile *rec;
@@ -830,7 +732,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
830#ifdef CONFIG_FUNCTION_GRAPH_TRACER 732#ifdef CONFIG_FUNCTION_GRAPH_TRACER
831static int profile_graph_entry(struct ftrace_graph_ent *trace) 733static int profile_graph_entry(struct ftrace_graph_ent *trace)
832{ 734{
833 function_profile_call(trace->func, 0, NULL, NULL); 735 function_profile_call(trace->func, 0);
834 return 1; 736 return 1;
835} 737}
836 738
@@ -890,7 +792,6 @@ static void unregister_ftrace_profiler(void)
890#else 792#else
891static struct ftrace_ops ftrace_profile_ops __read_mostly = { 793static struct ftrace_ops ftrace_profile_ops __read_mostly = {
892 .func = function_profile_call, 794 .func = function_profile_call,
893 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
894}; 795};
895 796
896static int register_ftrace_profiler(void) 797static int register_ftrace_profiler(void)
@@ -1045,6 +946,13 @@ struct ftrace_func_probe {
1045 struct rcu_head rcu; 946 struct rcu_head rcu;
1046}; 947};
1047 948
949enum {
950 FTRACE_UPDATE_CALLS = (1 << 0),
951 FTRACE_DISABLE_CALLS = (1 << 1),
952 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
953 FTRACE_START_FUNC_RET = (1 << 3),
954 FTRACE_STOP_FUNC_RET = (1 << 4),
955};
1048struct ftrace_func_entry { 956struct ftrace_func_entry {
1049 struct hlist_node hlist; 957 struct hlist_node hlist;
1050 unsigned long ip; 958 unsigned long ip;
@@ -1073,22 +981,20 @@ static struct ftrace_ops global_ops = {
1073 .func = ftrace_stub, 981 .func = ftrace_stub,
1074 .notrace_hash = EMPTY_HASH, 982 .notrace_hash = EMPTY_HASH,
1075 .filter_hash = EMPTY_HASH, 983 .filter_hash = EMPTY_HASH,
1076 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
1077}; 984};
1078 985
986static struct dyn_ftrace *ftrace_new_addrs;
987
1079static DEFINE_MUTEX(ftrace_regex_lock); 988static DEFINE_MUTEX(ftrace_regex_lock);
1080 989
1081struct ftrace_page { 990struct ftrace_page {
1082 struct ftrace_page *next; 991 struct ftrace_page *next;
1083 struct dyn_ftrace *records;
1084 int index; 992 int index;
1085 int size; 993 struct dyn_ftrace records[];
1086}; 994};
1087 995
1088static struct ftrace_page *ftrace_new_pgs; 996#define ENTRIES_PER_PAGE \
1089 997 ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
1090#define ENTRY_SIZE sizeof(struct dyn_ftrace)
1091#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
1092 998
1093/* estimate from running different kernels */ 999/* estimate from running different kernels */
1094#define NR_TO_INIT 10000 1000#define NR_TO_INIT 10000
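
The restored ftrace_page keeps its records as a flexible array member and derives ENTRIES_PER_PAGE from the space left in one page after the header, rather than pointing at a separately allocated block. A user-space model of that layout (4096 stands in for PAGE_SIZE; names are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

struct record {
	unsigned long ip;
	unsigned long flags;
};

struct page_of_records {
	struct page_of_records *next;
	int index;			/* how many records are in use */
	struct record records[];	/* flexible array fills the rest */
};

#define ENTRIES_PER_PAGE \
	((PAGE_SIZE - sizeof(struct page_of_records)) / sizeof(struct record))

int main(void)
{
	struct page_of_records *pg = calloc(1, PAGE_SIZE);

	if (!pg)
		return 1;

	printf("%zu records fit in one page\n", (size_t)ENTRIES_PER_PAGE);
	pg->records[pg->index++].ip = 0x1000;	/* append a record */
	free(pg);
	return 0;
}
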
@@ -1096,10 +1002,7 @@ static struct ftrace_page *ftrace_new_pgs;
1096static struct ftrace_page *ftrace_pages_start; 1002static struct ftrace_page *ftrace_pages_start;
1097static struct ftrace_page *ftrace_pages; 1003static struct ftrace_page *ftrace_pages;
1098 1004
1099static bool ftrace_hash_empty(struct ftrace_hash *hash) 1005static struct dyn_ftrace *ftrace_free_records;
1100{
1101 return !hash || !hash->count;
1102}
1103 1006
1104static struct ftrace_func_entry * 1007static struct ftrace_func_entry *
1105ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) 1008ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
@@ -1109,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1109 struct hlist_head *hhd; 1012 struct hlist_head *hhd;
1110 struct hlist_node *n; 1013 struct hlist_node *n;
1111 1014
1112 if (ftrace_hash_empty(hash)) 1015 if (!hash->count)
1113 return NULL; 1016 return NULL;
1114 1017
1115 if (hash->size_bits > 0) 1018 if (hash->size_bits > 0)
@@ -1216,12 +1119,6 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1216 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); 1119 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1217} 1120}
1218 1121
1219void ftrace_free_filter(struct ftrace_ops *ops)
1220{
1221 free_ftrace_hash(ops->filter_hash);
1222 free_ftrace_hash(ops->notrace_hash);
1223}
1224
1225static struct ftrace_hash *alloc_ftrace_hash(int size_bits) 1122static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1226{ 1123{
1227 struct ftrace_hash *hash; 1124 struct ftrace_hash *hash;
@@ -1232,7 +1129,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1232 return NULL; 1129 return NULL;
1233 1130
1234 size = 1 << size_bits; 1131 size = 1 << size_bits;
1235 hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); 1132 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
1236 1133
1237 if (!hash->buckets) { 1134 if (!hash->buckets) {
1238 kfree(hash); 1135 kfree(hash);
@@ -1259,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1259 return NULL; 1156 return NULL;
1260 1157
1261 /* Empty hash? */ 1158 /* Empty hash? */
1262 if (ftrace_hash_empty(hash)) 1159 if (!hash || !hash->count)
1263 return new_hash; 1160 return new_hash;
1264 1161
1265 size = 1 << hash->size_bits; 1162 size = 1 << hash->size_bits;
@@ -1313,9 +1210,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1313 if (!src->count) { 1210 if (!src->count) {
1314 free_ftrace_hash_rcu(*dst); 1211 free_ftrace_hash_rcu(*dst);
1315 rcu_assign_pointer(*dst, EMPTY_HASH); 1212 rcu_assign_pointer(*dst, EMPTY_HASH);
1316 /* still need to update the function records */ 1213 return 0;
1317 ret = 0;
1318 goto out;
1319 } 1214 }
1320 1215
1321 /* 1216 /*
@@ -1384,9 +1279,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1384 filter_hash = rcu_dereference_raw(ops->filter_hash); 1279 filter_hash = rcu_dereference_raw(ops->filter_hash);
1385 notrace_hash = rcu_dereference_raw(ops->notrace_hash); 1280 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1386 1281
1387 if ((ftrace_hash_empty(filter_hash) || 1282 if ((!filter_hash || !filter_hash->count ||
1388 ftrace_lookup_ip(filter_hash, ip)) && 1283 ftrace_lookup_ip(filter_hash, ip)) &&
1389 (ftrace_hash_empty(notrace_hash) || 1284 (!notrace_hash || !notrace_hash->count ||
1390 !ftrace_lookup_ip(notrace_hash, ip))) 1285 !ftrace_lookup_ip(notrace_hash, ip)))
1391 ret = 1; 1286 ret = 1;
1392 else 1287 else
@@ -1409,76 +1304,6 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1409 } \ 1304 } \
1410 } 1305 }
1411 1306
1412
1413static int ftrace_cmp_recs(const void *a, const void *b)
1414{
1415 const struct dyn_ftrace *key = a;
1416 const struct dyn_ftrace *rec = b;
1417
1418 if (key->flags < rec->ip)
1419 return -1;
1420 if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
1421 return 1;
1422 return 0;
1423}
1424
1425static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
1426{
1427 struct ftrace_page *pg;
1428 struct dyn_ftrace *rec;
1429 struct dyn_ftrace key;
1430
1431 key.ip = start;
1432 key.flags = end; /* overload flags, as it is unsigned long */
1433
1434 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1435 if (end < pg->records[0].ip ||
1436 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
1437 continue;
1438 rec = bsearch(&key, pg->records, pg->index,
1439 sizeof(struct dyn_ftrace),
1440 ftrace_cmp_recs);
1441 if (rec)
1442 return rec->ip;
1443 }
1444
1445 return 0;
1446}
1447
1448/**
1449 * ftrace_location - return true if the ip giving is a traced location
1450 * @ip: the instruction pointer to check
1451 *
1452 * Returns rec->ip if @ip given is a pointer to a ftrace location.
1453 * That is, the instruction that is either a NOP or call to
1454 * the function tracer. It checks the ftrace internal tables to
1455 * determine if the address belongs or not.
1456 */
1457unsigned long ftrace_location(unsigned long ip)
1458{
1459 return ftrace_location_range(ip, ip);
1460}
1461
1462/**
1463 * ftrace_text_reserved - return true if range contains an ftrace location
1464 * @start: start of range to search
1465 * @end: end of range to search (inclusive). @end points to the last byte to check.
1466 *
1467 * Returns 1 if @start and @end contains a ftrace location.
1468 * That is, the instruction that is either a NOP or call to
1469 * the function tracer. It checks the ftrace internal tables to
1470 * determine if the address belongs or not.
1471 */
1472int ftrace_text_reserved(void *start, void *end)
1473{
1474 unsigned long ret;
1475
1476 ret = ftrace_location_range((unsigned long)start,
1477 (unsigned long)end);
1478
1479 return (int)!!ret;
1480}
1481
1482static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1307static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1483 int filter_hash, 1308 int filter_hash,
1484 bool inc) 1309 bool inc)
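
The deleted ftrace_location_range() relied on each page's records being sorted by address, so a bsearch() with a range-aware comparator could answer "is this ip a patched call site?" in O(log n). A user-space model of that lookup (sizes and addresses are made up):

#include <stdio.h>
#include <stdlib.h>

#define INSN_SIZE 5UL	/* stand-in for MCOUNT_INSN_SIZE */

struct rec {
	unsigned long ip;
	unsigned long flags;	/* overloaded as "end" in the search key */
};

static int cmp_recs(const void *a, const void *b)
{
	const struct rec *key = a;
	const struct rec *rec = b;

	if (key->flags < rec->ip)		/* key range ends before rec */
		return -1;
	if (key->ip >= rec->ip + INSN_SIZE)	/* key range starts after rec */
		return 1;
	return 0;
}

static unsigned long location_range(const struct rec *recs, size_t n,
				    unsigned long start, unsigned long end)
{
	struct rec key = { .ip = start, .flags = end };
	const struct rec *found;

	found = bsearch(&key, recs, n, sizeof(*recs), cmp_recs);
	return found ? found->ip : 0;
}

int main(void)
{
	struct rec recs[] = { { 0x1000 }, { 0x1020 }, { 0x1040 } };

	printf("%#lx\n", location_range(recs, 3, 0x1022, 0x1022)); /* 0x1020 */
	printf("%#lx\n", location_range(recs, 3, 0x1100, 0x1100)); /* 0 */
	return 0;
}
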
@@ -1508,7 +1333,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1508 if (filter_hash) { 1333 if (filter_hash) {
1509 hash = ops->filter_hash; 1334 hash = ops->filter_hash;
1510 other_hash = ops->notrace_hash; 1335 other_hash = ops->notrace_hash;
1511 if (ftrace_hash_empty(hash)) 1336 if (!hash || !hash->count)
1512 all = 1; 1337 all = 1;
1513 } else { 1338 } else {
1514 inc = !inc; 1339 inc = !inc;
@@ -1518,7 +1343,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1518 * If the notrace hash has no items, 1343 * If the notrace hash has no items,
1519 * then there's nothing to do. 1344 * then there's nothing to do.
1520 */ 1345 */
1521 if (ftrace_hash_empty(hash)) 1346 if (hash && !hash->count)
1522 return; 1347 return;
1523 } 1348 }
1524 1349
@@ -1535,8 +1360,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1535 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) 1360 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1536 match = 1; 1361 match = 1;
1537 } else { 1362 } else {
1538 in_hash = !!ftrace_lookup_ip(hash, rec->ip); 1363 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
1539 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); 1364 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
1540 1365
1541 /* 1366 /*
1542 * 1367 *
@@ -1544,7 +1369,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1544 if (filter_hash && in_hash && !in_other_hash) 1369 if (filter_hash && in_hash && !in_other_hash)
1545 match = 1; 1370 match = 1;
1546 else if (!filter_hash && in_hash && 1371 else if (!filter_hash && in_hash &&
1547 (in_other_hash || ftrace_hash_empty(other_hash))) 1372 (in_other_hash || !other_hash->count))
1548 match = 1; 1373 match = 1;
1549 } 1374 }
1550 if (!match) 1375 if (!match)
@@ -1554,12 +1379,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1554 rec->flags++; 1379 rec->flags++;
1555 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) 1380 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1556 return; 1381 return;
1557 /*
1558 * If any ops wants regs saved for this function
1559 * then all ops will get saved regs.
1560 */
1561 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
1562 rec->flags |= FTRACE_FL_REGS;
1563 } else { 1382 } else {
1564 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) 1383 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1565 return; 1384 return;
@@ -1584,6 +1403,65 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1584 __ftrace_hash_rec_update(ops, filter_hash, 1); 1403 __ftrace_hash_rec_update(ops, filter_hash, 1);
1585} 1404}
1586 1405
1406static void ftrace_free_rec(struct dyn_ftrace *rec)
1407{
1408 rec->freelist = ftrace_free_records;
1409 ftrace_free_records = rec;
1410 rec->flags |= FTRACE_FL_FREE;
1411}
1412
1413static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
1414{
1415 struct dyn_ftrace *rec;
1416
1417 /* First check for freed records */
1418 if (ftrace_free_records) {
1419 rec = ftrace_free_records;
1420
1421 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
1422 FTRACE_WARN_ON_ONCE(1);
1423 ftrace_free_records = NULL;
1424 return NULL;
1425 }
1426
1427 ftrace_free_records = rec->freelist;
1428 memset(rec, 0, sizeof(*rec));
1429 return rec;
1430 }
1431
1432 if (ftrace_pages->index == ENTRIES_PER_PAGE) {
1433 if (!ftrace_pages->next) {
1434 /* allocate another page */
1435 ftrace_pages->next =
1436 (void *)get_zeroed_page(GFP_KERNEL);
1437 if (!ftrace_pages->next)
1438 return NULL;
1439 }
1440 ftrace_pages = ftrace_pages->next;
1441 }
1442
1443 return &ftrace_pages->records[ftrace_pages->index++];
1444}
1445
1446static struct dyn_ftrace *
1447ftrace_record_ip(unsigned long ip)
1448{
1449 struct dyn_ftrace *rec;
1450
1451 if (ftrace_disabled)
1452 return NULL;
1453
1454 rec = ftrace_alloc_dyn_node(ip);
1455 if (!rec)
1456 return NULL;
1457
1458 rec->ip = ip;
1459 rec->newlist = ftrace_new_addrs;
1460 ftrace_new_addrs = rec;
1461
1462 return rec;
1463}
1464
1587static void print_ip_ins(const char *fmt, unsigned char *p) 1465static void print_ip_ins(const char *fmt, unsigned char *p)
1588{ 1466{
1589 int i; 1467 int i;
@@ -1594,19 +1472,7 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1594 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1472 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1595} 1473}
1596 1474
1597/** 1475static void ftrace_bug(int failed, unsigned long ip)
1598 * ftrace_bug - report and shutdown function tracer
1599 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1600 * @ip: The address that failed
1601 *
1602 * The arch code that enables or disables the function tracing
1603 * can call ftrace_bug() when it has detected a problem in
1604 * modifying the code. @failed should be one of either:
1605 * EFAULT - if the problem happens on reading the @ip address
1606 * EINVAL - if what is read at @ip is not what was expected
1607 * EPERM - if the problem happens on writting to the @ip address
1608 */
1609void ftrace_bug(int failed, unsigned long ip)
1610{ 1476{
1611 switch (failed) { 1477 switch (failed) {
1612 case -EFAULT: 1478 case -EFAULT:
@@ -1633,10 +1499,30 @@ void ftrace_bug(int failed, unsigned long ip)
1633 } 1499 }
1634} 1500}
1635 1501
1636static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1502
1503/* Return 1 if the address range is reserved for ftrace */
1504int ftrace_text_reserved(void *start, void *end)
1505{
1506 struct dyn_ftrace *rec;
1507 struct ftrace_page *pg;
1508
1509 do_for_each_ftrace_rec(pg, rec) {
1510 if (rec->ip <= (unsigned long)end &&
1511 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1512 return 1;
1513 } while_for_each_ftrace_rec();
1514 return 0;
1515}
1516
1517
1518static int
1519__ftrace_replace_code(struct dyn_ftrace *rec, int update)
1637{ 1520{
1521 unsigned long ftrace_addr;
1638 unsigned long flag = 0UL; 1522 unsigned long flag = 0UL;
1639 1523
1524 ftrace_addr = (unsigned long)FTRACE_ADDR;
1525
1640 /* 1526 /*
1641 * If we are updating calls: 1527 * If we are updating calls:
1642 * 1528 *
@@ -1648,131 +1534,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1648 * If we are disabling calls, then disable all records that 1534 * If we are disabling calls, then disable all records that
1649 * are enabled. 1535 * are enabled.
1650 */ 1536 */
1651 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1537 if (update && (rec->flags & ~FTRACE_FL_MASK))
1652 flag = FTRACE_FL_ENABLED; 1538 flag = FTRACE_FL_ENABLED;
1653 1539
1654 /*
1655 * If enabling and the REGS flag does not match the REGS_EN, then
1656 * do not ignore this record. Set flags to fail the compare against
1657 * ENABLED.
1658 */
1659 if (flag &&
1660 (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
1661 flag |= FTRACE_FL_REGS;
1662
1663 /* If the state of this record hasn't changed, then do nothing */ 1540 /* If the state of this record hasn't changed, then do nothing */
1664 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1541 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1665 return FTRACE_UPDATE_IGNORE;
1666
1667 if (flag) {
1668 /* Save off if rec is being enabled (for return value) */
1669 flag ^= rec->flags & FTRACE_FL_ENABLED;
1670
1671 if (update) {
1672 rec->flags |= FTRACE_FL_ENABLED;
1673 if (flag & FTRACE_FL_REGS) {
1674 if (rec->flags & FTRACE_FL_REGS)
1675 rec->flags |= FTRACE_FL_REGS_EN;
1676 else
1677 rec->flags &= ~FTRACE_FL_REGS_EN;
1678 }
1679 }
1680
1681 /*
1682 * If this record is being updated from a nop, then
1683 * return UPDATE_MAKE_CALL.
1684 * Otherwise, if the EN flag is set, then return
1685 * UPDATE_MODIFY_CALL_REGS to tell the caller to convert
1686 * from the non-save regs, to a save regs function.
1687 * Otherwise,
1688 * return UPDATE_MODIFY_CALL to tell the caller to convert
1689 * from the save regs, to a non-save regs function.
1690 */
1691 if (flag & FTRACE_FL_ENABLED)
1692 return FTRACE_UPDATE_MAKE_CALL;
1693 else if (rec->flags & FTRACE_FL_REGS_EN)
1694 return FTRACE_UPDATE_MODIFY_CALL_REGS;
1695 else
1696 return FTRACE_UPDATE_MODIFY_CALL;
1697 }
1698
1699 if (update) {
1700 /* If there's no more users, clear all flags */
1701 if (!(rec->flags & ~FTRACE_FL_MASK))
1702 rec->flags = 0;
1703 else
1704 /* Just disable the record (keep REGS state) */
1705 rec->flags &= ~FTRACE_FL_ENABLED;
1706 }
1707
1708 return FTRACE_UPDATE_MAKE_NOP;
1709}
1710
1711/**
1712 * ftrace_update_record, set a record that now is tracing or not
1713 * @rec: the record to update
1714 * @enable: set to 1 if the record is tracing, zero to force disable
1715 *
1716 * The records that represent all functions that can be traced need
1717 * to be updated when tracing has been enabled.
1718 */
1719int ftrace_update_record(struct dyn_ftrace *rec, int enable)
1720{
1721 return ftrace_check_record(rec, enable, 1);
1722}
1723
1724/**
1725 * ftrace_test_record, check if the record has been enabled or not
1726 * @rec: the record to test
1727 * @enable: set to 1 to check if enabled, 0 if it is disabled
1728 *
1729 * The arch code may need to test if a record is already set to
1730 * tracing to determine how to modify the function code that it
1731 * represents.
1732 */
1733int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1734{
1735 return ftrace_check_record(rec, enable, 0);
1736}
1737
1738static int
1739__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1740{
1741 unsigned long ftrace_old_addr;
1742 unsigned long ftrace_addr;
1743 int ret;
1744
1745 ret = ftrace_update_record(rec, enable);
1746
1747 if (rec->flags & FTRACE_FL_REGS)
1748 ftrace_addr = (unsigned long)FTRACE_REGS_ADDR;
1749 else
1750 ftrace_addr = (unsigned long)FTRACE_ADDR;
1751
1752 switch (ret) {
1753 case FTRACE_UPDATE_IGNORE:
1754 return 0; 1542 return 0;
1755 1543
1756 case FTRACE_UPDATE_MAKE_CALL: 1544 if (flag) {
1545 rec->flags |= FTRACE_FL_ENABLED;
1757 return ftrace_make_call(rec, ftrace_addr); 1546 return ftrace_make_call(rec, ftrace_addr);
1758
1759 case FTRACE_UPDATE_MAKE_NOP:
1760 return ftrace_make_nop(NULL, rec, ftrace_addr);
1761
1762 case FTRACE_UPDATE_MODIFY_CALL_REGS:
1763 case FTRACE_UPDATE_MODIFY_CALL:
1764 if (rec->flags & FTRACE_FL_REGS)
1765 ftrace_old_addr = (unsigned long)FTRACE_ADDR;
1766 else
1767 ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
1768
1769 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
1770 } 1547 }
1771 1548
1772 return -1; /* unknow ftrace bug */ 1549 rec->flags &= ~FTRACE_FL_ENABLED;
1550 return ftrace_make_nop(NULL, rec, ftrace_addr);
1773} 1551}
1774 1552
1775void __weak ftrace_replace_code(int enable) 1553static void ftrace_replace_code(int update)
1776{ 1554{
1777 struct dyn_ftrace *rec; 1555 struct dyn_ftrace *rec;
1778 struct ftrace_page *pg; 1556 struct ftrace_page *pg;
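
The replacement logic above collapses the newer ftrace_check_record() state machine (IGNORE / MAKE_CALL / MAKE_NOP plus the MODIFY_CALL and REGS variants) back into a single enable/disable decision keyed on the record's reference count. A user-space model of the simplified decision that remains:

#include <stdio.h>

#define FL_ENABLED	(1UL << 31)
#define FL_MASK		(FL_ENABLED)	/* flag bits; the rest is a refcount */

enum action { MAKE_CALL, MAKE_NOP, IGNORE };

static enum action replace_action(unsigned long *flags, int update)
{
	unsigned long want = 0;

	if (update && (*flags & ~FL_MASK))	/* record has registered users */
		want = FL_ENABLED;

	if ((*flags & FL_ENABLED) == want)	/* nothing to change */
		return IGNORE;

	if (want) {
		*flags |= FL_ENABLED;
		return MAKE_CALL;		/* patch NOP into a call */
	}
	*flags &= ~FL_ENABLED;
	return MAKE_NOP;			/* patch call back into a NOP */
}

int main(void)
{
	unsigned long flags = 1;	/* one user, not yet enabled */

	printf("%d\n", replace_action(&flags, 1));	/* MAKE_CALL (0) */
	printf("%d\n", replace_action(&flags, 1));	/* IGNORE (2) */
	printf("%d\n", replace_action(&flags, 0));	/* MAKE_NOP (1) */
	return 0;
}
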
@@ -1782,7 +1560,11 @@ void __weak ftrace_replace_code(int enable)
1782 return; 1560 return;
1783 1561
1784 do_for_each_ftrace_rec(pg, rec) { 1562 do_for_each_ftrace_rec(pg, rec) {
1785 failed = __ftrace_replace_code(rec, enable); 1563 /* Skip over free records */
1564 if (rec->flags & FTRACE_FL_FREE)
1565 continue;
1566
1567 failed = __ftrace_replace_code(rec, update);
1786 if (failed) { 1568 if (failed) {
1787 ftrace_bug(failed, rec->ip); 1569 ftrace_bug(failed, rec->ip);
1788 /* Stop processing */ 1570 /* Stop processing */
@@ -1791,78 +1573,6 @@ void __weak ftrace_replace_code(int enable)
1791 } while_for_each_ftrace_rec(); 1573 } while_for_each_ftrace_rec();
1792} 1574}
1793 1575
1794struct ftrace_rec_iter {
1795 struct ftrace_page *pg;
1796 int index;
1797};
1798
1799/**
1800 * ftrace_rec_iter_start, start up iterating over traced functions
1801 *
1802 * Returns an iterator handle that is used to iterate over all
1803 * the records that represent address locations where functions
1804 * are traced.
1805 *
1806 * May return NULL if no records are available.
1807 */
1808struct ftrace_rec_iter *ftrace_rec_iter_start(void)
1809{
1810 /*
1811 * We only use a single iterator.
1812 * Protected by the ftrace_lock mutex.
1813 */
1814 static struct ftrace_rec_iter ftrace_rec_iter;
1815 struct ftrace_rec_iter *iter = &ftrace_rec_iter;
1816
1817 iter->pg = ftrace_pages_start;
1818 iter->index = 0;
1819
1820 /* Could have empty pages */
1821 while (iter->pg && !iter->pg->index)
1822 iter->pg = iter->pg->next;
1823
1824 if (!iter->pg)
1825 return NULL;
1826
1827 return iter;
1828}
1829
1830/**
1831 * ftrace_rec_iter_next, get the next record to process.
1832 * @iter: The handle to the iterator.
1833 *
1834 * Returns the next iterator after the given iterator @iter.
1835 */
1836struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
1837{
1838 iter->index++;
1839
1840 if (iter->index >= iter->pg->index) {
1841 iter->pg = iter->pg->next;
1842 iter->index = 0;
1843
1844 /* Could have empty pages */
1845 while (iter->pg && !iter->pg->index)
1846 iter->pg = iter->pg->next;
1847 }
1848
1849 if (!iter->pg)
1850 return NULL;
1851
1852 return iter;
1853}
1854
1855/**
1856 * ftrace_rec_iter_record, get the record at the iterator location
1857 * @iter: The current iterator location
1858 *
1859 * Returns the record that the current @iter is at.
1860 */
1861struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
1862{
1863 return &iter->pg->records[iter->index];
1864}
1865
1866static int 1576static int
1867ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 1577ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1868{ 1578{
@@ -1900,55 +1610,44 @@ int __weak ftrace_arch_code_modify_post_process(void)
1900 return 0; 1610 return 0;
1901} 1611}
1902 1612
1903void ftrace_modify_all_code(int command) 1613static int __ftrace_modify_code(void *data)
1904{ 1614{
1905 if (command & FTRACE_UPDATE_CALLS) 1615 int *command = data;
1616
1617 /*
1618 * Do not call function tracer while we update the code.
1619 * We are in stop machine, no worrying about races.
1620 */
1621 function_trace_stop++;
1622
1623 if (*command & FTRACE_UPDATE_CALLS)
1906 ftrace_replace_code(1); 1624 ftrace_replace_code(1);
1907 else if (command & FTRACE_DISABLE_CALLS) 1625 else if (*command & FTRACE_DISABLE_CALLS)
1908 ftrace_replace_code(0); 1626 ftrace_replace_code(0);
1909 1627
1910 if (command & FTRACE_UPDATE_TRACE_FUNC) 1628 if (*command & FTRACE_UPDATE_TRACE_FUNC)
1911 ftrace_update_ftrace_func(ftrace_trace_function); 1629 ftrace_update_ftrace_func(ftrace_trace_function);
1912 1630
1913 if (command & FTRACE_START_FUNC_RET) 1631 if (*command & FTRACE_START_FUNC_RET)
1914 ftrace_enable_ftrace_graph_caller(); 1632 ftrace_enable_ftrace_graph_caller();
1915 else if (command & FTRACE_STOP_FUNC_RET) 1633 else if (*command & FTRACE_STOP_FUNC_RET)
1916 ftrace_disable_ftrace_graph_caller(); 1634 ftrace_disable_ftrace_graph_caller();
1917}
1918
1919static int __ftrace_modify_code(void *data)
1920{
1921 int *command = data;
1922 1635
1923 ftrace_modify_all_code(*command); 1636#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1637 /*
1638 * For archs that call ftrace_test_stop_func(), we must
1639 * wait till after we update all the function callers
1640 * before we update the callback. This keeps different
1641 * ops that record different functions from corrupting
1642 * each other.
1643 */
1644 __ftrace_trace_function = __ftrace_trace_function_delay;
1645#endif
1646 function_trace_stop--;
1924 1647
1925 return 0; 1648 return 0;
1926} 1649}
1927 1650
1928/**
1929 * ftrace_run_stop_machine, go back to the stop machine method
1930 * @command: The command to tell ftrace what to do
1931 *
1932 * If an arch needs to fall back to the stop machine method, the
1933 * it can call this function.
1934 */
1935void ftrace_run_stop_machine(int command)
1936{
1937 stop_machine(__ftrace_modify_code, &command, NULL);
1938}
1939
1940/**
1941 * arch_ftrace_update_code, modify the code to trace or not trace
1942 * @command: The command that needs to be done
1943 *
1944 * Archs can override this function if it does not need to
1945 * run stop_machine() to modify code.
1946 */
1947void __weak arch_ftrace_update_code(int command)
1948{
1949 ftrace_run_stop_machine(command);
1950}
1951
1952static void ftrace_run_update_code(int command) 1651static void ftrace_run_update_code(int command)
1953{ 1652{
1954 int ret; 1653 int ret;
@@ -1957,21 +1656,8 @@ static void ftrace_run_update_code(int command)
1957 FTRACE_WARN_ON(ret); 1656 FTRACE_WARN_ON(ret);
1958 if (ret) 1657 if (ret)
1959 return; 1658 return;
1960 /*
1961 * Do not call function tracer while we update the code.
1962 * We are in stop machine.
1963 */
1964 function_trace_stop++;
1965 1659
1966 /* 1660 stop_machine(__ftrace_modify_code, &command, NULL);
1967 * By default we use stop_machine() to modify the code.
1968 * But archs can do what ever they want as long as it
1969 * is safe. The stop_machine() is the safest, but also
1970 * produces the most overhead.
1971 */
1972 arch_ftrace_update_code(command);
1973
1974 function_trace_stop--;
1975 1661
1976 ret = ftrace_arch_code_modify_post_process(); 1662 ret = ftrace_arch_code_modify_post_process();
1977 FTRACE_WARN_ON(ret); 1663 FTRACE_WARN_ON(ret);
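
With arch_ftrace_update_code() gone, this tree always patches call sites under stop_machine(): the callback runs on one CPU while the others are parked with interrupts off, so no CPU can execute an instruction that is being rewritten. A kernel-style sketch of that pattern (illustrative, not the ftrace code itself):

#include <linux/stop_machine.h>

static int __patch_text(void *data)
{
	int *command = data;

	/* All other CPUs are held by stop_machine; safe to modify code here. */
	(void)*command;
	return 0;
}

static void run_patch(int command)
{
	/* NULL cpumask: the callback runs on one CPU, the rest are parked. */
	stop_machine(__patch_text, &command, NULL);
}
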
@@ -2098,16 +1784,14 @@ static int ops_traces_mod(struct ftrace_ops *ops)
2098 struct ftrace_hash *hash; 1784 struct ftrace_hash *hash;
2099 1785
2100 hash = ops->filter_hash; 1786 hash = ops->filter_hash;
2101 return ftrace_hash_empty(hash); 1787 return !!(!hash || !hash->count);
2102} 1788}
2103 1789
2104static int ftrace_update_code(struct module *mod) 1790static int ftrace_update_code(struct module *mod)
2105{ 1791{
2106 struct ftrace_page *pg;
2107 struct dyn_ftrace *p; 1792 struct dyn_ftrace *p;
2108 cycle_t start, stop; 1793 cycle_t start, stop;
2109 unsigned long ref = 0; 1794 unsigned long ref = 0;
2110 int i;
2111 1795
2112 /* 1796 /*
2113 * When adding a module, we need to check if tracers are 1797 * When adding a module, we need to check if tracers are
@@ -2129,44 +1813,46 @@ static int ftrace_update_code(struct module *mod)
2129 start = ftrace_now(raw_smp_processor_id()); 1813 start = ftrace_now(raw_smp_processor_id());
2130 ftrace_update_cnt = 0; 1814 ftrace_update_cnt = 0;
2131 1815
2132 for (pg = ftrace_new_pgs; pg; pg = pg->next) { 1816 while (ftrace_new_addrs) {
2133 1817
2134 for (i = 0; i < pg->index; i++) { 1818 /* If something went wrong, bail without enabling anything */
2135 /* If something went wrong, bail without enabling anything */ 1819 if (unlikely(ftrace_disabled))
2136 if (unlikely(ftrace_disabled)) 1820 return -1;
2137 return -1;
2138 1821
2139 p = &pg->records[i]; 1822 p = ftrace_new_addrs;
2140 p->flags = ref; 1823 ftrace_new_addrs = p->newlist;
1824 p->flags = ref;
2141 1825
2142 /* 1826 /*
2143 * Do the initial record conversion from mcount jump 1827 * Do the initial record conversion from mcount jump
2144 * to the NOP instructions. 1828 * to the NOP instructions.
2145 */ 1829 */
2146 if (!ftrace_code_disable(mod, p)) 1830 if (!ftrace_code_disable(mod, p)) {
2147 break; 1831 ftrace_free_rec(p);
1832 /* Game over */
1833 break;
1834 }
2148 1835
2149 ftrace_update_cnt++; 1836 ftrace_update_cnt++;
2150 1837
2151 /* 1838 /*
2152 * If the tracing is enabled, go ahead and enable the record. 1839 * If the tracing is enabled, go ahead and enable the record.
2153 * 1840 *
2154 * The reason not to enable the record immediatelly is the 1841 * The reason not to enable the record immediatelly is the
2155 * inherent check of ftrace_make_nop/ftrace_make_call for 1842 * inherent check of ftrace_make_nop/ftrace_make_call for
2156 * correct previous instructions. Making first the NOP 1843 * correct previous instructions. Making first the NOP
2157 * conversion puts the module to the correct state, thus 1844 * conversion puts the module to the correct state, thus
2158 * passing the ftrace_make_call check. 1845 * passing the ftrace_make_call check.
2159 */ 1846 */
2160 if (ftrace_start_up && ref) { 1847 if (ftrace_start_up && ref) {
2161 int failed = __ftrace_replace_code(p, 1); 1848 int failed = __ftrace_replace_code(p, 1);
2162 if (failed) 1849 if (failed) {
2163 ftrace_bug(failed, p->ip); 1850 ftrace_bug(failed, p->ip);
1851 ftrace_free_rec(p);
2164 } 1852 }
2165 } 1853 }
2166 } 1854 }
2167 1855
2168 ftrace_new_pgs = NULL;
2169
2170 stop = ftrace_now(raw_smp_processor_id()); 1856 stop = ftrace_now(raw_smp_processor_id());
2171 ftrace_update_time = stop - start; 1857 ftrace_update_time = stop - start;
2172 ftrace_update_tot_cnt += ftrace_update_cnt; 1858 ftrace_update_tot_cnt += ftrace_update_cnt;
@@ -2174,109 +1860,58 @@ static int ftrace_update_code(struct module *mod)
2174 return 0; 1860 return 0;
2175} 1861}
2176 1862
2177static int ftrace_allocate_records(struct ftrace_page *pg, int count) 1863static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
2178{ 1864{
2179 int order; 1865 struct ftrace_page *pg;
2180 int cnt; 1866 int cnt;
1867 int i;
2181 1868
2182 if (WARN_ON(!count)) 1869 /* allocate a few pages */
2183 return -EINVAL; 1870 ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
2184 1871 if (!ftrace_pages_start)
2185 order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); 1872 return -1;
2186 1873
2187 /* 1874 /*
2188 * We want to fill as much as possible. No more than a page 1875 * Allocate a few more pages.
2189 * may be empty. 1876 *
1877 * TODO: have some parser search vmlinux before
1878 * final linking to find all calls to ftrace.
1879 * Then we can:
1880 * a) know how many pages to allocate.
1881 * and/or
1882 * b) set up the table then.
1883 *
1884 * The dynamic code is still necessary for
1885 * modules.
2190 */ 1886 */
2191 while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
2192 order--;
2193
2194 again:
2195 pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
2196 1887
2197 if (!pg->records) { 1888 pg = ftrace_pages = ftrace_pages_start;
2198 /* if we can't allocate this size, try something smaller */
2199 if (!order)
2200 return -ENOMEM;
2201 order >>= 1;
2202 goto again;
2203 }
2204
2205 cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
2206 pg->size = cnt;
2207 1889
2208 if (cnt > count) 1890 cnt = num_to_init / ENTRIES_PER_PAGE;
2209 cnt = count; 1891 pr_info("ftrace: allocating %ld entries in %d pages\n",
2210 1892 num_to_init, cnt + 1);
2211 return cnt;
2212}
2213
2214static struct ftrace_page *
2215ftrace_allocate_pages(unsigned long num_to_init)
2216{
2217 struct ftrace_page *start_pg;
2218 struct ftrace_page *pg;
2219 int order;
2220 int cnt;
2221
2222 if (!num_to_init)
2223 return 0;
2224
2225 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
2226 if (!pg)
2227 return NULL;
2228
2229 /*
2230 * Try to allocate as much as possible in one continues
2231 * location that fills in all of the space. We want to
2232 * waste as little space as possible.
2233 */
2234 for (;;) {
2235 cnt = ftrace_allocate_records(pg, num_to_init);
2236 if (cnt < 0)
2237 goto free_pages;
2238 1893
2239 num_to_init -= cnt; 1894 for (i = 0; i < cnt; i++) {
2240 if (!num_to_init) 1895 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
2241 break;
2242 1896
2243 pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); 1897 /* If we fail, we'll try later anyway */
2244 if (!pg->next) 1898 if (!pg->next)
2245 goto free_pages; 1899 break;
2246 1900
2247 pg = pg->next; 1901 pg = pg->next;
2248 } 1902 }
2249 1903
2250 return start_pg;
2251
2252 free_pages:
2253 while (start_pg) {
2254 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
2255 free_pages((unsigned long)pg->records, order);
2256 start_pg = pg->next;
2257 kfree(pg);
2258 pg = start_pg;
2259 }
2260 pr_info("ftrace: FAILED to allocate memory for functions\n");
2261 return NULL;
2262}
2263
2264static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
2265{
2266 int cnt;
2267
2268 if (!num_to_init) {
2269 pr_info("ftrace: No functions to be traced?\n");
2270 return -1;
2271 }
2272
2273 cnt = num_to_init / ENTRIES_PER_PAGE;
2274 pr_info("ftrace: allocating %ld entries in %d pages\n",
2275 num_to_init, cnt + 1);
2276
2277 return 0; 1904 return 0;
2278} 1905}
2279 1906
1907enum {
1908 FTRACE_ITER_FILTER = (1 << 0),
1909 FTRACE_ITER_NOTRACE = (1 << 1),
1910 FTRACE_ITER_PRINTALL = (1 << 2),
1911 FTRACE_ITER_HASH = (1 << 3),
1912 FTRACE_ITER_ENABLED = (1 << 4),
1913};
1914
2280#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1915#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
2281 1916
2282struct ftrace_iterator { 1917struct ftrace_iterator {
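
ftrace_allocate_records(), deleted above, asked for one physically contiguous block sized by get_count_order() and retried with a smaller order on failure instead of giving up, while the older scheme kept here simply chains single zeroed pages. A sketch of the removed allocation strategy (the entry type and helper names are illustrative):

#include <linux/bitops.h>
#include <linux/gfp.h>
#include <linux/kernel.h>

struct entry {
	unsigned long ip;
	unsigned long flags;
};

#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct entry))

static struct entry *alloc_entries(int count, int *got)
{
	int order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
	struct entry *records;

	for (;;) {
		records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
						   order);
		if (records)
			break;
		if (!order)
			return NULL;	/* even a single page failed */
		order >>= 1;		/* retry with a smaller block */
	}

	*got = (PAGE_SIZE << order) / sizeof(struct entry);
	if (*got > count)
		*got = count;
	return records;
}
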
@@ -2341,9 +1976,6 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
2341 void *p = NULL; 1976 void *p = NULL;
2342 loff_t l; 1977 loff_t l;
2343 1978
2344 if (!(iter->flags & FTRACE_ITER_DO_HASH))
2345 return NULL;
2346
2347 if (iter->func_pos > *pos) 1979 if (iter->func_pos > *pos)
2348 return NULL; 1980 return NULL;
2349 1981
@@ -2387,7 +2019,7 @@ static void *
2387t_next(struct seq_file *m, void *v, loff_t *pos) 2019t_next(struct seq_file *m, void *v, loff_t *pos)
2388{ 2020{
2389 struct ftrace_iterator *iter = m->private; 2021 struct ftrace_iterator *iter = m->private;
2390 struct ftrace_ops *ops = iter->ops; 2022 struct ftrace_ops *ops = &global_ops;
2391 struct dyn_ftrace *rec = NULL; 2023 struct dyn_ftrace *rec = NULL;
2392 2024
2393 if (unlikely(ftrace_disabled)) 2025 if (unlikely(ftrace_disabled))
@@ -2411,7 +2043,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
2411 } 2043 }
2412 } else { 2044 } else {
2413 rec = &iter->pg->records[iter->idx++]; 2045 rec = &iter->pg->records[iter->idx++];
2414 if (((iter->flags & FTRACE_ITER_FILTER) && 2046 if ((rec->flags & FTRACE_FL_FREE) ||
2047
2048 ((iter->flags & FTRACE_ITER_FILTER) &&
2415 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || 2049 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
2416 2050
2417 ((iter->flags & FTRACE_ITER_NOTRACE) && 2051 ((iter->flags & FTRACE_ITER_NOTRACE) &&
@@ -2437,13 +2071,13 @@ static void reset_iter_read(struct ftrace_iterator *iter)
2437{ 2071{
2438 iter->pos = 0; 2072 iter->pos = 0;
2439 iter->func_pos = 0; 2073 iter->func_pos = 0;
2440 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); 2074 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
2441} 2075}
2442 2076
2443static void *t_start(struct seq_file *m, loff_t *pos) 2077static void *t_start(struct seq_file *m, loff_t *pos)
2444{ 2078{
2445 struct ftrace_iterator *iter = m->private; 2079 struct ftrace_iterator *iter = m->private;
2446 struct ftrace_ops *ops = iter->ops; 2080 struct ftrace_ops *ops = &global_ops;
2447 void *p = NULL; 2081 void *p = NULL;
2448 loff_t l; 2082 loff_t l;
2449 2083
@@ -2463,8 +2097,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2463 * off, we can short cut and just print out that all 2097 * off, we can short cut and just print out that all
2464 * functions are enabled. 2098 * functions are enabled.
2465 */ 2099 */
2466 if (iter->flags & FTRACE_ITER_FILTER && 2100 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
2467 ftrace_hash_empty(ops->filter_hash)) {
2468 if (*pos > 0) 2101 if (*pos > 0)
2469 return t_hash_start(m, pos); 2102 return t_hash_start(m, pos);
2470 iter->flags |= FTRACE_ITER_PRINTALL; 2103 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2489,8 +2122,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2489 break; 2122 break;
2490 } 2123 }
2491 2124
2492 if (!p) 2125 if (!p) {
2493 return t_hash_start(m, pos); 2126 if (iter->flags & FTRACE_ITER_FILTER)
2127 return t_hash_start(m, pos);
2128
2129 return NULL;
2130 }
2494 2131
2495 return iter; 2132 return iter;
2496} 2133}
@@ -2520,9 +2157,8 @@ static int t_show(struct seq_file *m, void *v)
2520 2157
2521 seq_printf(m, "%ps", (void *)rec->ip); 2158 seq_printf(m, "%ps", (void *)rec->ip);
2522 if (iter->flags & FTRACE_ITER_ENABLED) 2159 if (iter->flags & FTRACE_ITER_ENABLED)
2523 seq_printf(m, " (%ld)%s", 2160 seq_printf(m, " (%ld)",
2524 rec->flags & ~FTRACE_FL_MASK, 2161 rec->flags & ~FTRACE_FL_MASK);
2525 rec->flags & FTRACE_FL_REGS ? " R" : "");
2526 seq_printf(m, "\n"); 2162 seq_printf(m, "\n");
2527 2163
2528 return 0; 2164 return 0;
@@ -2539,35 +2175,55 @@ static int
2539ftrace_avail_open(struct inode *inode, struct file *file) 2175ftrace_avail_open(struct inode *inode, struct file *file)
2540{ 2176{
2541 struct ftrace_iterator *iter; 2177 struct ftrace_iterator *iter;
2178 int ret;
2542 2179
2543 if (unlikely(ftrace_disabled)) 2180 if (unlikely(ftrace_disabled))
2544 return -ENODEV; 2181 return -ENODEV;
2545 2182
2546 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 2183 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2547 if (iter) { 2184 if (!iter)
2548 iter->pg = ftrace_pages_start; 2185 return -ENOMEM;
2549 iter->ops = &global_ops; 2186
2187 iter->pg = ftrace_pages_start;
2188
2189 ret = seq_open(file, &show_ftrace_seq_ops);
2190 if (!ret) {
2191 struct seq_file *m = file->private_data;
2192
2193 m->private = iter;
2194 } else {
2195 kfree(iter);
2550 } 2196 }
2551 2197
2552 return iter ? 0 : -ENOMEM; 2198 return ret;
2553} 2199}
2554 2200
2555static int 2201static int
2556ftrace_enabled_open(struct inode *inode, struct file *file) 2202ftrace_enabled_open(struct inode *inode, struct file *file)
2557{ 2203{
2558 struct ftrace_iterator *iter; 2204 struct ftrace_iterator *iter;
2205 int ret;
2559 2206
2560 if (unlikely(ftrace_disabled)) 2207 if (unlikely(ftrace_disabled))
2561 return -ENODEV; 2208 return -ENODEV;
2562 2209
2563 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 2210 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2564 if (iter) { 2211 if (!iter)
2565 iter->pg = ftrace_pages_start; 2212 return -ENOMEM;
2566 iter->flags = FTRACE_ITER_ENABLED; 2213
2567 iter->ops = &global_ops; 2214 iter->pg = ftrace_pages_start;
2215 iter->flags = FTRACE_ITER_ENABLED;
2216
2217 ret = seq_open(file, &show_ftrace_seq_ops);
2218 if (!ret) {
2219 struct seq_file *m = file->private_data;
2220
2221 m->private = iter;
2222 } else {
2223 kfree(iter);
2568 } 2224 }
2569 2225
2570 return iter ? 0 : -ENOMEM; 2226 return ret;
2571} 2227}
2572 2228
2573static void ftrace_filter_reset(struct ftrace_hash *hash) 2229static void ftrace_filter_reset(struct ftrace_hash *hash)
@@ -2577,23 +2233,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2577 mutex_unlock(&ftrace_lock); 2233 mutex_unlock(&ftrace_lock);
2578} 2234}
2579 2235
2580/** 2236static int
2581 * ftrace_regex_open - initialize function tracer filter files
2582 * @ops: The ftrace_ops that hold the hash filters
2583 * @flag: The type of filter to process
2584 * @inode: The inode, usually passed in to your open routine
2585 * @file: The file, usually passed in to your open routine
2586 *
2587 * ftrace_regex_open() initializes the filter files for the
2588 * @ops. Depending on @flag it may process the filter hash or
2589 * the notrace hash of @ops. With this called from the open
2590 * routine, you can use ftrace_filter_write() for the write
2591 * routine if @flag has FTRACE_ITER_FILTER set, or
2592 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2593 * ftrace_regex_lseek() should be used as the lseek routine, and
2594 * release must call ftrace_regex_release().
2595 */
2596int
2597ftrace_regex_open(struct ftrace_ops *ops, int flag, 2237ftrace_regex_open(struct ftrace_ops *ops, int flag,
2598 struct inode *inode, struct file *file) 2238 struct inode *inode, struct file *file)
2599{ 2239{
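The kernel-doc removed above spells out how ftrace_regex_open() is meant to be wired together with ftrace_filter_write(), ftrace_regex_lseek() and ftrace_regex_release(). A minimal sketch of that wiring, assuming the left-hand (non-static) versions of those helpers; my_ops, my_filter_open and my_filter_fops are illustrative names, not part of this patch:

static struct ftrace_ops my_ops;	/* callback assigned elsewhere; illustrative */

static int my_filter_open(struct inode *inode, struct file *file)
{
	/* FTRACE_ITER_FILTER selects the filter hash of my_ops */
	return ftrace_regex_open(&my_ops, FTRACE_ITER_FILTER, inode, file);
}

static const struct file_operations my_filter_fops = {
	.open    = my_filter_open,
	.read    = seq_read,
	.write   = ftrace_filter_write,
	.llseek  = ftrace_regex_lseek,
	.release = ftrace_regex_release,
};

This mirrors how ftrace.c wires up its own filter files further down; on the right-hand side of this patch the helpers become static, so such external wiring is only possible against the left-hand version.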
@@ -2662,9 +2302,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2662static int 2302static int
2663ftrace_filter_open(struct inode *inode, struct file *file) 2303ftrace_filter_open(struct inode *inode, struct file *file)
2664{ 2304{
2665 return ftrace_regex_open(&global_ops, 2305 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
2666 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, 2306 inode, file);
2667 inode, file);
2668} 2307}
2669 2308
2670static int 2309static int
@@ -2674,13 +2313,13 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2674 inode, file); 2313 inode, file);
2675} 2314}
2676 2315
2677loff_t 2316static loff_t
2678ftrace_regex_lseek(struct file *file, loff_t offset, int whence) 2317ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
2679{ 2318{
2680 loff_t ret; 2319 loff_t ret;
2681 2320
2682 if (file->f_mode & FMODE_READ) 2321 if (file->f_mode & FMODE_READ)
2683 ret = seq_lseek(file, offset, whence); 2322 ret = seq_lseek(file, offset, origin);
2684 else 2323 else
2685 file->f_pos = ret = 1; 2324 file->f_pos = ret = 1;
2686 2325
@@ -2783,6 +2422,7 @@ match_records(struct ftrace_hash *hash, char *buff,
2783 goto out_unlock; 2422 goto out_unlock;
2784 2423
2785 do_for_each_ftrace_rec(pg, rec) { 2424 do_for_each_ftrace_rec(pg, rec) {
2425
2786 if (ftrace_match_record(rec, mod, search, search_len, type)) { 2426 if (ftrace_match_record(rec, mod, search, search_len, type)) {
2787 ret = enter_record(hash, rec, not); 2427 ret = enter_record(hash, rec, not);
2788 if (ret < 0) { 2428 if (ret < 0) {
@@ -2868,10 +2508,10 @@ static int __init ftrace_mod_cmd_init(void)
2868{ 2508{
2869 return register_ftrace_command(&ftrace_mod_cmd); 2509 return register_ftrace_command(&ftrace_mod_cmd);
2870} 2510}
2871core_initcall(ftrace_mod_cmd_init); 2511device_initcall(ftrace_mod_cmd_init);
2872 2512
2873static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, 2513static void
2874 struct ftrace_ops *op, struct pt_regs *pt_regs) 2514function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
2875{ 2515{
2876 struct ftrace_func_probe *entry; 2516 struct ftrace_func_probe *entry;
2877 struct hlist_head *hhd; 2517 struct hlist_head *hhd;
@@ -3227,14 +2867,14 @@ out_unlock:
3227 return ret; 2867 return ret;
3228} 2868}
3229 2869
3230ssize_t 2870static ssize_t
3231ftrace_filter_write(struct file *file, const char __user *ubuf, 2871ftrace_filter_write(struct file *file, const char __user *ubuf,
3232 size_t cnt, loff_t *ppos) 2872 size_t cnt, loff_t *ppos)
3233{ 2873{
3234 return ftrace_regex_write(file, ubuf, cnt, ppos, 1); 2874 return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
3235} 2875}
3236 2876
3237ssize_t 2877static ssize_t
3238ftrace_notrace_write(struct file *file, const char __user *ubuf, 2878ftrace_notrace_write(struct file *file, const char __user *ubuf,
3239 size_t cnt, loff_t *ppos) 2879 size_t cnt, loff_t *ppos)
3240{ 2880{
@@ -3242,27 +2882,8 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
3242} 2882}
3243 2883
3244static int 2884static int
3245ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) 2885ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3246{ 2886 int reset, int enable)
3247 struct ftrace_func_entry *entry;
3248
3249 if (!ftrace_location(ip))
3250 return -EINVAL;
3251
3252 if (remove) {
3253 entry = ftrace_lookup_ip(hash, ip);
3254 if (!entry)
3255 return -ENOENT;
3256 free_hash_entry(hash, entry);
3257 return 0;
3258 }
3259
3260 return add_hash_entry(hash, ip);
3261}
3262
3263static int
3264ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3265 unsigned long ip, int remove, int reset, int enable)
3266{ 2887{
3267 struct ftrace_hash **orig_hash; 2888 struct ftrace_hash **orig_hash;
3268 struct ftrace_hash *hash; 2889 struct ftrace_hash *hash;
@@ -3287,15 +2908,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3287 mutex_lock(&ftrace_regex_lock); 2908 mutex_lock(&ftrace_regex_lock);
3288 if (reset) 2909 if (reset)
3289 ftrace_filter_reset(hash); 2910 ftrace_filter_reset(hash);
3290 if (buf && !ftrace_match_records(hash, buf, len)) { 2911 if (buf)
3291 ret = -EINVAL; 2912 ftrace_match_records(hash, buf, len);
3292 goto out_regex_unlock;
3293 }
3294 if (ip) {
3295 ret = ftrace_match_addr(hash, ip, remove);
3296 if (ret < 0)
3297 goto out_regex_unlock;
3298 }
3299 2913
3300 mutex_lock(&ftrace_lock); 2914 mutex_lock(&ftrace_lock);
3301 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 2915 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
@@ -3305,44 +2919,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3305 2919
3306 mutex_unlock(&ftrace_lock); 2920 mutex_unlock(&ftrace_lock);
3307 2921
3308 out_regex_unlock:
3309 mutex_unlock(&ftrace_regex_lock); 2922 mutex_unlock(&ftrace_regex_lock);
3310 2923
3311 free_ftrace_hash(hash); 2924 free_ftrace_hash(hash);
3312 return ret; 2925 return ret;
3313} 2926}
3314 2927
3315static int
3316ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
3317 int reset, int enable)
3318{
3319 return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable);
3320}
3321
3322/**
3323 * ftrace_set_filter_ip - set a function to filter on in ftrace by address
3324 * @ops - the ops to set the filter with
3325 * @ip - the address to add to or remove from the filter.
3326 * @remove - non zero to remove the ip from the filter
3327 * @reset - non zero to reset all filters before applying this filter.
3328 *
3329 * Filters denote which functions should be enabled when tracing is enabled.
3330 * If @ip is NULL, it fails to update the filter.
3331 */
3332int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
3333 int remove, int reset)
3334{
3335 return ftrace_set_addr(ops, ip, remove, reset, 1);
3336}
3337EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
3338
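ftrace_set_filter_ip(), removed by this hunk, filters on a single resolved address instead of a glob. A hedged usage sketch against the left-hand API; resolving the address through kallsyms_lookup_name() is only an illustration and may not be available in every configuration:

static struct ftrace_ops my_ops;	/* illustrative ops, callback assigned elsewhere */

static int __init my_filter_one_ip(void)
{
	unsigned long ip = kallsyms_lookup_name("schedule");

	if (!ip)
		return -ENOENT;

	/* remove = 0: add this address; reset = 1: clear any earlier filters */
	return ftrace_set_filter_ip(&my_ops, ip, 0, 1);
}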
3339static int
3340ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3341 int reset, int enable)
3342{
3343 return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
3344}
3345
3346/** 2928/**
3347 * ftrace_set_filter - set a function to filter on in ftrace 2929 * ftrace_set_filter - set a function to filter on in ftrace
3348 * @ops - the ops to set the filter with 2930 * @ops - the ops to set the filter with
@@ -3353,10 +2935,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3353 * Filters denote which functions should be enabled when tracing is enabled. 2935 * Filters denote which functions should be enabled when tracing is enabled.
3354 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 2936 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
3355 */ 2937 */
3356int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, 2938void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
3357 int len, int reset) 2939 int len, int reset)
3358{ 2940{
3359 return ftrace_set_regex(ops, buf, len, reset, 1); 2941 ftrace_set_regex(ops, buf, len, reset, 1);
3360} 2942}
3361EXPORT_SYMBOL_GPL(ftrace_set_filter); 2943EXPORT_SYMBOL_GPL(ftrace_set_filter);
3362 2944
@@ -3371,10 +2953,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
3371 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 2953 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
3372 * for tracing. 2954 * for tracing.
3373 */ 2955 */
3374int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, 2956void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3375 int len, int reset) 2957 int len, int reset)
3376{ 2958{
3377 return ftrace_set_regex(ops, buf, len, reset, 0); 2959 ftrace_set_regex(ops, buf, len, reset, 0);
3378} 2960}
3379EXPORT_SYMBOL_GPL(ftrace_set_notrace); 2961EXPORT_SYMBOL_GPL(ftrace_set_notrace);
3380/** 2962/**
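Both sides agree on what ftrace_set_filter() and ftrace_set_notrace() do; only the return type differs (int on the left, void on the right). A small sketch of populating both hashes before registering the ops, assuming the left-hand, int-returning variants; the names and patterns are illustrative:

static struct ftrace_ops my_ops;		/* illustrative */
static unsigned char filter_buf[] = "vfs_*";
static unsigned char notrace_buf[] = "vfs_fstat";

static int __init my_setup_filters(void)
{
	int ret;

	/* reset = 1 drops any previously installed filter entries */
	ret = ftrace_set_filter(&my_ops, filter_buf, sizeof(filter_buf) - 1, 1);
	if (ret)
		return ret;

	return ftrace_set_notrace(&my_ops, notrace_buf, sizeof(notrace_buf) - 1, 0);
}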
@@ -3459,8 +3041,8 @@ static void __init set_ftrace_early_graph(char *buf)
3459} 3041}
3460#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3042#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3461 3043
3462void __init 3044static void __init
3463ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) 3045set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3464{ 3046{
3465 char *func; 3047 char *func;
3466 3048
@@ -3473,16 +3055,17 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3473static void __init set_ftrace_early_filters(void) 3055static void __init set_ftrace_early_filters(void)
3474{ 3056{
3475 if (ftrace_filter_buf[0]) 3057 if (ftrace_filter_buf[0])
3476 ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); 3058 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
3477 if (ftrace_notrace_buf[0]) 3059 if (ftrace_notrace_buf[0])
3478 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); 3060 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
3479#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3061#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3480 if (ftrace_graph_buf[0]) 3062 if (ftrace_graph_buf[0])
3481 set_ftrace_early_graph(ftrace_graph_buf); 3063 set_ftrace_early_graph(ftrace_graph_buf);
3482#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3064#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3483} 3065}
3484 3066
3485int ftrace_regex_release(struct inode *inode, struct file *file) 3067static int
3068ftrace_regex_release(struct inode *inode, struct file *file)
3486{ 3069{
3487 struct seq_file *m = (struct seq_file *)file->private_data; 3070 struct seq_file *m = (struct seq_file *)file->private_data;
3488 struct ftrace_iterator *iter; 3071 struct ftrace_iterator *iter;
@@ -3683,6 +3266,9 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3683 3266
3684 do_for_each_ftrace_rec(pg, rec) { 3267 do_for_each_ftrace_rec(pg, rec) {
3685 3268
3269 if (rec->flags & FTRACE_FL_FREE)
3270 continue;
3271
3686 if (ftrace_match_record(rec, NULL, search, search_len, type)) { 3272 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
3687 /* if it is in the array */ 3273 /* if it is in the array */
3688 exists = false; 3274 exists = false;
@@ -3791,80 +3377,16 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3791 return 0; 3377 return 0;
3792} 3378}
3793 3379
3794static int ftrace_cmp_ips(const void *a, const void *b)
3795{
3796 const unsigned long *ipa = a;
3797 const unsigned long *ipb = b;
3798
3799 if (*ipa > *ipb)
3800 return 1;
3801 if (*ipa < *ipb)
3802 return -1;
3803 return 0;
3804}
3805
3806static void ftrace_swap_ips(void *a, void *b, int size)
3807{
3808 unsigned long *ipa = a;
3809 unsigned long *ipb = b;
3810 unsigned long t;
3811
3812 t = *ipa;
3813 *ipa = *ipb;
3814 *ipb = t;
3815}
3816
3817static int ftrace_process_locs(struct module *mod, 3380static int ftrace_process_locs(struct module *mod,
3818 unsigned long *start, 3381 unsigned long *start,
3819 unsigned long *end) 3382 unsigned long *end)
3820{ 3383{
3821 struct ftrace_page *start_pg;
3822 struct ftrace_page *pg;
3823 struct dyn_ftrace *rec;
3824 unsigned long count;
3825 unsigned long *p; 3384 unsigned long *p;
3826 unsigned long addr; 3385 unsigned long addr;
3827 unsigned long flags = 0; /* Shut up gcc */ 3386 unsigned long flags = 0; /* Shut up gcc */
3828 int ret = -ENOMEM;
3829
3830 count = end - start;
3831
3832 if (!count)
3833 return 0;
3834
3835 sort(start, count, sizeof(*start),
3836 ftrace_cmp_ips, ftrace_swap_ips);
3837
3838 start_pg = ftrace_allocate_pages(count);
3839 if (!start_pg)
3840 return -ENOMEM;
3841 3387
3842 mutex_lock(&ftrace_lock); 3388 mutex_lock(&ftrace_lock);
3843
3844 /*
3845 * Core and each module needs their own pages, as
3846 * modules will free them when they are removed.
3847 * Force a new page to be allocated for modules.
3848 */
3849 if (!mod) {
3850 WARN_ON(ftrace_pages || ftrace_pages_start);
3851 /* First initialization */
3852 ftrace_pages = ftrace_pages_start = start_pg;
3853 } else {
3854 if (!ftrace_pages)
3855 goto out;
3856
3857 if (WARN_ON(ftrace_pages->next)) {
3858 /* Hmm, we have free pages? */
3859 while (ftrace_pages->next)
3860 ftrace_pages = ftrace_pages->next;
3861 }
3862
3863 ftrace_pages->next = start_pg;
3864 }
3865
3866 p = start; 3389 p = start;
3867 pg = start_pg;
3868 while (p < end) { 3390 while (p < end) {
3869 addr = ftrace_call_adjust(*p++); 3391 addr = ftrace_call_adjust(*p++);
3870 /* 3392 /*
@@ -3875,27 +3397,9 @@ static int ftrace_process_locs(struct module *mod,
3875 */ 3397 */
3876 if (!addr) 3398 if (!addr)
3877 continue; 3399 continue;
3878 3400 ftrace_record_ip(addr);
3879 if (pg->index == pg->size) {
3880 /* We should have allocated enough */
3881 if (WARN_ON(!pg->next))
3882 break;
3883 pg = pg->next;
3884 }
3885
3886 rec = &pg->records[pg->index++];
3887 rec->ip = addr;
3888 } 3401 }
3889 3402
3890 /* We should have used all pages */
3891 WARN_ON(pg->next);
3892
3893 /* Assign the last page to ftrace_pages */
3894 ftrace_pages = pg;
3895
3896 /* These new locations need to be initialized */
3897 ftrace_new_pgs = start_pg;
3898
3899 /* 3403 /*
3900 * We only need to disable interrupts on start up 3404 * We only need to disable interrupts on start up
3901 * because we are modifying code that an interrupt 3405 * because we are modifying code that an interrupt
@@ -3909,55 +3413,32 @@ static int ftrace_process_locs(struct module *mod,
3909 ftrace_update_code(mod); 3413 ftrace_update_code(mod);
3910 if (!mod) 3414 if (!mod)
3911 local_irq_restore(flags); 3415 local_irq_restore(flags);
3912 ret = 0;
3913 out:
3914 mutex_unlock(&ftrace_lock); 3416 mutex_unlock(&ftrace_lock);
3915 3417
3916 return ret; 3418 return 0;
3917} 3419}
3918 3420
3919#ifdef CONFIG_MODULES 3421#ifdef CONFIG_MODULES
3920
3921#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
3922
3923void ftrace_release_mod(struct module *mod) 3422void ftrace_release_mod(struct module *mod)
3924{ 3423{
3925 struct dyn_ftrace *rec; 3424 struct dyn_ftrace *rec;
3926 struct ftrace_page **last_pg;
3927 struct ftrace_page *pg; 3425 struct ftrace_page *pg;
3928 int order;
3929 3426
3930 mutex_lock(&ftrace_lock); 3427 mutex_lock(&ftrace_lock);
3931 3428
3932 if (ftrace_disabled) 3429 if (ftrace_disabled)
3933 goto out_unlock; 3430 goto out_unlock;
3934 3431
3935 /* 3432 do_for_each_ftrace_rec(pg, rec) {
3936 * Each module has its own ftrace_pages, remove
3937 * them from the list.
3938 */
3939 last_pg = &ftrace_pages_start;
3940 for (pg = ftrace_pages_start; pg; pg = *last_pg) {
3941 rec = &pg->records[0];
3942 if (within_module_core(rec->ip, mod)) { 3433 if (within_module_core(rec->ip, mod)) {
3943 /* 3434 /*
3944 * As core pages are first, the first 3435 * rec->ip is changed in ftrace_free_rec()
3945 * page should never be a module page. 3436 * It should not be between s and e if the record was freed.
3946 */ 3437 */
3947 if (WARN_ON(pg == ftrace_pages_start)) 3438 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
3948 goto out_unlock; 3439 ftrace_free_rec(rec);
3949 3440 }
3950 /* Check if we are deleting the last page */ 3441 } while_for_each_ftrace_rec();
3951 if (pg == ftrace_pages)
3952 ftrace_pages = next_to_ftrace_page(last_pg);
3953
3954 *last_pg = pg->next;
3955 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
3956 free_pages((unsigned long)pg->records, order);
3957 kfree(pg);
3958 } else
3959 last_pg = &pg->next;
3960 }
3961 out_unlock: 3442 out_unlock:
3962 mutex_unlock(&ftrace_lock); 3443 mutex_unlock(&ftrace_lock);
3963} 3444}
@@ -4047,7 +3528,6 @@ void __init ftrace_init(void)
4047 3528
4048static struct ftrace_ops global_ops = { 3529static struct ftrace_ops global_ops = {
4049 .func = ftrace_stub, 3530 .func = ftrace_stub,
4050 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
4051}; 3531};
4052 3532
4053static int __init ftrace_nodyn_init(void) 3533static int __init ftrace_nodyn_init(void)
@@ -4055,7 +3535,7 @@ static int __init ftrace_nodyn_init(void)
4055 ftrace_enabled = 1; 3535 ftrace_enabled = 1;
4056 return 0; 3536 return 0;
4057} 3537}
4058core_initcall(ftrace_nodyn_init); 3538device_initcall(ftrace_nodyn_init);
4059 3539
4060static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3540static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4061static inline void ftrace_startup_enable(int command) { } 3541static inline void ftrace_startup_enable(int command) { }
@@ -4078,44 +3558,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
4078#endif /* CONFIG_DYNAMIC_FTRACE */ 3558#endif /* CONFIG_DYNAMIC_FTRACE */
4079 3559
4080static void 3560static void
4081ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, 3561ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
4082 struct ftrace_ops *op, struct pt_regs *regs)
4083{
4084 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
4085 return;
4086
4087 /*
4088 * Some of the ops may be dynamically allocated,
4089 * they must be freed after a synchronize_sched().
4090 */
4091 preempt_disable_notrace();
4092 trace_recursion_set(TRACE_CONTROL_BIT);
4093 op = rcu_dereference_raw(ftrace_control_list);
4094 while (op != &ftrace_list_end) {
4095 if (!ftrace_function_local_disabled(op) &&
4096 ftrace_ops_test(op, ip))
4097 op->func(ip, parent_ip, op, regs);
4098
4099 op = rcu_dereference_raw(op->next);
4100 };
4101 trace_recursion_clear(TRACE_CONTROL_BIT);
4102 preempt_enable_notrace();
4103}
4104
4105static struct ftrace_ops control_ops = {
4106 .func = ftrace_ops_control_func,
4107 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
4108};
4109
4110static inline void
4111__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4112 struct ftrace_ops *ignored, struct pt_regs *regs)
4113{ 3562{
4114 struct ftrace_ops *op; 3563 struct ftrace_ops *op;
4115 3564
4116 if (function_trace_stop)
4117 return;
4118
4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) 3565 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
4120 return; 3566 return;
4121 3567
@@ -4128,39 +3574,13 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4128 op = rcu_dereference_raw(ftrace_ops_list); 3574 op = rcu_dereference_raw(ftrace_ops_list);
4129 while (op != &ftrace_list_end) { 3575 while (op != &ftrace_list_end) {
4130 if (ftrace_ops_test(op, ip)) 3576 if (ftrace_ops_test(op, ip))
4131 op->func(ip, parent_ip, op, regs); 3577 op->func(ip, parent_ip);
4132 op = rcu_dereference_raw(op->next); 3578 op = rcu_dereference_raw(op->next);
4133 }; 3579 };
4134 preempt_enable_notrace(); 3580 preempt_enable_notrace();
4135 trace_recursion_clear(TRACE_INTERNAL_BIT); 3581 trace_recursion_clear(TRACE_INTERNAL_BIT);
4136} 3582}
4137 3583
4138/*
4139 * Some archs only support passing ip and parent_ip. Even though
4140 * the list function ignores the op parameter, we do not want any
4141 * C side effects, where a function is called without the caller
4142 * sending a third parameter.
4143 * Archs are to support both the regs and ftrace_ops at the same time.
4144 * If they support ftrace_ops, it is assumed they support regs.
4145 * If callbacks want to use regs, they must either check for regs
4146 * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS.
4147 * Note, ARCH_SUPPORTS_FTRACE_SAVE_REGS expects a full pt_regs to be saved.
4148 * An architecture can pass partial regs with ftrace_ops and still
4149 * set ARCH_SUPPORTS_FTRACE_OPS.
4150 */
4151#if ARCH_SUPPORTS_FTRACE_OPS
4152static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4153 struct ftrace_ops *op, struct pt_regs *regs)
4154{
4155 __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
4156}
4157#else
4158static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
4159{
4160 __ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
4161}
4162#endif
4163
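The comment removed above describes the two callback conventions: architectures with ARCH_SUPPORTS_FTRACE_OPS hand the callback (ip, parent_ip, op, regs), others only (ip, parent_ip), and regs may be NULL unless full register saving is supported. A minimal sketch of a callback written for the four-argument convention; my_callback is illustrative only:

static void notrace my_callback(unsigned long ip, unsigned long parent_ip,
				struct ftrace_ops *op, struct pt_regs *regs)
{
	/* regs may be NULL unless the arch saves a full pt_regs (see above) */
	trace_printk("hit %pS called from %pS\n",
		     (void *)ip, (void *)parent_ip);
}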
4164static void clear_ftrace_swapper(void) 3584static void clear_ftrace_swapper(void)
4165{ 3585{
4166 struct task_struct *p; 3586 struct task_struct *p;
@@ -4381,7 +3801,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
4381 if (strlen(tmp) == 0) 3801 if (strlen(tmp) == 0)
4382 return 1; 3802 return 1;
4383 3803
4384 ret = kstrtol(tmp, 10, &val); 3804 ret = strict_strtol(tmp, 10, &val);
4385 if (ret < 0) 3805 if (ret < 0)
4386 return ret; 3806 return ret;
4387 3807
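For context, this is the handler behind the set_ftrace_pid control file; the left-hand side only switches the string-to-long conversion from strict_strtol() to kstrtol(), which has the same calling convention (string, base, result pointer, 0 or negative errno returned). A minimal sketch of the conversion on its own:

static long my_parse_pid(const char *buf)
{
	long pid;
	int ret;

	/* kstrtol() and strict_strtol() both return 0 on success or a negative errno */
	ret = kstrtol(buf, 10, &pid);
	if (ret < 0)
		return ret;
	return pid;
}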
@@ -4441,14 +3861,6 @@ void ftrace_kill(void)
4441} 3861}
4442 3862
4443/** 3863/**
4444 * Test if ftrace is dead or not.
4445 */
4446int ftrace_is_dead(void)
4447{
4448 return ftrace_disabled;
4449}
4450
4451/**
4452 * register_ftrace_function - register a function for profiling 3864 * register_ftrace_function - register a function for profiling
4453 * @ops - ops structure that holds the function for profiling. 3865 * @ops - ops structure that holds the function for profiling.
4454 * 3866 *
@@ -4465,12 +3877,16 @@ int register_ftrace_function(struct ftrace_ops *ops)
4465 3877
4466 mutex_lock(&ftrace_lock); 3878 mutex_lock(&ftrace_lock);
4467 3879
3880 if (unlikely(ftrace_disabled))
3881 goto out_unlock;
3882
4468 ret = __register_ftrace_function(ops); 3883 ret = __register_ftrace_function(ops);
4469 if (!ret) 3884 if (!ret)
4470 ret = ftrace_startup(ops, 0); 3885 ret = ftrace_startup(ops, 0);
4471 3886
4472 mutex_unlock(&ftrace_lock);
4473 3887
3888 out_unlock:
3889 mutex_unlock(&ftrace_lock);
4474 return ret; 3890 return ret;
4475} 3891}
4476EXPORT_SYMBOL_GPL(register_ftrace_function); 3892EXPORT_SYMBOL_GPL(register_ftrace_function);
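register_ftrace_function() is the entry point both versions keep: it attaches the callback in @ops to every traced function allowed by the ops' filter hashes, and unregister_ftrace_function() detaches it. A hedged end-to-end sketch against the left-hand callback convention; all names here are illustrative:

static void notrace my_callback(unsigned long ip, unsigned long parent_ip,
				struct ftrace_ops *op, struct pt_regs *regs)
{
	trace_printk("hit %pS\n", (void *)ip);
}

static struct ftrace_ops my_ops = {
	.func = my_callback,
};

static int __init my_tracer_init(void)
{
	/* optionally narrow the scope first with ftrace_set_filter() */
	return register_ftrace_function(&my_ops);
}

static void __exit my_tracer_exit(void)
{
	unregister_ftrace_function(&my_ops);
}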
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ce8514feedc..731201bf4ac 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -23,8 +23,6 @@
23#include <asm/local.h> 23#include <asm/local.h>
24#include "trace.h" 24#include "trace.h"
25 25
26static void update_pages_handler(struct work_struct *work);
27
28/* 26/*
29 * The ring buffer header is special. We must manually up keep it. 27 * The ring buffer header is special. We must manually up keep it.
30 */ 28 */
@@ -156,12 +154,35 @@ enum {
156 154
157static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; 155static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
158 156
159/* Used for individual buffers (after the counter) */
160#define RB_BUFFER_OFF (1 << 20)
161
162#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) 157#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
163 158
164/** 159/**
160 * tracing_on - enable all tracing buffers
161 *
162 * This function enables all tracing buffers that may have been
163 * disabled with tracing_off.
164 */
165void tracing_on(void)
166{
167 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
168}
169EXPORT_SYMBOL_GPL(tracing_on);
170
171/**
172 * tracing_off - turn off all tracing buffers
173 *
174 * This function stops all tracing buffers from recording data.
175 * It does not disable any overhead the tracers themselves may
176 * be causing. This function simply causes all recording to
177 * the ring buffers to fail.
178 */
179void tracing_off(void)
180{
181 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
182}
183EXPORT_SYMBOL_GPL(tracing_off);
184
185/**
165 * tracing_off_permanent - permanently disable ring buffers 186 * tracing_off_permanent - permanently disable ring buffers
166 * 187 *
167 * This function, once called, will disable all ring buffers 188 * This function, once called, will disable all ring buffers
@@ -172,6 +193,15 @@ void tracing_off_permanent(void)
172 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); 193 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
173} 194}
174 195
196/**
197 * tracing_is_on - show state of ring buffers enabled
198 */
199int tracing_is_on(void)
200{
201 return ring_buffer_flags == RB_BUFFERS_ON;
202}
203EXPORT_SYMBOL_GPL(tracing_is_on);
204
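tracing_on(), tracing_off() and tracing_is_on(), re-added on the right-hand side, support flight-recorder style debugging: leave tracing running and freeze the ring buffers the instant the interesting condition fires. A minimal sketch of that pattern; my_dev_broken() is a hypothetical check standing in for whatever error condition matters:

static void my_check_and_freeze(void)
{
	if (unlikely(my_dev_broken())) {
		tracing_off();	/* stop all recording, keep the buffers for post-mortem */
		pr_info("tracing %s\n",
			tracing_is_on() ? "still enabled" : "frozen, buffers preserved");
	}
}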
175#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 205#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
176#define RB_ALIGNMENT 4U 206#define RB_ALIGNMENT 4U
177#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 207#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -448,10 +478,9 @@ struct ring_buffer_per_cpu {
448 int cpu; 478 int cpu;
449 atomic_t record_disabled; 479 atomic_t record_disabled;
450 struct ring_buffer *buffer; 480 struct ring_buffer *buffer;
451 raw_spinlock_t reader_lock; /* serialize readers */ 481 spinlock_t reader_lock; /* serialize readers */
452 arch_spinlock_t lock; 482 arch_spinlock_t lock;
453 struct lock_class_key lock_key; 483 struct lock_class_key lock_key;
454 unsigned int nr_pages;
455 struct list_head *pages; 484 struct list_head *pages;
456 struct buffer_page *head_page; /* read from head */ 485 struct buffer_page *head_page; /* read from head */
457 struct buffer_page *tail_page; /* write to tail */ 486 struct buffer_page *tail_page; /* write to tail */
@@ -459,29 +488,21 @@ struct ring_buffer_per_cpu {
459 struct buffer_page *reader_page; 488 struct buffer_page *reader_page;
460 unsigned long lost_events; 489 unsigned long lost_events;
461 unsigned long last_overrun; 490 unsigned long last_overrun;
462 local_t entries_bytes;
463 local_t entries;
464 local_t overrun;
465 local_t commit_overrun; 491 local_t commit_overrun;
466 local_t dropped_events; 492 local_t overrun;
493 local_t entries;
467 local_t committing; 494 local_t committing;
468 local_t commits; 495 local_t commits;
469 unsigned long read; 496 unsigned long read;
470 unsigned long read_bytes;
471 u64 write_stamp; 497 u64 write_stamp;
472 u64 read_stamp; 498 u64 read_stamp;
473 /* ring buffer pages to update, > 0 to add, < 0 to remove */
474 int nr_pages_to_update;
475 struct list_head new_pages; /* new pages to add */
476 struct work_struct update_pages_work;
477 struct completion update_done;
478}; 499};
479 500
480struct ring_buffer { 501struct ring_buffer {
502 unsigned pages;
481 unsigned flags; 503 unsigned flags;
482 int cpus; 504 int cpus;
483 atomic_t record_disabled; 505 atomic_t record_disabled;
484 atomic_t resize_disabled;
485 cpumask_var_t cpumask; 506 cpumask_var_t cpumask;
486 507
487 struct lock_class_key *reader_lock_key; 508 struct lock_class_key *reader_lock_key;
@@ -946,10 +967,6 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
946 struct list_head *head = cpu_buffer->pages; 967 struct list_head *head = cpu_buffer->pages;
947 struct buffer_page *bpage, *tmp; 968 struct buffer_page *bpage, *tmp;
948 969
949 /* Reset the head page if it exists */
950 if (cpu_buffer->head_page)
951 rb_set_head_page(cpu_buffer);
952
953 rb_head_page_deactivate(cpu_buffer); 970 rb_head_page_deactivate(cpu_buffer);
954 971
955 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 972 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
@@ -976,10 +993,14 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
976 return 0; 993 return 0;
977} 994}
978 995
979static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) 996static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 unsigned nr_pages)
980{ 998{
981 int i;
982 struct buffer_page *bpage, *tmp; 999 struct buffer_page *bpage, *tmp;
1000 LIST_HEAD(pages);
1001 unsigned i;
1002
1003 WARN_ON(!nr_pages);
983 1004
984 for (i = 0; i < nr_pages; i++) { 1005 for (i = 0; i < nr_pages; i++) {
985 struct page *page; 1006 struct page *page;
@@ -990,13 +1011,15 @@ static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
990 */ 1011 */
991 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1012 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
992 GFP_KERNEL | __GFP_NORETRY, 1013 GFP_KERNEL | __GFP_NORETRY,
993 cpu_to_node(cpu)); 1014 cpu_to_node(cpu_buffer->cpu));
994 if (!bpage) 1015 if (!bpage)
995 goto free_pages; 1016 goto free_pages;
996 1017
997 list_add(&bpage->list, pages); 1018 rb_check_bpage(cpu_buffer, bpage);
998 1019
999 page = alloc_pages_node(cpu_to_node(cpu), 1020 list_add(&bpage->list, &pages);
1021
1022 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
1000 GFP_KERNEL | __GFP_NORETRY, 0); 1023 GFP_KERNEL | __GFP_NORETRY, 0);
1001 if (!page) 1024 if (!page)
1002 goto free_pages; 1025 goto free_pages;
@@ -1004,27 +1027,6 @@ static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
1004 rb_init_page(bpage->page); 1027 rb_init_page(bpage->page);
1005 } 1028 }
1006 1029
1007 return 0;
1008
1009free_pages:
1010 list_for_each_entry_safe(bpage, tmp, pages, list) {
1011 list_del_init(&bpage->list);
1012 free_buffer_page(bpage);
1013 }
1014
1015 return -ENOMEM;
1016}
1017
1018static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1019 unsigned nr_pages)
1020{
1021 LIST_HEAD(pages);
1022
1023 WARN_ON(!nr_pages);
1024
1025 if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
1026 return -ENOMEM;
1027
1028 /* 1030 /*
1029 * The ring buffer page list is a circular list that does not 1031 * The ring buffer page list is a circular list that does not
1030 * start and end with a list head. All page list items point to 1032 * start and end with a list head. All page list items point to
@@ -1033,15 +1035,20 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1033 cpu_buffer->pages = pages.next; 1035 cpu_buffer->pages = pages.next;
1034 list_del(&pages); 1036 list_del(&pages);
1035 1037
1036 cpu_buffer->nr_pages = nr_pages;
1037
1038 rb_check_pages(cpu_buffer); 1038 rb_check_pages(cpu_buffer);
1039 1039
1040 return 0; 1040 return 0;
1041
1042 free_pages:
1043 list_for_each_entry_safe(bpage, tmp, &pages, list) {
1044 list_del_init(&bpage->list);
1045 free_buffer_page(bpage);
1046 }
1047 return -ENOMEM;
1041} 1048}
1042 1049
1043static struct ring_buffer_per_cpu * 1050static struct ring_buffer_per_cpu *
1044rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) 1051rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1045{ 1052{
1046 struct ring_buffer_per_cpu *cpu_buffer; 1053 struct ring_buffer_per_cpu *cpu_buffer;
1047 struct buffer_page *bpage; 1054 struct buffer_page *bpage;
@@ -1055,11 +1062,9 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1055 1062
1056 cpu_buffer->cpu = cpu; 1063 cpu_buffer->cpu = cpu;
1057 cpu_buffer->buffer = buffer; 1064 cpu_buffer->buffer = buffer;
1058 raw_spin_lock_init(&cpu_buffer->reader_lock); 1065 spin_lock_init(&cpu_buffer->reader_lock);
1059 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1066 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1060 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1067 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1061 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
1062 init_completion(&cpu_buffer->update_done);
1063 1068
1064 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1069 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1065 GFP_KERNEL, cpu_to_node(cpu)); 1070 GFP_KERNEL, cpu_to_node(cpu));
@@ -1076,9 +1081,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1076 rb_init_page(bpage->page); 1081 rb_init_page(bpage->page);
1077 1082
1078 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1083 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1079 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1080 1084
1081 ret = rb_allocate_pages(cpu_buffer, nr_pages); 1085 ret = rb_allocate_pages(cpu_buffer, buffer->pages);
1082 if (ret < 0) 1086 if (ret < 0)
1083 goto fail_free_reader; 1087 goto fail_free_reader;
1084 1088
@@ -1139,7 +1143,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1139{ 1143{
1140 struct ring_buffer *buffer; 1144 struct ring_buffer *buffer;
1141 int bsize; 1145 int bsize;
1142 int cpu, nr_pages; 1146 int cpu;
1143 1147
1144 /* keep it in its own cache line */ 1148 /* keep it in its own cache line */
1145 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 1149 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1150,14 +1154,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1150 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) 1154 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
1151 goto fail_free_buffer; 1155 goto fail_free_buffer;
1152 1156
1153 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1157 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1154 buffer->flags = flags; 1158 buffer->flags = flags;
1155 buffer->clock = trace_clock_local; 1159 buffer->clock = trace_clock_local;
1156 buffer->reader_lock_key = key; 1160 buffer->reader_lock_key = key;
1157 1161
1158 /* need at least two pages */ 1162 /* need at least two pages */
1159 if (nr_pages < 2) 1163 if (buffer->pages < 2)
1160 nr_pages = 2; 1164 buffer->pages = 2;
1161 1165
1162 /* 1166 /*
1163 * In case of non-hotplug cpu, if the ring-buffer is allocated 1167 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1180,7 +1184,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1180 1184
1181 for_each_buffer_cpu(buffer, cpu) { 1185 for_each_buffer_cpu(buffer, cpu) {
1182 buffer->buffers[cpu] = 1186 buffer->buffers[cpu] =
1183 rb_allocate_cpu_buffer(buffer, nr_pages, cpu); 1187 rb_allocate_cpu_buffer(buffer, cpu);
1184 if (!buffer->buffers[cpu]) 1188 if (!buffer->buffers[cpu])
1185 goto fail_free_buffers; 1189 goto fail_free_buffers;
1186 } 1190 }
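__ring_buffer_alloc() is normally reached through the ring_buffer_alloc(size, flags) wrapper, which supplies the lockdep key; both sides of this hunk keep that entry point. A hedged sketch of allocating, writing to and freeing a standalone buffer; the size and payload are illustrative:

static int my_rb_smoke_test(void)
{
	struct ring_buffer *rb;
	u64 payload = 42;

	rb = ring_buffer_alloc(1 << 20, RB_FL_OVERWRITE);	/* roughly 1MB per CPU, overwrite mode */
	if (!rb)
		return -ENOMEM;

	/* write one small event on the current CPU, then tear the buffer down */
	ring_buffer_write(rb, sizeof(payload), &payload);
	ring_buffer_free(rb);
	return 0;
}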
@@ -1248,223 +1252,58 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
1248 1252
1249static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 1253static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
1250 1254
1251static inline unsigned long rb_page_entries(struct buffer_page *bpage) 1255static void
1252{ 1256rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1253 return local_read(&bpage->entries) & RB_WRITE_MASK;
1254}
1255
1256static inline unsigned long rb_page_write(struct buffer_page *bpage)
1257{
1258 return local_read(&bpage->write) & RB_WRITE_MASK;
1259}
1260
1261static int
1262rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1263{ 1257{
1264 struct list_head *tail_page, *to_remove, *next_page; 1258 struct buffer_page *bpage;
1265 struct buffer_page *to_remove_page, *tmp_iter_page; 1259 struct list_head *p;
1266 struct buffer_page *last_page, *first_page; 1260 unsigned i;
1267 unsigned int nr_removed;
1268 unsigned long head_bit;
1269 int page_entries;
1270
1271 head_bit = 0;
1272
1273 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1274 atomic_inc(&cpu_buffer->record_disabled);
1275 /*
1276 * We don't race with the readers since we have acquired the reader
1277 * lock. We also don't race with writers after disabling recording.
1278 * This makes it easy to figure out the first and the last page to be
1279 * removed from the list. We unlink all the pages in between including
1280 * the first and last pages. This is done in a busy loop so that we
1281 * lose the least number of traces.
1282 * The pages are freed after we restart recording and unlock readers.
1283 */
1284 tail_page = &cpu_buffer->tail_page->list;
1285
1286 /*
1287 * tail page might be on reader page, we remove the next page
1288 * from the ring buffer
1289 */
1290 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
1291 tail_page = rb_list_head(tail_page->next);
1292 to_remove = tail_page;
1293 1261
1294 /* start of pages to remove */ 1262 spin_lock_irq(&cpu_buffer->reader_lock);
1295 first_page = list_entry(rb_list_head(to_remove->next), 1263 rb_head_page_deactivate(cpu_buffer);
1296 struct buffer_page, list);
1297 1264
1298 for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { 1265 for (i = 0; i < nr_pages; i++) {
1299 to_remove = rb_list_head(to_remove)->next; 1266 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1300 head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; 1267 goto out;
1268 p = cpu_buffer->pages->next;
1269 bpage = list_entry(p, struct buffer_page, list);
1270 list_del_init(&bpage->list);
1271 free_buffer_page(bpage);
1301 } 1272 }
1273 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1274 goto out;
1302 1275
1303 next_page = rb_list_head(to_remove)->next; 1276 rb_reset_cpu(cpu_buffer);
1304 1277 rb_check_pages(cpu_buffer);
1305 /*
1306 * Now we remove all pages between tail_page and next_page.
1307 * Make sure that we have head_bit value preserved for the
1308 * next page
1309 */
1310 tail_page->next = (struct list_head *)((unsigned long)next_page |
1311 head_bit);
1312 next_page = rb_list_head(next_page);
1313 next_page->prev = tail_page;
1314
1315 /* make sure pages points to a valid page in the ring buffer */
1316 cpu_buffer->pages = next_page;
1317
1318 /* update head page */
1319 if (head_bit)
1320 cpu_buffer->head_page = list_entry(next_page,
1321 struct buffer_page, list);
1322
1323 /*
1324 * change read pointer to make sure any read iterators reset
1325 * themselves
1326 */
1327 cpu_buffer->read = 0;
1328
1329 /* pages are removed, resume tracing and then free the pages */
1330 atomic_dec(&cpu_buffer->record_disabled);
1331 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1332
1333 RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
1334
1335 /* last buffer page to remove */
1336 last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
1337 list);
1338 tmp_iter_page = first_page;
1339
1340 do {
1341 to_remove_page = tmp_iter_page;
1342 rb_inc_page(cpu_buffer, &tmp_iter_page);
1343
1344 /* update the counters */
1345 page_entries = rb_page_entries(to_remove_page);
1346 if (page_entries) {
1347 /*
1348 * If something was added to this page, it was full
1349 * since it is not the tail page. So we deduct the
1350 * bytes consumed in ring buffer from here.
1351 * Increment overrun to account for the lost events.
1352 */
1353 local_add(page_entries, &cpu_buffer->overrun);
1354 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1355 }
1356
1357 /*
1358 * We have already removed references to this list item, just
1359 * free up the buffer_page and its page
1360 */
1361 free_buffer_page(to_remove_page);
1362 nr_removed--;
1363
1364 } while (to_remove_page != last_page);
1365
1366 RB_WARN_ON(cpu_buffer, nr_removed);
1367 1278
1368 return nr_removed == 0; 1279out:
1280 spin_unlock_irq(&cpu_buffer->reader_lock);
1369} 1281}
1370 1282
1371static int 1283static void
1372rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) 1284rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1285 struct list_head *pages, unsigned nr_pages)
1373{ 1286{
1374 struct list_head *pages = &cpu_buffer->new_pages; 1287 struct buffer_page *bpage;
1375 int retries, success; 1288 struct list_head *p;
1376 1289 unsigned i;
1377 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1378 /*
1379 * We are holding the reader lock, so the reader page won't be swapped
1380 * in the ring buffer. Now we are racing with the writer trying to
1381 * move head page and the tail page.
1382 * We are going to adapt the reader page update process where:
1383 * 1. We first splice the start and end of list of new pages between
1384 * the head page and its previous page.
1385 * 2. We cmpxchg the prev_page->next to point from head page to the
1386 * start of new pages list.
1387 * 3. Finally, we update the head->prev to the end of new list.
1388 *
1389 * We will try this process 10 times, to make sure that we don't keep
1390 * spinning.
1391 */
1392 retries = 10;
1393 success = 0;
1394 while (retries--) {
1395 struct list_head *head_page, *prev_page, *r;
1396 struct list_head *last_page, *first_page;
1397 struct list_head *head_page_with_bit;
1398
1399 head_page = &rb_set_head_page(cpu_buffer)->list;
1400 if (!head_page)
1401 break;
1402 prev_page = head_page->prev;
1403
1404 first_page = pages->next;
1405 last_page = pages->prev;
1406
1407 head_page_with_bit = (struct list_head *)
1408 ((unsigned long)head_page | RB_PAGE_HEAD);
1409
1410 last_page->next = head_page_with_bit;
1411 first_page->prev = prev_page;
1412
1413 r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
1414 1290
1415 if (r == head_page_with_bit) { 1291 spin_lock_irq(&cpu_buffer->reader_lock);
1416 /* 1292 rb_head_page_deactivate(cpu_buffer);
1417 * yay, we replaced the page pointer to our new list,
1418 * now, we just have to update to head page's prev
1419 * pointer to point to end of list
1420 */
1421 head_page->prev = last_page;
1422 success = 1;
1423 break;
1424 }
1425 }
1426 1293
1427 if (success) 1294 for (i = 0; i < nr_pages; i++) {
1428 INIT_LIST_HEAD(pages); 1295 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1429 /* 1296 goto out;
1430 * If we weren't successful in adding in new pages, warn and stop 1297 p = pages->next;
1431 * tracing 1298 bpage = list_entry(p, struct buffer_page, list);
1432 */ 1299 list_del_init(&bpage->list);
1433 RB_WARN_ON(cpu_buffer, !success); 1300 list_add_tail(&bpage->list, cpu_buffer->pages);
1434 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1435
1436 /* free pages if they weren't inserted */
1437 if (!success) {
1438 struct buffer_page *bpage, *tmp;
1439 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
1440 list) {
1441 list_del_init(&bpage->list);
1442 free_buffer_page(bpage);
1443 }
1444 } 1301 }
1445 return success; 1302 rb_reset_cpu(cpu_buffer);
1446} 1303 rb_check_pages(cpu_buffer);
1447
1448static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
1449{
1450 int success;
1451
1452 if (cpu_buffer->nr_pages_to_update > 0)
1453 success = rb_insert_pages(cpu_buffer);
1454 else
1455 success = rb_remove_pages(cpu_buffer,
1456 -cpu_buffer->nr_pages_to_update);
1457
1458 if (success)
1459 cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
1460}
1461 1304
1462static void update_pages_handler(struct work_struct *work) 1305out:
1463{ 1306 spin_unlock_irq(&cpu_buffer->reader_lock);
1464 struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
1465 struct ring_buffer_per_cpu, update_pages_work);
1466 rb_update_pages(cpu_buffer);
1467 complete(&cpu_buffer->update_done);
1468} 1307}
1469 1308
1470/** 1309/**
@@ -1474,14 +1313,16 @@ static void update_pages_handler(struct work_struct *work)
1474 * 1313 *
1475 * Minimum size is 2 * BUF_PAGE_SIZE. 1314 * Minimum size is 2 * BUF_PAGE_SIZE.
1476 * 1315 *
1477 * Returns 0 on success and < 0 on failure. 1316 * Returns -1 on failure.
1478 */ 1317 */
1479int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, 1318int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1480 int cpu_id)
1481{ 1319{
1482 struct ring_buffer_per_cpu *cpu_buffer; 1320 struct ring_buffer_per_cpu *cpu_buffer;
1483 unsigned nr_pages; 1321 unsigned nr_pages, rm_pages, new_pages;
1484 int cpu, err = 0; 1322 struct buffer_page *bpage, *tmp;
1323 unsigned long buffer_size;
1324 LIST_HEAD(pages);
1325 int i, cpu;
1485 1326
1486 /* 1327 /*
1487 * Always succeed at resizing a non-existent buffer: 1328 * Always succeed at resizing a non-existent buffer:
@@ -1489,165 +1330,115 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1489 if (!buffer) 1330 if (!buffer)
1490 return size; 1331 return size;
1491 1332
1492 /* Make sure the requested buffer exists */
1493 if (cpu_id != RING_BUFFER_ALL_CPUS &&
1494 !cpumask_test_cpu(cpu_id, buffer->cpumask))
1495 return size;
1496
1497 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1333 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1498 size *= BUF_PAGE_SIZE; 1334 size *= BUF_PAGE_SIZE;
1335 buffer_size = buffer->pages * BUF_PAGE_SIZE;
1499 1336
1500 /* we need a minimum of two pages */ 1337 /* we need a minimum of two pages */
1501 if (size < BUF_PAGE_SIZE * 2) 1338 if (size < BUF_PAGE_SIZE * 2)
1502 size = BUF_PAGE_SIZE * 2; 1339 size = BUF_PAGE_SIZE * 2;
1503 1340
1504 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1341 if (size == buffer_size)
1342 return size;
1505 1343
1506 /* 1344 atomic_inc(&buffer->record_disabled);
1507 * Don't succeed if resizing is disabled, as a reader might be 1345
1508 * manipulating the ring buffer and is expecting a sane state while 1346 /* Make sure all writers are done with this buffer. */
1509 * this is true. 1347 synchronize_sched();
1510 */
1511 if (atomic_read(&buffer->resize_disabled))
1512 return -EBUSY;
1513 1348
1514 /* prevent another thread from changing buffer sizes */
1515 mutex_lock(&buffer->mutex); 1349 mutex_lock(&buffer->mutex);
1350 get_online_cpus();
1516 1351
1517 if (cpu_id == RING_BUFFER_ALL_CPUS) { 1352 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1518 /* calculate the pages to update */
1519 for_each_buffer_cpu(buffer, cpu) {
1520 cpu_buffer = buffer->buffers[cpu];
1521 1353
1522 cpu_buffer->nr_pages_to_update = nr_pages - 1354 if (size < buffer_size) {
1523 cpu_buffer->nr_pages;
1524 /*
1525 * nothing more to do for removing pages or no update
1526 */
1527 if (cpu_buffer->nr_pages_to_update <= 0)
1528 continue;
1529 /*
1530 * to add pages, make sure all new pages can be
1531 * allocated without receiving ENOMEM
1532 */
1533 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1534 if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
1535 &cpu_buffer->new_pages, cpu)) {
1536 /* not enough memory for new pages */
1537 err = -ENOMEM;
1538 goto out_err;
1539 }
1540 }
1541 1355
1542 get_online_cpus(); 1356 /* easy case, just free pages */
1543 /* 1357 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
1544 * Fire off all the required work handlers 1358 goto out_fail;
1545 * We can't schedule on offline CPUs, but it's not necessary 1359
1546 * since we can change their buffer sizes without any race. 1360 rm_pages = buffer->pages - nr_pages;
1547 */
1548 for_each_buffer_cpu(buffer, cpu) {
1549 cpu_buffer = buffer->buffers[cpu];
1550 if (!cpu_buffer->nr_pages_to_update)
1551 continue;
1552
1553 if (cpu_online(cpu))
1554 schedule_work_on(cpu,
1555 &cpu_buffer->update_pages_work);
1556 else
1557 rb_update_pages(cpu_buffer);
1558 }
1559 1361
1560 /* wait for all the updates to complete */
1561 for_each_buffer_cpu(buffer, cpu) { 1362 for_each_buffer_cpu(buffer, cpu) {
1562 cpu_buffer = buffer->buffers[cpu]; 1363 cpu_buffer = buffer->buffers[cpu];
1563 if (!cpu_buffer->nr_pages_to_update) 1364 rb_remove_pages(cpu_buffer, rm_pages);
1564 continue;
1565
1566 if (cpu_online(cpu))
1567 wait_for_completion(&cpu_buffer->update_done);
1568 cpu_buffer->nr_pages_to_update = 0;
1569 } 1365 }
1366 goto out;
1367 }
1570 1368
1571 put_online_cpus(); 1369 /*
1572 } else { 1370 * This is a bit more difficult. We only want to add pages
1573 /* Make sure this CPU has been initialized */ 1371 * when we can allocate enough for all CPUs. We do this
1574 if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) 1372 * by allocating all the pages and storing them on a local
1575 goto out; 1373 * linked list. If we succeed in our allocation, then we
1576 1374 * add these pages to the cpu_buffers. Otherwise we just free
1577 cpu_buffer = buffer->buffers[cpu_id]; 1375 * them all and return -ENOMEM;
1578 1376 */
1579 if (nr_pages == cpu_buffer->nr_pages) 1377 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
1580 goto out; 1378 goto out_fail;
1581 1379
1582 cpu_buffer->nr_pages_to_update = nr_pages - 1380 new_pages = nr_pages - buffer->pages;
1583 cpu_buffer->nr_pages;
1584 1381
1585 INIT_LIST_HEAD(&cpu_buffer->new_pages); 1382 for_each_buffer_cpu(buffer, cpu) {
1586 if (cpu_buffer->nr_pages_to_update > 0 && 1383 for (i = 0; i < new_pages; i++) {
1587 __rb_allocate_pages(cpu_buffer->nr_pages_to_update, 1384 struct page *page;
1588 &cpu_buffer->new_pages, cpu_id)) { 1385 /*
1589 err = -ENOMEM; 1386 * __GFP_NORETRY flag makes sure that the allocation
1590 goto out_err; 1387 * fails gracefully without invoking oom-killer and
1388 * the system is not destabilized.
1389 */
1390 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
1391 cache_line_size()),
1392 GFP_KERNEL | __GFP_NORETRY,
1393 cpu_to_node(cpu));
1394 if (!bpage)
1395 goto free_pages;
1396 list_add(&bpage->list, &pages);
1397 page = alloc_pages_node(cpu_to_node(cpu),
1398 GFP_KERNEL | __GFP_NORETRY, 0);
1399 if (!page)
1400 goto free_pages;
1401 bpage->page = page_address(page);
1402 rb_init_page(bpage->page);
1591 } 1403 }
1592
1593 get_online_cpus();
1594
1595 if (cpu_online(cpu_id)) {
1596 schedule_work_on(cpu_id,
1597 &cpu_buffer->update_pages_work);
1598 wait_for_completion(&cpu_buffer->update_done);
1599 } else
1600 rb_update_pages(cpu_buffer);
1601
1602 cpu_buffer->nr_pages_to_update = 0;
1603 put_online_cpus();
1604 } 1404 }
1605 1405
1606 out: 1406 for_each_buffer_cpu(buffer, cpu) {
1607 /* 1407 cpu_buffer = buffer->buffers[cpu];
1608 * The ring buffer resize can happen with the ring buffer 1408 rb_insert_pages(cpu_buffer, &pages, new_pages);
1609 * enabled, so that the update disturbs the tracing as little
1610 * as possible. But if the buffer is disabled, we do not need
1611 * to worry about that, and we can take the time to verify
1612 * that the buffer is not corrupt.
1613 */
1614 if (atomic_read(&buffer->record_disabled)) {
1615 atomic_inc(&buffer->record_disabled);
1616 /*
1617 * Even though the buffer was disabled, we must make sure
1618 * that it is truly disabled before calling rb_check_pages.
1619 * There could have been a race between checking
1620 * record_disable and incrementing it.
1621 */
1622 synchronize_sched();
1623 for_each_buffer_cpu(buffer, cpu) {
1624 cpu_buffer = buffer->buffers[cpu];
1625 rb_check_pages(cpu_buffer);
1626 }
1627 atomic_dec(&buffer->record_disabled);
1628 } 1409 }
1629 1410
1630 mutex_unlock(&buffer->mutex); 1411 if (RB_WARN_ON(buffer, !list_empty(&pages)))
1631 return size; 1412 goto out_fail;
1632 1413
1633 out_err: 1414 out:
1634 for_each_buffer_cpu(buffer, cpu) { 1415 buffer->pages = nr_pages;
1635 struct buffer_page *bpage, *tmp; 1416 put_online_cpus();
1417 mutex_unlock(&buffer->mutex);
1636 1418
1637 cpu_buffer = buffer->buffers[cpu]; 1419 atomic_dec(&buffer->record_disabled);
1638 cpu_buffer->nr_pages_to_update = 0;
1639 1420
1640 if (list_empty(&cpu_buffer->new_pages)) 1421 return size;
1641 continue;
1642 1422
1643 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, 1423 free_pages:
1644 list) { 1424 list_for_each_entry_safe(bpage, tmp, &pages, list) {
1645 list_del_init(&bpage->list); 1425 list_del_init(&bpage->list);
1646 free_buffer_page(bpage); 1426 free_buffer_page(bpage);
1647 }
1648 } 1427 }
1428 put_online_cpus();
1649 mutex_unlock(&buffer->mutex); 1429 mutex_unlock(&buffer->mutex);
1650 return err; 1430 atomic_dec(&buffer->record_disabled);
1431 return -ENOMEM;
1432
1433 /*
1434 * Something went totally wrong, and we are too paranoid
1435 * to even clean up the mess.
1436 */
1437 out_fail:
1438 put_online_cpus();
1439 mutex_unlock(&buffer->mutex);
1440 atomic_dec(&buffer->record_disabled);
1441 return -1;
1651} 1442}
1652EXPORT_SYMBOL_GPL(ring_buffer_resize); 1443EXPORT_SYMBOL_GPL(ring_buffer_resize);
1653 1444
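Note that the two versions of ring_buffer_resize() in this hunk differ in signature: the left-hand one takes a cpu_id (or RING_BUFFER_ALL_CPUS) and reports failure with a negative errno, while the right-hand one always resizes every CPU and returns -1 or -ENOMEM on failure. A hedged sketch against the left-hand form, assuming rb was allocated earlier as above:

static int my_grow_buffer(struct ring_buffer *rb)
{
	/* grow every per-cpu buffer to roughly 4MB; left-hand signature */
	int err = ring_buffer_resize(rb, 4 << 20, RING_BUFFER_ALL_CPUS);

	if (err < 0)
		pr_warn("ring buffer resize failed: %d\n", err);
	return err;
}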
@@ -1686,11 +1477,21 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
1686 return __rb_page_index(iter->head_page, iter->head); 1477 return __rb_page_index(iter->head_page, iter->head);
1687} 1478}
1688 1479
1480static inline unsigned long rb_page_write(struct buffer_page *bpage)
1481{
1482 return local_read(&bpage->write) & RB_WRITE_MASK;
1483}
1484
1689static inline unsigned rb_page_commit(struct buffer_page *bpage) 1485static inline unsigned rb_page_commit(struct buffer_page *bpage)
1690{ 1486{
1691 return local_read(&bpage->page->commit); 1487 return local_read(&bpage->page->commit);
1692} 1488}
1693 1489
1490static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1491{
1492 return local_read(&bpage->entries) & RB_WRITE_MASK;
1493}
1494
1694/* Size is determined by what has been committed */ 1495/* Size is determined by what has been committed */
1695static inline unsigned rb_page_size(struct buffer_page *bpage) 1496static inline unsigned rb_page_size(struct buffer_page *bpage)
1696{ 1497{
@@ -1739,7 +1540,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1739 * assign the commit to the tail. 1540 * assign the commit to the tail.
1740 */ 1541 */
1741 again: 1542 again:
1742 max_count = cpu_buffer->nr_pages * 100; 1543 max_count = cpu_buffer->buffer->pages * 100;
1743 1544
1744 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1545 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1745 if (RB_WARN_ON(cpu_buffer, !(--max_count))) 1546 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
@@ -1823,7 +1624,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1823} 1624}
1824 1625
1825/** 1626/**
1826 * rb_update_event - update event type and data 1627 * ring_buffer_update_event - update event type and data
 1827 * @event: the event to update 1628 * @event: the event to update
1828 * @type: the type of event 1629 * @type: the type of event
1829 * @length: the size of the event field in the ring buffer 1630 * @length: the size of the event field in the ring buffer
@@ -1907,7 +1708,6 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1907 * the counters. 1708 * the counters.
1908 */ 1709 */
1909 local_add(entries, &cpu_buffer->overrun); 1710 local_add(entries, &cpu_buffer->overrun);
1910 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1911 1711
1912 /* 1712 /*
1913 * The entries will be zeroed out when we move the 1713 * The entries will be zeroed out when we move the
@@ -2063,9 +1863,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
2063 event = __rb_page_index(tail_page, tail); 1863 event = __rb_page_index(tail_page, tail);
2064 kmemcheck_annotate_bitfield(event, bitfield); 1864 kmemcheck_annotate_bitfield(event, bitfield);
2065 1865
2066 /* account for padding bytes */
2067 local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
2068
2069 /* 1866 /*
2070 * Save the original length to the meta data. 1867 * Save the original length to the meta data.
2071 * This will be used by the reader to add lost event 1868 * This will be used by the reader to add lost event
@@ -2158,10 +1955,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2158 * If we are not in overwrite mode, 1955 * If we are not in overwrite mode,
2159 * this is easy, just stop here. 1956 * this is easy, just stop here.
2160 */ 1957 */
2161 if (!(buffer->flags & RB_FL_OVERWRITE)) { 1958 if (!(buffer->flags & RB_FL_OVERWRITE))
2162 local_inc(&cpu_buffer->dropped_events);
2163 goto out_reset; 1959 goto out_reset;
2164 }
2165 1960
2166 ret = rb_handle_head_page(cpu_buffer, 1961 ret = rb_handle_head_page(cpu_buffer,
2167 tail_page, 1962 tail_page,
@@ -2259,9 +2054,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2259 if (!tail) 2054 if (!tail)
2260 tail_page->page->time_stamp = ts; 2055 tail_page->page->time_stamp = ts;
2261 2056
2262 /* account for these added bytes */
2263 local_add(length, &cpu_buffer->entries_bytes);
2264
2265 return event; 2057 return event;
2266} 2058}
2267 2059
@@ -2284,7 +2076,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2284 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 2076 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
2285 unsigned long write_mask = 2077 unsigned long write_mask =
2286 local_read(&bpage->write) & ~RB_WRITE_MASK; 2078 local_read(&bpage->write) & ~RB_WRITE_MASK;
2287 unsigned long event_length = rb_event_length(event);
2288 /* 2079 /*
2289 * This is on the tail page. It is possible that 2080 * This is on the tail page. It is possible that
2290 * a write could come in and move the tail page 2081 * a write could come in and move the tail page
@@ -2294,11 +2085,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2294 old_index += write_mask; 2085 old_index += write_mask;
2295 new_index += write_mask; 2086 new_index += write_mask;
2296 index = local_cmpxchg(&bpage->write, old_index, new_index); 2087 index = local_cmpxchg(&bpage->write, old_index, new_index);
2297 if (index == old_index) { 2088 if (index == old_index)
2298 /* update counters */
2299 local_sub(event_length, &cpu_buffer->entries_bytes);
2300 return 1; 2089 return 1;
2301 }
2302 } 2090 }
2303 2091
2304 /* could not discard */ 2092 /* could not discard */
@@ -2725,8 +2513,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
2725 * and not the length of the event which would hold the header. 2513 * and not the length of the event which would hold the header.
2726 */ 2514 */
2727int ring_buffer_write(struct ring_buffer *buffer, 2515int ring_buffer_write(struct ring_buffer *buffer,
2728 unsigned long length, 2516 unsigned long length,
2729 void *data) 2517 void *data)
2730{ 2518{
2731 struct ring_buffer_per_cpu *cpu_buffer; 2519 struct ring_buffer_per_cpu *cpu_buffer;
2732 struct ring_buffer_event *event; 2520 struct ring_buffer_event *event;
@@ -2818,63 +2606,6 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)
2818EXPORT_SYMBOL_GPL(ring_buffer_record_enable); 2606EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
2819 2607
2820/** 2608/**
2821 * ring_buffer_record_off - stop all writes into the buffer
2822 * @buffer: The ring buffer to stop writes to.
2823 *
2824 * This prevents all writes to the buffer. Any attempt to write
2825 * to the buffer after this will fail and return NULL.
2826 *
2827 * This is different than ring_buffer_record_disable() as
 2828 * it works like an on/off switch, whereas the disable() version
 2829 * must be paired with an enable().
2830 */
2831void ring_buffer_record_off(struct ring_buffer *buffer)
2832{
2833 unsigned int rd;
2834 unsigned int new_rd;
2835
2836 do {
2837 rd = atomic_read(&buffer->record_disabled);
2838 new_rd = rd | RB_BUFFER_OFF;
2839 } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
2840}
2841EXPORT_SYMBOL_GPL(ring_buffer_record_off);
2842
2843/**
2844 * ring_buffer_record_on - restart writes into the buffer
2845 * @buffer: The ring buffer to start writes to.
2846 *
2847 * This enables all writes to the buffer that was disabled by
2848 * ring_buffer_record_off().
2849 *
2850 * This is different than ring_buffer_record_enable() as
 2851 * it works like an on/off switch, whereas the enable() version
2852 * must be paired with a disable().
2853 */
2854void ring_buffer_record_on(struct ring_buffer *buffer)
2855{
2856 unsigned int rd;
2857 unsigned int new_rd;
2858
2859 do {
2860 rd = atomic_read(&buffer->record_disabled);
2861 new_rd = rd & ~RB_BUFFER_OFF;
2862 } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
2863}
2864EXPORT_SYMBOL_GPL(ring_buffer_record_on);
2865
2866/**
2867 * ring_buffer_record_is_on - return true if the ring buffer can write
2868 * @buffer: The ring buffer to see if write is enabled
2869 *
2870 * Returns true if the ring buffer is in a state that it accepts writes.
2871 */
2872int ring_buffer_record_is_on(struct ring_buffer *buffer)
2873{
2874 return !atomic_read(&buffer->record_disabled);
2875}
2876
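The three kernel-doc blocks above describe a switch built from an atomic cmpxchg loop on the RB_BUFFER_OFF bit of record_disabled, as opposed to the reference-counted disable()/enable() pair. A hedged usage sketch; the caller and ordering are illustrative, only the five ring_buffer_record_* functions are real:

/*
 * Sketch: _off/_on is a plain switch (idempotent, unpaired), while
 * _disable/_enable nest and must balance exactly.
 */
static void example_toggle_writes(struct ring_buffer *buffer)
{
	/* counted: every disable needs a matching enable */
	ring_buffer_record_disable(buffer);
	/* ... inspect or verify the buffer here ... */
	ring_buffer_record_enable(buffer);

	/* switch: calling _off twice is fine, a single _on flips it back */
	ring_buffer_record_off(buffer);
	if (!ring_buffer_record_is_on(buffer))
		ring_buffer_record_on(buffer);
}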
2877/**
2878 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer 2609 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
2879 * @buffer: The ring buffer to stop writes to. 2610 * @buffer: The ring buffer to stop writes to.
2880 * @cpu: The CPU buffer to stop 2611 * @cpu: The CPU buffer to stop
@@ -2930,59 +2661,6 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2930} 2661}
2931 2662
2932/** 2663/**
2933 * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
2934 * @buffer: The ring buffer
2935 * @cpu: The per CPU buffer to read from.
2936 */
2937u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2938{
2939 unsigned long flags;
2940 struct ring_buffer_per_cpu *cpu_buffer;
2941 struct buffer_page *bpage;
2942 u64 ret = 0;
2943
2944 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2945 return 0;
2946
2947 cpu_buffer = buffer->buffers[cpu];
2948 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2949 /*
2950 * if the tail is on reader_page, oldest time stamp is on the reader
2951 * page
2952 */
2953 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
2954 bpage = cpu_buffer->reader_page;
2955 else
2956 bpage = rb_set_head_page(cpu_buffer);
2957 if (bpage)
2958 ret = bpage->page->time_stamp;
2959 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2960
2961 return ret;
2962}
2963EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
2964
2965/**
2966 * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
2967 * @buffer: The ring buffer
2968 * @cpu: The per CPU buffer to read from.
2969 */
2970unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
2971{
2972 struct ring_buffer_per_cpu *cpu_buffer;
2973 unsigned long ret;
2974
2975 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2976 return 0;
2977
2978 cpu_buffer = buffer->buffers[cpu];
2979 ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
2980
2981 return ret;
2982}
2983EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
2984
2985/**
2986 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2664 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2987 * @buffer: The ring buffer 2665 * @buffer: The ring buffer
2988 * @cpu: The per CPU buffer to get the entries from. 2666 * @cpu: The per CPU buffer to get the entries from.
@@ -3001,8 +2679,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
3001EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2679EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
3002 2680
3003/** 2681/**
3004 * ring_buffer_overrun_cpu - get the number of overruns caused by the ring 2682 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
3005 * buffer wrapping around (only if RB_FL_OVERWRITE is on).
3006 * @buffer: The ring buffer 2683 * @buffer: The ring buffer
3007 * @cpu: The per CPU buffer to get the number of overruns from 2684 * @cpu: The per CPU buffer to get the number of overruns from
3008 */ 2685 */
@@ -3022,9 +2699,7 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
3022EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2699EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
3023 2700
3024/** 2701/**
3025 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by 2702 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
3026 * commits failing due to the buffer wrapping around while there are uncommitted
3027 * events, such as during an interrupt storm.
3028 * @buffer: The ring buffer 2703 * @buffer: The ring buffer
3029 * @cpu: The per CPU buffer to get the number of overruns from 2704 * @cpu: The per CPU buffer to get the number of overruns from
3030 */ 2705 */
@@ -3045,28 +2720,6 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
3045EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); 2720EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
3046 2721
3047/** 2722/**
3048 * ring_buffer_dropped_events_cpu - get the number of dropped events caused by
3049 * the ring buffer filling up (only if RB_FL_OVERWRITE is off).
3050 * @buffer: The ring buffer
3051 * @cpu: The per CPU buffer to get the number of overruns from
3052 */
3053unsigned long
3054ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
3055{
3056 struct ring_buffer_per_cpu *cpu_buffer;
3057 unsigned long ret;
3058
3059 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3060 return 0;
3061
3062 cpu_buffer = buffer->buffers[cpu];
3063 ret = local_read(&cpu_buffer->dropped_events);
3064
3065 return ret;
3066}
3067EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
3068
3069/**
3070 * ring_buffer_entries - get the number of entries in a buffer 2723 * ring_buffer_entries - get the number of entries in a buffer
3071 * @buffer: The ring buffer 2724 * @buffer: The ring buffer
3072 * 2725 *
@@ -3151,9 +2804,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
3151 2804
3152 cpu_buffer = iter->cpu_buffer; 2805 cpu_buffer = iter->cpu_buffer;
3153 2806
3154 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2807 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3155 rb_iter_reset(iter); 2808 rb_iter_reset(iter);
3156 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2809 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3157} 2810}
3158EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); 2811EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
3159 2812
@@ -3274,10 +2927,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3274 if (cpu_buffer->commit_page == cpu_buffer->reader_page) 2927 if (cpu_buffer->commit_page == cpu_buffer->reader_page)
3275 goto out; 2928 goto out;
3276 2929
3277 /* Don't bother swapping if the ring buffer is empty */
3278 if (rb_num_of_entries(cpu_buffer) == 0)
3279 goto out;
3280
3281 /* 2930 /*
3282 * Reset the reader page to size zero. 2931 * Reset the reader page to size zero.
3283 */ 2932 */
@@ -3291,8 +2940,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3291 * Splice the empty reader page into the list around the head. 2940 * Splice the empty reader page into the list around the head.
3292 */ 2941 */
3293 reader = rb_set_head_page(cpu_buffer); 2942 reader = rb_set_head_page(cpu_buffer);
3294 if (!reader)
3295 goto out;
3296 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); 2943 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
3297 cpu_buffer->reader_page->list.prev = reader->list.prev; 2944 cpu_buffer->reader_page->list.prev = reader->list.prev;
3298 2945
@@ -3618,12 +3265,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3618 again: 3265 again:
3619 local_irq_save(flags); 3266 local_irq_save(flags);
3620 if (dolock) 3267 if (dolock)
3621 raw_spin_lock(&cpu_buffer->reader_lock); 3268 spin_lock(&cpu_buffer->reader_lock);
3622 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3269 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3623 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3270 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3624 rb_advance_reader(cpu_buffer); 3271 rb_advance_reader(cpu_buffer);
3625 if (dolock) 3272 if (dolock)
3626 raw_spin_unlock(&cpu_buffer->reader_lock); 3273 spin_unlock(&cpu_buffer->reader_lock);
3627 local_irq_restore(flags); 3274 local_irq_restore(flags);
3628 3275
3629 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3276 if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3648,9 +3295,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3648 unsigned long flags; 3295 unsigned long flags;
3649 3296
3650 again: 3297 again:
3651 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3298 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3652 event = rb_iter_peek(iter, ts); 3299 event = rb_iter_peek(iter, ts);
3653 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3300 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3654 3301
3655 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3302 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3656 goto again; 3303 goto again;
@@ -3690,7 +3337,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3690 cpu_buffer = buffer->buffers[cpu]; 3337 cpu_buffer = buffer->buffers[cpu];
3691 local_irq_save(flags); 3338 local_irq_save(flags);
3692 if (dolock) 3339 if (dolock)
3693 raw_spin_lock(&cpu_buffer->reader_lock); 3340 spin_lock(&cpu_buffer->reader_lock);
3694 3341
3695 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3342 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3696 if (event) { 3343 if (event) {
@@ -3699,7 +3346,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3699 } 3346 }
3700 3347
3701 if (dolock) 3348 if (dolock)
3702 raw_spin_unlock(&cpu_buffer->reader_lock); 3349 spin_unlock(&cpu_buffer->reader_lock);
3703 local_irq_restore(flags); 3350 local_irq_restore(flags);
3704 3351
3705 out: 3352 out:
@@ -3749,7 +3396,6 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3749 3396
3750 iter->cpu_buffer = cpu_buffer; 3397 iter->cpu_buffer = cpu_buffer;
3751 3398
3752 atomic_inc(&buffer->resize_disabled);
3753 atomic_inc(&cpu_buffer->record_disabled); 3399 atomic_inc(&cpu_buffer->record_disabled);
3754 3400
3755 return iter; 3401 return iter;
@@ -3792,11 +3438,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
3792 3438
3793 cpu_buffer = iter->cpu_buffer; 3439 cpu_buffer = iter->cpu_buffer;
3794 3440
3795 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3441 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3796 arch_spin_lock(&cpu_buffer->lock); 3442 arch_spin_lock(&cpu_buffer->lock);
3797 rb_iter_reset(iter); 3443 rb_iter_reset(iter);
3798 arch_spin_unlock(&cpu_buffer->lock); 3444 arch_spin_unlock(&cpu_buffer->lock);
3799 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3445 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3800} 3446}
3801EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3447EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3802 3448
@@ -3811,20 +3457,8 @@ void
3811ring_buffer_read_finish(struct ring_buffer_iter *iter) 3457ring_buffer_read_finish(struct ring_buffer_iter *iter)
3812{ 3458{
3813 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3459 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3814 unsigned long flags;
3815
3816 /*
3817 * Ring buffer is disabled from recording, here's a good place
3818 * to check the integrity of the ring buffer.
3819 * Must prevent readers from trying to read, as the check
3820 * clears the HEAD page and readers require it.
3821 */
3822 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3823 rb_check_pages(cpu_buffer);
3824 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3825 3460
3826 atomic_dec(&cpu_buffer->record_disabled); 3461 atomic_dec(&cpu_buffer->record_disabled);
3827 atomic_dec(&cpu_buffer->buffer->resize_disabled);
3828 kfree(iter); 3462 kfree(iter);
3829} 3463}
3830EXPORT_SYMBOL_GPL(ring_buffer_read_finish); 3464EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
@@ -3843,7 +3477,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3843 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3477 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3844 unsigned long flags; 3478 unsigned long flags;
3845 3479
3846 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3480 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3847 again: 3481 again:
3848 event = rb_iter_peek(iter, ts); 3482 event = rb_iter_peek(iter, ts);
3849 if (!event) 3483 if (!event)
@@ -3854,7 +3488,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3854 3488
3855 rb_advance_iter(iter); 3489 rb_advance_iter(iter);
3856 out: 3490 out:
3857 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3491 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3858 3492
3859 return event; 3493 return event;
3860} 3494}
@@ -3864,18 +3498,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);
3864 * ring_buffer_size - return the size of the ring buffer (in bytes) 3498 * ring_buffer_size - return the size of the ring buffer (in bytes)
3865 * @buffer: The ring buffer. 3499 * @buffer: The ring buffer.
3866 */ 3500 */
3867unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) 3501unsigned long ring_buffer_size(struct ring_buffer *buffer)
3868{ 3502{
3869 /* 3503 return BUF_PAGE_SIZE * buffer->pages;
3870 * Earlier, this method returned
3871 * BUF_PAGE_SIZE * buffer->nr_pages
3872 * Since the nr_pages field is now removed, we have converted this to
3873 * return the per cpu buffer value.
3874 */
3875 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3876 return 0;
3877
3878 return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
3879} 3504}
3880EXPORT_SYMBOL_GPL(ring_buffer_size); 3505EXPORT_SYMBOL_GPL(ring_buffer_size);
3881 3506
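For context, the left-hand ring_buffer_size() is per CPU because per-CPU buffers may hold different page counts once RING_BUFFER_ALL_CPUS resizing exists, so a whole-buffer figure has to be summed; the right-hand version returns to a single global page count. A small sketch of that summation with an invented helper name, assuming the in-file for_each_buffer_cpu() macro:

/* Illustrative: total bytes across all CPU buffers with the per-CPU API. */
static unsigned long example_total_buffer_size(struct ring_buffer *buffer)
{
	unsigned long total = 0;
	int cpu;

	for_each_buffer_cpu(buffer, cpu)
		total += ring_buffer_size(buffer, cpu);	/* per-CPU variant */

	return total;
}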
@@ -3896,21 +3521,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3896 cpu_buffer->commit_page = cpu_buffer->head_page; 3521 cpu_buffer->commit_page = cpu_buffer->head_page;
3897 3522
3898 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 3523 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
3899 INIT_LIST_HEAD(&cpu_buffer->new_pages);
3900 local_set(&cpu_buffer->reader_page->write, 0); 3524 local_set(&cpu_buffer->reader_page->write, 0);
3901 local_set(&cpu_buffer->reader_page->entries, 0); 3525 local_set(&cpu_buffer->reader_page->entries, 0);
3902 local_set(&cpu_buffer->reader_page->page->commit, 0); 3526 local_set(&cpu_buffer->reader_page->page->commit, 0);
3903 cpu_buffer->reader_page->read = 0; 3527 cpu_buffer->reader_page->read = 0;
3904 3528
3905 local_set(&cpu_buffer->entries_bytes, 0);
3906 local_set(&cpu_buffer->overrun, 0);
3907 local_set(&cpu_buffer->commit_overrun, 0); 3529 local_set(&cpu_buffer->commit_overrun, 0);
3908 local_set(&cpu_buffer->dropped_events, 0); 3530 local_set(&cpu_buffer->overrun, 0);
3909 local_set(&cpu_buffer->entries, 0); 3531 local_set(&cpu_buffer->entries, 0);
3910 local_set(&cpu_buffer->committing, 0); 3532 local_set(&cpu_buffer->committing, 0);
3911 local_set(&cpu_buffer->commits, 0); 3533 local_set(&cpu_buffer->commits, 0);
3912 cpu_buffer->read = 0; 3534 cpu_buffer->read = 0;
3913 cpu_buffer->read_bytes = 0;
3914 3535
3915 cpu_buffer->write_stamp = 0; 3536 cpu_buffer->write_stamp = 0;
3916 cpu_buffer->read_stamp = 0; 3537 cpu_buffer->read_stamp = 0;
@@ -3934,13 +3555,9 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3934 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3555 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3935 return; 3556 return;
3936 3557
3937 atomic_inc(&buffer->resize_disabled);
3938 atomic_inc(&cpu_buffer->record_disabled); 3558 atomic_inc(&cpu_buffer->record_disabled);
3939 3559
3940 /* Make sure all commits have finished */ 3560 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3941 synchronize_sched();
3942
3943 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3944 3561
3945 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3562 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3946 goto out; 3563 goto out;
@@ -3952,10 +3569,9 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3952 arch_spin_unlock(&cpu_buffer->lock); 3569 arch_spin_unlock(&cpu_buffer->lock);
3953 3570
3954 out: 3571 out:
3955 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3572 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3956 3573
3957 atomic_dec(&cpu_buffer->record_disabled); 3574 atomic_dec(&cpu_buffer->record_disabled);
3958 atomic_dec(&buffer->resize_disabled);
3959} 3575}
3960EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); 3576EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
3961 3577
@@ -3991,10 +3607,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)
3991 cpu_buffer = buffer->buffers[cpu]; 3607 cpu_buffer = buffer->buffers[cpu];
3992 local_irq_save(flags); 3608 local_irq_save(flags);
3993 if (dolock) 3609 if (dolock)
3994 raw_spin_lock(&cpu_buffer->reader_lock); 3610 spin_lock(&cpu_buffer->reader_lock);
3995 ret = rb_per_cpu_empty(cpu_buffer); 3611 ret = rb_per_cpu_empty(cpu_buffer);
3996 if (dolock) 3612 if (dolock)
3997 raw_spin_unlock(&cpu_buffer->reader_lock); 3613 spin_unlock(&cpu_buffer->reader_lock);
3998 local_irq_restore(flags); 3614 local_irq_restore(flags);
3999 3615
4000 if (!ret) 3616 if (!ret)
@@ -4025,10 +3641,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
4025 cpu_buffer = buffer->buffers[cpu]; 3641 cpu_buffer = buffer->buffers[cpu];
4026 local_irq_save(flags); 3642 local_irq_save(flags);
4027 if (dolock) 3643 if (dolock)
4028 raw_spin_lock(&cpu_buffer->reader_lock); 3644 spin_lock(&cpu_buffer->reader_lock);
4029 ret = rb_per_cpu_empty(cpu_buffer); 3645 ret = rb_per_cpu_empty(cpu_buffer);
4030 if (dolock) 3646 if (dolock)
4031 raw_spin_unlock(&cpu_buffer->reader_lock); 3647 spin_unlock(&cpu_buffer->reader_lock);
4032 local_irq_restore(flags); 3648 local_irq_restore(flags);
4033 3649
4034 return ret; 3650 return ret;
@@ -4057,11 +3673,8 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
4057 !cpumask_test_cpu(cpu, buffer_b->cpumask)) 3673 !cpumask_test_cpu(cpu, buffer_b->cpumask))
4058 goto out; 3674 goto out;
4059 3675
4060 cpu_buffer_a = buffer_a->buffers[cpu];
4061 cpu_buffer_b = buffer_b->buffers[cpu];
4062
4063 /* At least make sure the two buffers are somewhat the same */ 3676 /* At least make sure the two buffers are somewhat the same */
4064 if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) 3677 if (buffer_a->pages != buffer_b->pages)
4065 goto out; 3678 goto out;
4066 3679
4067 ret = -EAGAIN; 3680 ret = -EAGAIN;
@@ -4075,6 +3688,9 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
4075 if (atomic_read(&buffer_b->record_disabled)) 3688 if (atomic_read(&buffer_b->record_disabled))
4076 goto out; 3689 goto out;
4077 3690
3691 cpu_buffer_a = buffer_a->buffers[cpu];
3692 cpu_buffer_b = buffer_b->buffers[cpu];
3693
4078 if (atomic_read(&cpu_buffer_a->record_disabled)) 3694 if (atomic_read(&cpu_buffer_a->record_disabled))
4079 goto out; 3695 goto out;
4080 3696
@@ -4225,7 +3841,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
4225 if (!bpage) 3841 if (!bpage)
4226 goto out; 3842 goto out;
4227 3843
4228 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3844 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
4229 3845
4230 reader = rb_get_reader_page(cpu_buffer); 3846 reader = rb_get_reader_page(cpu_buffer);
4231 if (!reader) 3847 if (!reader)
@@ -4302,7 +3918,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
4302 } else { 3918 } else {
4303 /* update the entry counter */ 3919 /* update the entry counter */
4304 cpu_buffer->read += rb_page_entries(reader); 3920 cpu_buffer->read += rb_page_entries(reader);
4305 cpu_buffer->read_bytes += BUF_PAGE_SIZE;
4306 3921
4307 /* swap the pages */ 3922 /* swap the pages */
4308 rb_init_page(bpage); 3923 rb_init_page(bpage);
@@ -4349,13 +3964,75 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
4349 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); 3964 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
4350 3965
4351 out_unlock: 3966 out_unlock:
4352 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3967 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
4353 3968
4354 out: 3969 out:
4355 return ret; 3970 return ret;
4356} 3971}
4357EXPORT_SYMBOL_GPL(ring_buffer_read_page); 3972EXPORT_SYMBOL_GPL(ring_buffer_read_page);
4358 3973
3974#ifdef CONFIG_TRACING
3975static ssize_t
3976rb_simple_read(struct file *filp, char __user *ubuf,
3977 size_t cnt, loff_t *ppos)
3978{
3979 unsigned long *p = filp->private_data;
3980 char buf[64];
3981 int r;
3982
3983 if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
3984 r = sprintf(buf, "permanently disabled\n");
3985 else
3986 r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
3987
3988 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3989}
3990
3991static ssize_t
3992rb_simple_write(struct file *filp, const char __user *ubuf,
3993 size_t cnt, loff_t *ppos)
3994{
3995 unsigned long *p = filp->private_data;
3996 unsigned long val;
3997 int ret;
3998
3999 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4000 if (ret)
4001 return ret;
4002
4003 if (val)
4004 set_bit(RB_BUFFERS_ON_BIT, p);
4005 else
4006 clear_bit(RB_BUFFERS_ON_BIT, p);
4007
4008 (*ppos)++;
4009
4010 return cnt;
4011}
4012
4013static const struct file_operations rb_simple_fops = {
4014 .open = tracing_open_generic,
4015 .read = rb_simple_read,
4016 .write = rb_simple_write,
4017 .llseek = default_llseek,
4018};
4019
4020
4021static __init int rb_init_debugfs(void)
4022{
4023 struct dentry *d_tracer;
4024
4025 d_tracer = tracing_init_dentry();
4026
4027 trace_create_file("tracing_on", 0644, d_tracer,
4028 &ring_buffer_flags, &rb_simple_fops);
4029
4030 return 0;
4031}
4032
4033fs_initcall(rb_init_debugfs);
4034#endif
4035
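The rb_simple_* handlers above back a debugfs file called tracing_on: reads report the current bit (or "permanently disabled"), writes parse a decimal value with kstrtoul_from_user() and set or clear RB_BUFFERS_ON_BIT. A userspace sketch, assuming debugfs is mounted at the usual /sys/kernel/debug and the file sits in the tracing directory created by tracing_init_dentry():

/* Illustrative userspace helper: report the current state, then flip the
 * ring-buffer switch off.  Error handling is kept minimal on purpose. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd = open("/sys/kernel/debug/tracing/tracing_on", O_RDWR);

	if (fd < 0) {
		perror("open tracing_on");
		return 1;
	}

	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("tracing_on reads back: %s", buf);	/* e.g. "1\n" */
	}

	if (write(fd, "0", 1) != 1)	/* clears RB_BUFFERS_ON_BIT */
		perror("write tracing_on");

	close(fd);
	return 0;
}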
4359#ifdef CONFIG_HOTPLUG_CPU 4036#ifdef CONFIG_HOTPLUG_CPU
4360static int rb_cpu_notify(struct notifier_block *self, 4037static int rb_cpu_notify(struct notifier_block *self,
4361 unsigned long action, void *hcpu) 4038 unsigned long action, void *hcpu)
@@ -4363,8 +4040,6 @@ static int rb_cpu_notify(struct notifier_block *self,
4363 struct ring_buffer *buffer = 4040 struct ring_buffer *buffer =
4364 container_of(self, struct ring_buffer, cpu_notify); 4041 container_of(self, struct ring_buffer, cpu_notify);
4365 long cpu = (long)hcpu; 4042 long cpu = (long)hcpu;
4366 int cpu_i, nr_pages_same;
4367 unsigned int nr_pages;
4368 4043
4369 switch (action) { 4044 switch (action) {
4370 case CPU_UP_PREPARE: 4045 case CPU_UP_PREPARE:
@@ -4372,23 +4047,8 @@ static int rb_cpu_notify(struct notifier_block *self,
4372 if (cpumask_test_cpu(cpu, buffer->cpumask)) 4047 if (cpumask_test_cpu(cpu, buffer->cpumask))
4373 return NOTIFY_OK; 4048 return NOTIFY_OK;
4374 4049
4375 nr_pages = 0;
4376 nr_pages_same = 1;
4377 /* check if all cpu sizes are same */
4378 for_each_buffer_cpu(buffer, cpu_i) {
4379 /* fill in the size from first enabled cpu */
4380 if (nr_pages == 0)
4381 nr_pages = buffer->buffers[cpu_i]->nr_pages;
4382 if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
4383 nr_pages_same = 0;
4384 break;
4385 }
4386 }
4387 /* allocate minimum pages, user can later expand it */
4388 if (!nr_pages_same)
4389 nr_pages = 2;
4390 buffer->buffers[cpu] = 4050 buffer->buffers[cpu] =
4391 rb_allocate_cpu_buffer(buffer, nr_pages, cpu); 4051 rb_allocate_cpu_buffer(buffer, cpu);
4392 if (!buffer->buffers[cpu]) { 4052 if (!buffer->buffers[cpu]) {
4393 WARN(1, "failed to allocate ring buffer on CPU %ld\n", 4053 WARN(1, "failed to allocate ring buffer on CPU %ld\n",
4394 cpu); 4054 cpu);
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
deleted file mode 100644
index 4b3b5eaf94d..00000000000
--- a/kernel/trace/rpm-traces.c
+++ /dev/null
@@ -1,20 +0,0 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/usb.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/rpm.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int);
18EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle);
19EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend);
20EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3c13e46d7d2..17a2d44e1af 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * Based on code from the latency_tracer, that is: 10 * Based on code from the latency_tracer, that is:
11 * Copyright (C) 2004-2006 Ingo Molnar 11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 Nadia Yvette Chambers 12 * Copyright (C) 2004 William Lee Irwin III
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <generated/utsrelease.h> 15#include <generated/utsrelease.h>
@@ -19,7 +19,6 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
24#include <linux/pagemap.h> 23#include <linux/pagemap.h>
25#include <linux/hardirq.h> 24#include <linux/hardirq.h>
@@ -37,7 +36,6 @@
37#include <linux/ctype.h> 36#include <linux/ctype.h>
38#include <linux/init.h> 37#include <linux/init.h>
39#include <linux/poll.h> 38#include <linux/poll.h>
40#include <linux/nmi.h>
41#include <linux/fs.h> 39#include <linux/fs.h>
42 40
43#include "trace.h" 41#include "trace.h"
@@ -79,21 +77,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
79} 77}
80 78
81/* 79/*
82 * To prevent the comm cache from being overwritten when no
83 * tracing is active, only save the comm when a trace event
84 * occurred.
85 */
86static DEFINE_PER_CPU(bool, trace_cmdline_save);
87
88/*
89 * When a reader is waiting for data, then this variable is
90 * set to true.
91 */
92static bool trace_wakeup_needed;
93
94static struct irq_work trace_work_wakeup;
95
96/*
97 * Kill all tracing for good (never come back). 80 * Kill all tracing for good (never come back).
98 * It is initialized to 1 but will turn to zero if the initialization 81 * It is initialized to 1 but will turn to zero if the initialization
99 * of the tracer is successful. But that is the only place that sets 82 * of the tracer is successful. But that is the only place that sets
@@ -103,6 +86,18 @@ static int tracing_disabled = 1;
103 86
104DEFINE_PER_CPU(int, ftrace_cpu_disabled); 87DEFINE_PER_CPU(int, ftrace_cpu_disabled);
105 88
89static inline void ftrace_disable_cpu(void)
90{
91 preempt_disable();
92 __this_cpu_inc(ftrace_cpu_disabled);
93}
94
95static inline void ftrace_enable_cpu(void)
96{
97 __this_cpu_dec(ftrace_cpu_disabled);
98 preempt_enable();
99}
100
106cpumask_var_t __read_mostly tracing_buffer_mask; 101cpumask_var_t __read_mostly tracing_buffer_mask;
107 102
108/* 103/*
@@ -155,18 +150,6 @@ static int __init set_ftrace_dump_on_oops(char *str)
155} 150}
156__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 151__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
157 152
158
159static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
160static char *trace_boot_options __initdata;
161
162static int __init set_trace_boot_options(char *str)
163{
164 strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
165 trace_boot_options = trace_boot_options_buf;
166 return 0;
167}
168__setup("trace_options=", set_trace_boot_options);
169
170unsigned long long ns2usecs(cycle_t nsec) 153unsigned long long ns2usecs(cycle_t nsec)
171{ 154{
172 nsec += 500; 155 nsec += 500;
@@ -226,9 +209,20 @@ static struct trace_array max_tr;
226 209
227static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); 210static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
228 211
212/* tracer_enabled is used to toggle activation of a tracer */
213static int tracer_enabled = 1;
214
215/**
216 * tracing_is_enabled - return tracer_enabled status
217 *
218 * This function is used by other tracers to know the status
219 * of the tracer_enabled flag. Tracers may use this function
220 * to know if it should enable their features when starting
221 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
222 */
229int tracing_is_enabled(void) 223int tracing_is_enabled(void)
230{ 224{
231 return tracing_is_on(); 225 return tracer_enabled;
232} 226}
233 227
234/* 228/*
@@ -344,77 +338,33 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
345unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
346 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
347 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
348 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS;
349 342
350static int trace_stop_count; 343static int trace_stop_count;
351static DEFINE_RAW_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
352 345
353/** 346static void wakeup_work_handler(struct work_struct *work)
354 * trace_wake_up - wake up tasks waiting for trace input
355 *
356 * Schedules a delayed work to wake up any task that is blocked on the
 357 * trace_wait queue. This is used with trace_poll for tasks polling the
358 * trace.
359 */
360static void trace_wake_up(struct irq_work *work)
361{ 347{
362 wake_up_all(&trace_wait); 348 wake_up(&trace_wait);
363
364} 349}
365 350
366/** 351static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
367 * tracing_on - enable tracing buffers
368 *
369 * This function enables tracing buffers that may have been
370 * disabled with tracing_off.
371 */
372void tracing_on(void)
373{
374 if (global_trace.buffer)
375 ring_buffer_record_on(global_trace.buffer);
376 /*
377 * This flag is only looked at when buffers haven't been
378 * allocated yet. We don't really care about the race
379 * between setting this flag and actually turning
380 * on the buffer.
381 */
382 global_trace.buffer_disabled = 0;
383}
384EXPORT_SYMBOL_GPL(tracing_on);
385 352
386/** 353/**
387 * tracing_off - turn off tracing buffers 354 * trace_wake_up - wake up tasks waiting for trace input
388 * 355 *
389 * This function stops the tracing buffers from recording data. 356 * Schedules a delayed work to wake up any task that is blocked on the
 390 * It does not disable any overhead the tracers themselves may 357 * trace_wait queue. This is used with trace_poll for tasks polling the
391 * be causing. This function simply causes all recording to 358 * trace.
392 * the ring buffers to fail.
393 */ 359 */
394void tracing_off(void) 360void trace_wake_up(void)
395{ 361{
396 if (global_trace.buffer) 362 const unsigned long delay = msecs_to_jiffies(2);
397 ring_buffer_record_off(global_trace.buffer);
398 /*
399 * This flag is only looked at when buffers haven't been
400 * allocated yet. We don't really care about the race
401 * between setting this flag and actually turning
402 * on the buffer.
403 */
404 global_trace.buffer_disabled = 1;
405}
406EXPORT_SYMBOL_GPL(tracing_off);
407 363
408/** 364 if (trace_flags & TRACE_ITER_BLOCK)
409 * tracing_is_on - show state of ring buffers enabled 365 return;
410 */ 366 schedule_delayed_work(&wakeup_work, delay);
411int tracing_is_on(void)
412{
413 if (global_trace.buffer)
414 return ring_buffer_record_is_on(global_trace.buffer);
415 return !global_trace.buffer_disabled;
416} 367}
417EXPORT_SYMBOL_GPL(tracing_is_on);
418 368
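In the right-hand column the irq_work based wakeup is replaced by a delayed workqueue item: trace_wake_up() only schedules wakeup_work about two milliseconds out, so the wake_up() itself never runs on the tracing hot path. A minimal sketch of that deferred-wakeup pattern with invented names; only the waitqueue and workqueue APIs are real:

#include <linux/jiffies.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

static DECLARE_WAIT_QUEUE_HEAD(example_readers);

static void example_wakeup_fn(struct work_struct *work)
{
	wake_up(&example_readers);		/* runs later, in process context */
}

static DECLARE_DELAYED_WORK(example_wakeup, example_wakeup_fn);

static void example_poke_readers(void)
{
	/* cheap to call from the write path; the wakeup happens ~2ms later */
	schedule_delayed_work(&example_wakeup, msecs_to_jiffies(2));
}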
419static int __init set_buf_size(char *str) 369static int __init set_buf_size(char *str)
420{ 370{
@@ -433,15 +383,15 @@ __setup("trace_buf_size=", set_buf_size);
433 383
434static int __init set_tracing_thresh(char *str) 384static int __init set_tracing_thresh(char *str)
435{ 385{
436 unsigned long threshold; 386 unsigned long threshhold;
437 int ret; 387 int ret;
438 388
439 if (!str) 389 if (!str)
440 return 0; 390 return 0;
441 ret = kstrtoul(str, 0, &threshold); 391 ret = strict_strtoul(str, 0, &threshhold);
442 if (ret < 0) 392 if (ret < 0)
443 return 0; 393 return 0;
444 tracing_thresh = threshold * 1000; 394 tracing_thresh = threshhold * 1000;
445 return 1; 395 return 1;
446} 396}
447__setup("tracing_thresh=", set_tracing_thresh); 397__setup("tracing_thresh=", set_tracing_thresh);
@@ -476,20 +426,15 @@ static const char *trace_options[] = {
476 "record-cmd", 426 "record-cmd",
477 "overwrite", 427 "overwrite",
478 "disable_on_free", 428 "disable_on_free",
479 "irq-info",
480 "markers",
481 NULL 429 NULL
482}; 430};
483 431
484static struct { 432static struct {
485 u64 (*func)(void); 433 u64 (*func)(void);
486 const char *name; 434 const char *name;
487 int in_ns; /* is this clock in nanoseconds? */
488} trace_clocks[] = { 435} trace_clocks[] = {
489 { trace_clock_local, "local", 1 }, 436 { trace_clock_local, "local" },
490 { trace_clock_global, "global", 1 }, 437 { trace_clock_global, "global" },
491 { trace_clock_counter, "counter", 0 },
492 ARCH_TRACE_CLOCKS
493}; 438};
494 439
495int trace_clock_id; 440int trace_clock_id;
@@ -627,6 +572,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
627static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 572static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
628{ 573{
629 int len; 574 int len;
575 void *ret;
630 576
631 if (s->len <= s->readpos) 577 if (s->len <= s->readpos)
632 return -EBUSY; 578 return -EBUSY;
@@ -634,7 +580,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
634 len = s->len - s->readpos; 580 len = s->len - s->readpos;
635 if (cnt > len) 581 if (cnt > len)
636 cnt = len; 582 cnt = len;
637 memcpy(buf, s->buffer + s->readpos, cnt); 583 ret = memcpy(buf, s->buffer + s->readpos, cnt);
584 if (!ret)
585 return -EFAULT;
638 586
639 s->readpos += cnt; 587 s->readpos += cnt;
640 return cnt; 588 return cnt;
@@ -746,6 +694,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
746 694
747 arch_spin_lock(&ftrace_max_lock); 695 arch_spin_lock(&ftrace_max_lock);
748 696
697 ftrace_disable_cpu();
698
749 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 699 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
750 700
751 if (ret == -EBUSY) { 701 if (ret == -EBUSY) {
@@ -759,6 +709,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
759 "Failed to swap buffers due to commit in progress\n"); 709 "Failed to swap buffers due to commit in progress\n");
760 } 710 }
761 711
712 ftrace_enable_cpu();
713
762 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 714 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
763 715
764 __update_max_tr(tr, tsk, cpu); 716 __update_max_tr(tr, tsk, cpu);
@@ -766,40 +718,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
766} 718}
767#endif /* CONFIG_TRACER_MAX_TRACE */ 719#endif /* CONFIG_TRACER_MAX_TRACE */
768 720
769static void default_wait_pipe(struct trace_iterator *iter)
770{
771 DEFINE_WAIT(wait);
772
773 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
774
775 /*
776 * The events can happen in critical sections where
777 * checking a work queue can cause deadlocks.
778 * After adding a task to the queue, this flag is set
779 * only to notify events to try to wake up the queue
780 * using irq_work.
781 *
782 * We don't clear it even if the buffer is no longer
783 * empty. The flag only causes the next event to run
784 * irq_work to do the work queue wake up. The worse
785 * that can happen if we race with !trace_empty() is that
786 * an event will cause an irq_work to try to wake up
787 * an empty queue.
788 *
789 * There's no reason to protect this flag either, as
790 * the work queue and irq_work logic will do the necessary
791 * synchronization for the wake ups. The only thing
792 * that is necessary is that the wake up happens after
793 * a task has been queued. It's OK for spurious wake ups.
794 */
795 trace_wakeup_needed = true;
796
797 if (trace_empty(iter))
798 schedule();
799
800 finish_wait(&trace_wait, &wait);
801}
802
803/** 721/**
804 * register_tracer - register a tracer with the ftrace system. 722 * register_tracer - register a tracer with the ftrace system.
805 * @type - the plugin for the tracer 723 * @type - the plugin for the tracer
@@ -807,6 +725,8 @@ static void default_wait_pipe(struct trace_iterator *iter)
807 * Register a new plugin tracer. 725 * Register a new plugin tracer.
808 */ 726 */
809int register_tracer(struct tracer *type) 727int register_tracer(struct tracer *type)
728__releases(kernel_lock)
729__acquires(kernel_lock)
810{ 730{
811 struct tracer *t; 731 struct tracer *t;
812 int ret = 0; 732 int ret = 0;
@@ -864,8 +784,7 @@ int register_tracer(struct tracer *type)
864 784
865 /* If we expanded the buffers, make sure the max is expanded too */ 785 /* If we expanded the buffers, make sure the max is expanded too */
866 if (ring_buffer_expanded && type->use_max_tr) 786 if (ring_buffer_expanded && type->use_max_tr)
867 ring_buffer_resize(max_tr.buffer, trace_buf_size, 787 ring_buffer_resize(max_tr.buffer, trace_buf_size);
868 RING_BUFFER_ALL_CPUS);
869 788
870 /* the test is responsible for initializing and enabling */ 789 /* the test is responsible for initializing and enabling */
871 pr_info("Testing tracer %s: ", type->name); 790 pr_info("Testing tracer %s: ", type->name);
@@ -874,8 +793,6 @@ int register_tracer(struct tracer *type)
874 current_trace = saved_tracer; 793 current_trace = saved_tracer;
875 if (ret) { 794 if (ret) {
876 printk(KERN_CONT "FAILED!\n"); 795 printk(KERN_CONT "FAILED!\n");
877 /* Add the warning after printing 'FAILED' */
878 WARN_ON(1);
879 goto out; 796 goto out;
880 } 797 }
881 /* Only reset on passing, to avoid touching corrupted buffers */ 798 /* Only reset on passing, to avoid touching corrupted buffers */
@@ -883,8 +800,7 @@ int register_tracer(struct tracer *type)
883 800
884 /* Shrink the max buffer again */ 801 /* Shrink the max buffer again */
885 if (ring_buffer_expanded && type->use_max_tr) 802 if (ring_buffer_expanded && type->use_max_tr)
886 ring_buffer_resize(max_tr.buffer, 1, 803 ring_buffer_resize(max_tr.buffer, 1);
887 RING_BUFFER_ALL_CPUS);
888 804
889 printk(KERN_CONT "PASSED\n"); 805 printk(KERN_CONT "PASSED\n");
890 } 806 }
@@ -918,6 +834,39 @@ int register_tracer(struct tracer *type)
918 return ret; 834 return ret;
919} 835}
920 836
837void unregister_tracer(struct tracer *type)
838{
839 struct tracer **t;
840
841 mutex_lock(&trace_types_lock);
842 for (t = &trace_types; *t; t = &(*t)->next) {
843 if (*t == type)
844 goto found;
845 }
846 pr_info("Tracer %s not registered\n", type->name);
847 goto out;
848
849 found:
850 *t = (*t)->next;
851
852 if (type == current_trace && tracer_enabled) {
853 tracer_enabled = 0;
854 tracing_stop();
855 if (current_trace->stop)
856 current_trace->stop(&global_trace);
857 current_trace = &nop_trace;
858 }
859out:
860 mutex_unlock(&trace_types_lock);
861}
862
863static void __tracing_reset(struct ring_buffer *buffer, int cpu)
864{
865 ftrace_disable_cpu();
866 ring_buffer_reset_cpu(buffer, cpu);
867 ftrace_enable_cpu();
868}
869
921void tracing_reset(struct trace_array *tr, int cpu) 870void tracing_reset(struct trace_array *tr, int cpu)
922{ 871{
923 struct ring_buffer *buffer = tr->buffer; 872 struct ring_buffer *buffer = tr->buffer;
@@ -926,7 +875,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
926 875
927 /* Make sure all commits have finished */ 876 /* Make sure all commits have finished */
928 synchronize_sched(); 877 synchronize_sched();
929 ring_buffer_reset_cpu(buffer, cpu); 878 __tracing_reset(buffer, cpu);
930 879
931 ring_buffer_record_enable(buffer); 880 ring_buffer_record_enable(buffer);
932} 881}
@@ -944,7 +893,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
944 tr->time_start = ftrace_now(tr->cpu); 893 tr->time_start = ftrace_now(tr->cpu);
945 894
946 for_each_online_cpu(cpu) 895 for_each_online_cpu(cpu)
947 ring_buffer_reset_cpu(buffer, cpu); 896 __tracing_reset(buffer, cpu);
948 897
949 ring_buffer_record_enable(buffer); 898 ring_buffer_record_enable(buffer);
950} 899}
@@ -1011,7 +960,7 @@ void tracing_start(void)
1011 if (tracing_disabled) 960 if (tracing_disabled)
1012 return; 961 return;
1013 962
1014 raw_spin_lock_irqsave(&tracing_start_lock, flags); 963 spin_lock_irqsave(&tracing_start_lock, flags);
1015 if (--trace_stop_count) { 964 if (--trace_stop_count) {
1016 if (trace_stop_count < 0) { 965 if (trace_stop_count < 0) {
1017 /* Someone screwed up their debugging */ 966 /* Someone screwed up their debugging */
@@ -1036,7 +985,7 @@ void tracing_start(void)
1036 985
1037 ftrace_start(); 986 ftrace_start();
1038 out: 987 out:
1039 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 988 spin_unlock_irqrestore(&tracing_start_lock, flags);
1040} 989}
1041 990
1042/** 991/**
@@ -1051,7 +1000,7 @@ void tracing_stop(void)
1051 unsigned long flags; 1000 unsigned long flags;
1052 1001
1053 ftrace_stop(); 1002 ftrace_stop();
1054 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1003 spin_lock_irqsave(&tracing_start_lock, flags);
1055 if (trace_stop_count++) 1004 if (trace_stop_count++)
1056 goto out; 1005 goto out;
1057 1006
@@ -1069,7 +1018,7 @@ void tracing_stop(void)
1069 arch_spin_unlock(&ftrace_max_lock); 1018 arch_spin_unlock(&ftrace_max_lock);
1070 1019
1071 out: 1020 out:
1072 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1021 spin_unlock_irqrestore(&tracing_start_lock, flags);
1073} 1022}
1074 1023
1075void trace_stop_cmdline_recording(void); 1024void trace_stop_cmdline_recording(void);
@@ -1148,14 +1097,10 @@ void trace_find_cmdline(int pid, char comm[])
1148 1097
1149void tracing_record_cmdline(struct task_struct *tsk) 1098void tracing_record_cmdline(struct task_struct *tsk)
1150{ 1099{
1151 if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) 1100 if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled ||
1101 !tracing_is_on())
1152 return; 1102 return;
1153 1103
1154 if (!__this_cpu_read(trace_cmdline_save))
1155 return;
1156
1157 __this_cpu_write(trace_cmdline_save, false);
1158
1159 trace_save_cmdline(tsk); 1104 trace_save_cmdline(tsk);
1160} 1105}
1161 1106
@@ -1199,36 +1144,27 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
1199 return event; 1144 return event;
1200} 1145}
1201 1146
1202void
1203__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
1204{
1205 __this_cpu_write(trace_cmdline_save, true);
1206 if (trace_wakeup_needed) {
1207 trace_wakeup_needed = false;
1208 /* irq_work_queue() supplies it's own memory barriers */
1209 irq_work_queue(&trace_work_wakeup);
1210 }
1211 ring_buffer_unlock_commit(buffer, event);
1212}
1213
1214static inline void 1147static inline void
1215__trace_buffer_unlock_commit(struct ring_buffer *buffer, 1148__trace_buffer_unlock_commit(struct ring_buffer *buffer,
1216 struct ring_buffer_event *event, 1149 struct ring_buffer_event *event,
1217 unsigned long flags, int pc) 1150 unsigned long flags, int pc,
1151 int wake)
1218{ 1152{
1219 __buffer_unlock_commit(buffer, event); 1153 ring_buffer_unlock_commit(buffer, event);
1220 1154
1221 ftrace_trace_stack(buffer, flags, 6, pc); 1155 ftrace_trace_stack(buffer, flags, 6, pc);
1222 ftrace_trace_userstack(buffer, flags, pc); 1156 ftrace_trace_userstack(buffer, flags, pc);
1157
1158 if (wake)
1159 trace_wake_up();
1223} 1160}
1224 1161
1225void trace_buffer_unlock_commit(struct ring_buffer *buffer, 1162void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1226 struct ring_buffer_event *event, 1163 struct ring_buffer_event *event,
1227 unsigned long flags, int pc) 1164 unsigned long flags, int pc)
1228{ 1165{
1229 __trace_buffer_unlock_commit(buffer, event, flags, pc); 1166 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
1230} 1167}
1231EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1232 1168
1233struct ring_buffer_event * 1169struct ring_buffer_event *
1234trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1170trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
@@ -1245,21 +1181,29 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
1245 struct ring_buffer_event *event, 1181 struct ring_buffer_event *event,
1246 unsigned long flags, int pc) 1182 unsigned long flags, int pc)
1247{ 1183{
1248 __trace_buffer_unlock_commit(buffer, event, flags, pc); 1184 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
1249} 1185}
1250EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 1186EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
1251 1187
1252void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, 1188void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
1253 struct ring_buffer_event *event, 1189 struct ring_buffer_event *event,
1254 unsigned long flags, int pc, 1190 unsigned long flags, int pc)
1255 struct pt_regs *regs) 1191{
1192 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
1193}
1194EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1195
1196void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event,
1198 unsigned long flags, int pc,
1199 struct pt_regs *regs)
1256{ 1200{
1257 __buffer_unlock_commit(buffer, event); 1201 ring_buffer_unlock_commit(buffer, event);
1258 1202
1259 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); 1203 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1260 ftrace_trace_userstack(buffer, flags, pc); 1204 ftrace_trace_userstack(buffer, flags, pc);
1261} 1205}
1262EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); 1206EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
1263 1207
1264void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1208void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1265 struct ring_buffer_event *event) 1209 struct ring_buffer_event *event)
@@ -1291,7 +1235,7 @@ trace_function(struct trace_array *tr,
1291 entry->parent_ip = parent_ip; 1235 entry->parent_ip = parent_ip;
1292 1236
1293 if (!filter_check_discard(call, entry, buffer, event)) 1237 if (!filter_check_discard(call, entry, buffer, event))
1294 __buffer_unlock_commit(buffer, event); 1238 ring_buffer_unlock_commit(buffer, event);
1295} 1239}
1296 1240
1297void 1241void
@@ -1384,7 +1328,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1384 entry->size = trace.nr_entries; 1328 entry->size = trace.nr_entries;
1385 1329
1386 if (!filter_check_discard(call, entry, buffer, event)) 1330 if (!filter_check_discard(call, entry, buffer, event))
1387 __buffer_unlock_commit(buffer, event); 1331 ring_buffer_unlock_commit(buffer, event);
1388 1332
1389 out: 1333 out:
1390 /* Again, don't let gcc optimize things here */ 1334 /* Again, don't let gcc optimize things here */
@@ -1480,7 +1424,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1480 1424
1481 save_stack_trace_user(&trace); 1425 save_stack_trace_user(&trace);
1482 if (!filter_check_discard(call, entry, buffer, event)) 1426 if (!filter_check_discard(call, entry, buffer, event))
1483 __buffer_unlock_commit(buffer, event); 1427 ring_buffer_unlock_commit(buffer, event);
1484 1428
1485 out_drop_count: 1429 out_drop_count:
1486 __this_cpu_dec(user_stack_count); 1430 __this_cpu_dec(user_stack_count);
@@ -1497,150 +1441,25 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1497 1441
1498#endif /* CONFIG_STACKTRACE */ 1442#endif /* CONFIG_STACKTRACE */
1499 1443
1500/* created for use with alloc_percpu */
1501struct trace_buffer_struct {
1502 char buffer[TRACE_BUF_SIZE];
1503};
1504
1505static struct trace_buffer_struct *trace_percpu_buffer;
1506static struct trace_buffer_struct *trace_percpu_sirq_buffer;
1507static struct trace_buffer_struct *trace_percpu_irq_buffer;
1508static struct trace_buffer_struct *trace_percpu_nmi_buffer;
1509
1510/*
1511 * The buffer used is dependent on the context. There is a per cpu
 1512 * buffer for normal context, softirq context, hard irq context and
 1513 * for NMI context. This allows for lockless recording.
1514 *
1515 * Note, if the buffers failed to be allocated, then this returns NULL
1516 */
1517static char *get_trace_buf(void)
1518{
1519 struct trace_buffer_struct *percpu_buffer;
1520 struct trace_buffer_struct *buffer;
1521
1522 /*
1523 * If we have allocated per cpu buffers, then we do not
1524 * need to do any locking.
1525 */
1526 if (in_nmi())
1527 percpu_buffer = trace_percpu_nmi_buffer;
1528 else if (in_irq())
1529 percpu_buffer = trace_percpu_irq_buffer;
1530 else if (in_softirq())
1531 percpu_buffer = trace_percpu_sirq_buffer;
1532 else
1533 percpu_buffer = trace_percpu_buffer;
1534
1535 if (!percpu_buffer)
1536 return NULL;
1537
1538 buffer = per_cpu_ptr(percpu_buffer, smp_processor_id());
1539
1540 return buffer->buffer;
1541}
1542
1543static int alloc_percpu_trace_buffer(void)
1544{
1545 struct trace_buffer_struct *buffers;
1546 struct trace_buffer_struct *sirq_buffers;
1547 struct trace_buffer_struct *irq_buffers;
1548 struct trace_buffer_struct *nmi_buffers;
1549
1550 buffers = alloc_percpu(struct trace_buffer_struct);
1551 if (!buffers)
1552 goto err_warn;
1553
1554 sirq_buffers = alloc_percpu(struct trace_buffer_struct);
1555 if (!sirq_buffers)
1556 goto err_sirq;
1557
1558 irq_buffers = alloc_percpu(struct trace_buffer_struct);
1559 if (!irq_buffers)
1560 goto err_irq;
1561
1562 nmi_buffers = alloc_percpu(struct trace_buffer_struct);
1563 if (!nmi_buffers)
1564 goto err_nmi;
1565
1566 trace_percpu_buffer = buffers;
1567 trace_percpu_sirq_buffer = sirq_buffers;
1568 trace_percpu_irq_buffer = irq_buffers;
1569 trace_percpu_nmi_buffer = nmi_buffers;
1570
1571 return 0;
1572
1573 err_nmi:
1574 free_percpu(irq_buffers);
1575 err_irq:
1576 free_percpu(sirq_buffers);
1577 err_sirq:
1578 free_percpu(buffers);
1579 err_warn:
1580 WARN(1, "Could not allocate percpu trace_printk buffer");
1581 return -ENOMEM;
1582}
1583
1584static int buffers_allocated;
1585
1586void trace_printk_init_buffers(void)
1587{
1588 if (buffers_allocated)
1589 return;
1590
1591 if (alloc_percpu_trace_buffer())
1592 return;
1593
1594 pr_info("ftrace: Allocated trace_printk buffers\n");
1595
1596 /* Expand the buffers to set size */
1597 tracing_update_buffers();
1598
1599 buffers_allocated = 1;
1600
1601 /*
1602 * trace_printk_init_buffers() can be called by modules.
1603 * If that happens, then we need to start cmdline recording
1604 * directly here. If the global_trace.buffer is already
1605 * allocated here, then this was called by module code.
1606 */
1607 if (global_trace.buffer)
1608 tracing_start_cmdline_record();
1609}
1610
1611void trace_printk_start_comm(void)
1612{
1613 /* Start tracing comms if trace printk is set */
1614 if (!buffers_allocated)
1615 return;
1616 tracing_start_cmdline_record();
1617}
1618
1619static void trace_printk_start_stop_comm(int enabled)
1620{
1621 if (!buffers_allocated)
1622 return;
1623
1624 if (enabled)
1625 tracing_start_cmdline_record();
1626 else
1627 tracing_stop_cmdline_record();
1628}
1629
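
The block removed in this hunk gave trace_printk() a dedicated scratch buffer per CPU and per context (normal, softirq, hard irq, NMI), so the formatting step never needs a lock. A rough userspace analogue of that idea, with per-thread storage standing in for the kernel's per-CPU, per-context buffers (illustrative sketch only, hypothetical names, build with cc -pthread):

#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>

#define SCRATCH_SIZE 1024

/* One private buffer per thread: no two concurrent writers ever share it. */
static __thread char scratch[SCRATCH_SIZE];

static int demo_printk(const char *fmt, ...)
{
        va_list args;
        int len;

        va_start(args, fmt);
        len = vsnprintf(scratch, SCRATCH_SIZE, fmt, args);
        va_end(args);

        /* The kernel would copy this into the ring buffer; just print it here. */
        fputs(scratch, stdout);
        return len;
}

static void *worker(void *arg)
{
        demo_printk("hello from thread %ld\n", (long)arg);
        return NULL;
}

int main(void)
{
        pthread_t t[4];

        for (long i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker, (void *)i);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        return 0;
}

The design point carries over directly: give every writer that can run concurrently its own buffer, and the formatting itself requires no synchronization at all.
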
1630/** 1444/**
1631 * trace_vbprintk - write binary msg to tracing buffer 1445 * trace_vbprintk - write binary msg to tracing buffer
1632 * 1446 *
1633 */ 1447 */
1634int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1448int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1635{ 1449{
1450 static arch_spinlock_t trace_buf_lock =
1451 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1452 static u32 trace_buf[TRACE_BUF_SIZE];
1453
1636 struct ftrace_event_call *call = &event_bprint; 1454 struct ftrace_event_call *call = &event_bprint;
1637 struct ring_buffer_event *event; 1455 struct ring_buffer_event *event;
1638 struct ring_buffer *buffer; 1456 struct ring_buffer *buffer;
1639 struct trace_array *tr = &global_trace; 1457 struct trace_array *tr = &global_trace;
1458 struct trace_array_cpu *data;
1640 struct bprint_entry *entry; 1459 struct bprint_entry *entry;
1641 unsigned long flags; 1460 unsigned long flags;
1642 char *tbuffer; 1461 int disable;
1643 int len = 0, size, pc; 1462 int cpu, len = 0, size, pc;
1644 1463
1645 if (unlikely(tracing_selftest_running || tracing_disabled)) 1464 if (unlikely(tracing_selftest_running || tracing_disabled))
1646 return 0; 1465 return 0;
@@ -1650,36 +1469,43 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1650 1469
1651 pc = preempt_count(); 1470 pc = preempt_count();
1652 preempt_disable_notrace(); 1471 preempt_disable_notrace();
1472 cpu = raw_smp_processor_id();
1473 data = tr->data[cpu];
1653 1474
1654 tbuffer = get_trace_buf(); 1475 disable = atomic_inc_return(&data->disabled);
1655 if (!tbuffer) { 1476 if (unlikely(disable != 1))
1656 len = 0;
1657 goto out; 1477 goto out;
1658 }
1659 1478
1660 len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); 1479 /* Lockdep uses trace_printk for lock tracing */
1480 local_irq_save(flags);
1481 arch_spin_lock(&trace_buf_lock);
1482 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1661 1483
1662 if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) 1484 if (len > TRACE_BUF_SIZE || len < 0)
1663 goto out; 1485 goto out_unlock;
1664 1486
1665 local_save_flags(flags);
1666 size = sizeof(*entry) + sizeof(u32) * len; 1487 size = sizeof(*entry) + sizeof(u32) * len;
1667 buffer = tr->buffer; 1488 buffer = tr->buffer;
1668 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, 1489 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1669 flags, pc); 1490 flags, pc);
1670 if (!event) 1491 if (!event)
1671 goto out; 1492 goto out_unlock;
1672 entry = ring_buffer_event_data(event); 1493 entry = ring_buffer_event_data(event);
1673 entry->ip = ip; 1494 entry->ip = ip;
1674 entry->fmt = fmt; 1495 entry->fmt = fmt;
1675 1496
1676 memcpy(entry->buf, tbuffer, sizeof(u32) * len); 1497 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1677 if (!filter_check_discard(call, entry, buffer, event)) { 1498 if (!filter_check_discard(call, entry, buffer, event)) {
1678 __buffer_unlock_commit(buffer, event); 1499 ring_buffer_unlock_commit(buffer, event);
1679 ftrace_trace_stack(buffer, flags, 6, pc); 1500 ftrace_trace_stack(buffer, flags, 6, pc);
1680 } 1501 }
1681 1502
1503out_unlock:
1504 arch_spin_unlock(&trace_buf_lock);
1505 local_irq_restore(flags);
1506
1682out: 1507out:
1508 atomic_dec_return(&data->disabled);
1683 preempt_enable_notrace(); 1509 preempt_enable_notrace();
1684 unpause_graph_tracing(); 1510 unpause_graph_tracing();
1685 1511
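
In the restored version of trace_vbprintk() above, re-entrancy is handled with the per-cpu "disabled" counter: the first writer's atomic_inc_return() yields 1 and proceeds, while any nested writer (for example lockdep calling back into trace_printk) sees a larger value and drops its event rather than recursing into trace_buf_lock. A minimal userspace sketch of that guard, assuming C11 atomics and hypothetical names:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int disabled;

static int record_event(const char *msg)
{
        /* First writer brings the count to 1; nested writers see more and bail. */
        if (atomic_fetch_add(&disabled, 1) + 1 != 1) {
                atomic_fetch_sub(&disabled, 1);
                return 0;                       /* event dropped */
        }

        printf("event: %s\n", msg);             /* stands in for the buffer write */

        atomic_fetch_sub(&disabled, 1);
        return 1;
}

int main(void)
{
        record_event("normal path");
        return 0;
}
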
@@ -1705,53 +1531,58 @@ int trace_array_printk(struct trace_array *tr,
1705int trace_array_vprintk(struct trace_array *tr, 1531int trace_array_vprintk(struct trace_array *tr,
1706 unsigned long ip, const char *fmt, va_list args) 1532 unsigned long ip, const char *fmt, va_list args)
1707{ 1533{
1534 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1535 static char trace_buf[TRACE_BUF_SIZE];
1536
1708 struct ftrace_event_call *call = &event_print; 1537 struct ftrace_event_call *call = &event_print;
1709 struct ring_buffer_event *event; 1538 struct ring_buffer_event *event;
1710 struct ring_buffer *buffer; 1539 struct ring_buffer *buffer;
1711 int len = 0, size, pc; 1540 struct trace_array_cpu *data;
1541 int cpu, len = 0, size, pc;
1712 struct print_entry *entry; 1542 struct print_entry *entry;
1713 unsigned long flags; 1543 unsigned long irq_flags;
1714 char *tbuffer; 1544 int disable;
1715 1545
1716 if (tracing_disabled || tracing_selftest_running) 1546 if (tracing_disabled || tracing_selftest_running)
1717 return 0; 1547 return 0;
1718 1548
1719 /* Don't pollute graph traces with trace_vprintk internals */
1720 pause_graph_tracing();
1721
1722 pc = preempt_count(); 1549 pc = preempt_count();
1723 preempt_disable_notrace(); 1550 preempt_disable_notrace();
1551 cpu = raw_smp_processor_id();
1552 data = tr->data[cpu];
1724 1553
1725 1554 disable = atomic_inc_return(&data->disabled);
1726 tbuffer = get_trace_buf(); 1555 if (unlikely(disable != 1))
1727 if (!tbuffer) {
1728 len = 0;
1729 goto out; 1556 goto out;
1730 }
1731 1557
1732 len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); 1558 pause_graph_tracing();
1733 if (len > TRACE_BUF_SIZE) 1559 raw_local_irq_save(irq_flags);
1734 goto out; 1560 arch_spin_lock(&trace_buf_lock);
1561 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1735 1562
1736 local_save_flags(flags);
1737 size = sizeof(*entry) + len + 1; 1563 size = sizeof(*entry) + len + 1;
1738 buffer = tr->buffer; 1564 buffer = tr->buffer;
1739 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1565 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1740 flags, pc); 1566 irq_flags, pc);
1741 if (!event) 1567 if (!event)
1742 goto out; 1568 goto out_unlock;
1743 entry = ring_buffer_event_data(event); 1569 entry = ring_buffer_event_data(event);
1744 entry->ip = ip; 1570 entry->ip = ip;
1745 1571
1746 memcpy(&entry->buf, tbuffer, len); 1572 memcpy(&entry->buf, trace_buf, len);
1747 entry->buf[len] = '\0'; 1573 entry->buf[len] = '\0';
1748 if (!filter_check_discard(call, entry, buffer, event)) { 1574 if (!filter_check_discard(call, entry, buffer, event)) {
1749 __buffer_unlock_commit(buffer, event); 1575 ring_buffer_unlock_commit(buffer, event);
1750 ftrace_trace_stack(buffer, flags, 6, pc); 1576 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1751 } 1577 }
1578
1579 out_unlock:
1580 arch_spin_unlock(&trace_buf_lock);
1581 raw_local_irq_restore(irq_flags);
1582 unpause_graph_tracing();
1752 out: 1583 out:
1584 atomic_dec_return(&data->disabled);
1753 preempt_enable_notrace(); 1585 preempt_enable_notrace();
1754 unpause_graph_tracing();
1755 1586
1756 return len; 1587 return len;
1757} 1588}
@@ -1764,11 +1595,14 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
1764 1595
1765static void trace_iterator_increment(struct trace_iterator *iter) 1596static void trace_iterator_increment(struct trace_iterator *iter)
1766{ 1597{
1767 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); 1598 /* Don't allow ftrace to trace into the ring buffers */
1599 ftrace_disable_cpu();
1768 1600
1769 iter->idx++; 1601 iter->idx++;
1770 if (buf_iter) 1602 if (iter->buffer_iter[iter->cpu])
1771 ring_buffer_read(buf_iter, NULL); 1603 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1604
1605 ftrace_enable_cpu();
1772} 1606}
1773 1607
1774static struct trace_entry * 1608static struct trace_entry *
@@ -1776,7 +1610,10 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1776 unsigned long *lost_events) 1610 unsigned long *lost_events)
1777{ 1611{
1778 struct ring_buffer_event *event; 1612 struct ring_buffer_event *event;
1779 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); 1613 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
1614
1615 /* Don't allow ftrace to trace into the ring buffers */
1616 ftrace_disable_cpu();
1780 1617
1781 if (buf_iter) 1618 if (buf_iter)
1782 event = ring_buffer_iter_peek(buf_iter, ts); 1619 event = ring_buffer_iter_peek(buf_iter, ts);
@@ -1784,6 +1621,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1784 event = ring_buffer_peek(iter->tr->buffer, cpu, ts, 1621 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1785 lost_events); 1622 lost_events);
1786 1623
1624 ftrace_enable_cpu();
1625
1787 if (event) { 1626 if (event) {
1788 iter->ent_size = ring_buffer_event_length(event); 1627 iter->ent_size = ring_buffer_event_length(event);
1789 return ring_buffer_event_data(event); 1628 return ring_buffer_event_data(event);
@@ -1802,7 +1641,6 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1802 int cpu_file = iter->cpu_file; 1641 int cpu_file = iter->cpu_file;
1803 u64 next_ts = 0, ts; 1642 u64 next_ts = 0, ts;
1804 int next_cpu = -1; 1643 int next_cpu = -1;
1805 int next_size = 0;
1806 int cpu; 1644 int cpu;
1807 1645
1808 /* 1646 /*
@@ -1834,12 +1672,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1834 next_cpu = cpu; 1672 next_cpu = cpu;
1835 next_ts = ts; 1673 next_ts = ts;
1836 next_lost = lost_events; 1674 next_lost = lost_events;
1837 next_size = iter->ent_size;
1838 } 1675 }
1839 } 1676 }
1840 1677
1841 iter->ent_size = next_size;
1842
1843 if (ent_cpu) 1678 if (ent_cpu)
1844 *ent_cpu = next_cpu; 1679 *ent_cpu = next_cpu;
1845 1680
@@ -1873,8 +1708,11 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
1873 1708
1874static void trace_consume(struct trace_iterator *iter) 1709static void trace_consume(struct trace_iterator *iter)
1875{ 1710{
1711 /* Don't allow ftrace to trace into the ring buffers */
1712 ftrace_disable_cpu();
1876 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, 1713 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1877 &iter->lost_events); 1714 &iter->lost_events);
1715 ftrace_enable_cpu();
1878} 1716}
1879 1717
1880static void *s_next(struct seq_file *m, void *v, loff_t *pos) 1718static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -1914,10 +1752,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1914 1752
1915 tr->data[cpu]->skipped_entries = 0; 1753 tr->data[cpu]->skipped_entries = 0;
1916 1754
1917 buf_iter = trace_buffer_iter(iter, cpu); 1755 if (!iter->buffer_iter[cpu])
1918 if (!buf_iter)
1919 return; 1756 return;
1920 1757
1758 buf_iter = iter->buffer_iter[cpu];
1921 ring_buffer_iter_reset(buf_iter); 1759 ring_buffer_iter_reset(buf_iter);
1922 1760
1923 /* 1761 /*
@@ -1963,12 +1801,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1963 iter->cpu = 0; 1801 iter->cpu = 0;
1964 iter->idx = -1; 1802 iter->idx = -1;
1965 1803
1804 ftrace_disable_cpu();
1805
1966 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1806 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1967 for_each_tracing_cpu(cpu) 1807 for_each_tracing_cpu(cpu)
1968 tracing_iter_reset(iter, cpu); 1808 tracing_iter_reset(iter, cpu);
1969 } else 1809 } else
1970 tracing_iter_reset(iter, cpu_file); 1810 tracing_iter_reset(iter, cpu_file);
1971 1811
1812 ftrace_enable_cpu();
1813
1972 iter->leftover = 0; 1814 iter->leftover = 0;
1973 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1815 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1974 ; 1816 ;
@@ -2000,33 +1842,6 @@ static void s_stop(struct seq_file *m, void *p)
2000 trace_event_read_unlock(); 1842 trace_event_read_unlock();
2001} 1843}
2002 1844
2003static void
2004get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
2005{
2006 unsigned long count;
2007 int cpu;
2008
2009 *total = 0;
2010 *entries = 0;
2011
2012 for_each_tracing_cpu(cpu) {
2013 count = ring_buffer_entries_cpu(tr->buffer, cpu);
2014 /*
2015 * If this buffer has skipped entries, then we hold all
2016 * entries for the trace and we need to ignore the
2017 * ones before the time stamp.
2018 */
2019 if (tr->data[cpu]->skipped_entries) {
2020 count -= tr->data[cpu]->skipped_entries;
2021 /* total is the same as the entries */
2022 *total += count;
2023 } else
2024 *total += count +
2025 ring_buffer_overrun_cpu(tr->buffer, cpu);
2026 *entries += count;
2027 }
2028}
2029
2030static void print_lat_help_header(struct seq_file *m) 1845static void print_lat_help_header(struct seq_file *m)
2031{ 1846{
2032 seq_puts(m, "# _------=> CPU# \n"); 1847 seq_puts(m, "# _------=> CPU# \n");
@@ -2039,35 +1854,12 @@ static void print_lat_help_header(struct seq_file *m)
2039 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1854 seq_puts(m, "# \\ / ||||| \\ | / \n");
2040} 1855}
2041 1856
2042static void print_event_info(struct trace_array *tr, struct seq_file *m) 1857static void print_func_help_header(struct seq_file *m)
2043{
2044 unsigned long total;
2045 unsigned long entries;
2046
2047 get_total_entries(tr, &total, &entries);
2048 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
2049 entries, total, num_online_cpus());
2050 seq_puts(m, "#\n");
2051}
2052
2053static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
2054{ 1858{
2055 print_event_info(tr, m); 1859 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
2056 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
2057 seq_puts(m, "# | | | | |\n"); 1860 seq_puts(m, "# | | | | |\n");
2058} 1861}
2059 1862
2060static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
2061{
2062 print_event_info(tr, m);
2063 seq_puts(m, "# _-----=> irqs-off\n");
2064 seq_puts(m, "# / _----=> need-resched\n");
2065 seq_puts(m, "# | / _---=> hardirq/softirq\n");
2066 seq_puts(m, "# || / _--=> preempt-depth\n");
2067 seq_puts(m, "# ||| / delay\n");
2068 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
2069 seq_puts(m, "# | | | |||| | |\n");
2070}
2071 1863
2072void 1864void
2073print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1865print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@ -2076,14 +1868,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2076 struct trace_array *tr = iter->tr; 1868 struct trace_array *tr = iter->tr;
2077 struct trace_array_cpu *data = tr->data[tr->cpu]; 1869 struct trace_array_cpu *data = tr->data[tr->cpu];
2078 struct tracer *type = current_trace; 1870 struct tracer *type = current_trace;
2079 unsigned long entries; 1871 unsigned long entries = 0;
2080 unsigned long total; 1872 unsigned long total = 0;
1873 unsigned long count;
2081 const char *name = "preemption"; 1874 const char *name = "preemption";
1875 int cpu;
2082 1876
2083 if (type) 1877 if (type)
2084 name = type->name; 1878 name = type->name;
2085 1879
2086 get_total_entries(tr, &total, &entries); 1880
1881 for_each_tracing_cpu(cpu) {
1882 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1883 /*
1884 * If this buffer has skipped entries, then we hold all
1885 * entries for the trace and we need to ignore the
1886 * ones before the time stamp.
1887 */
1888 if (tr->data[cpu]->skipped_entries) {
1889 count -= tr->data[cpu]->skipped_entries;
1890 /* total is the same as the entries */
1891 total += count;
1892 } else
1893 total += count +
1894 ring_buffer_overrun_cpu(tr->buffer, cpu);
1895 entries += count;
1896 }
2087 1897
2088 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1898 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
2089 name, UTS_RELEASE); 1899 name, UTS_RELEASE);
@@ -2114,8 +1924,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2114 seq_puts(m, "# -----------------\n"); 1924 seq_puts(m, "# -----------------\n");
2115 seq_printf(m, "# | task: %.16s-%d " 1925 seq_printf(m, "# | task: %.16s-%d "
2116 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", 1926 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
2117 data->comm, data->pid, 1927 data->comm, data->pid, data->uid, data->nice,
2118 from_kuid_munged(seq_user_ns(m), data->uid), data->nice,
2119 data->policy, data->rt_priority); 1928 data->policy, data->rt_priority);
2120 seq_puts(m, "# -----------------\n"); 1929 seq_puts(m, "# -----------------\n");
2121 1930
@@ -2264,15 +2073,13 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2264 2073
2265int trace_empty(struct trace_iterator *iter) 2074int trace_empty(struct trace_iterator *iter)
2266{ 2075{
2267 struct ring_buffer_iter *buf_iter;
2268 int cpu; 2076 int cpu;
2269 2077
2270 /* If we are looking at one CPU buffer, only check that one */ 2078 /* If we are looking at one CPU buffer, only check that one */
2271 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { 2079 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
2272 cpu = iter->cpu_file; 2080 cpu = iter->cpu_file;
2273 buf_iter = trace_buffer_iter(iter, cpu); 2081 if (iter->buffer_iter[cpu]) {
2274 if (buf_iter) { 2082 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
2275 if (!ring_buffer_iter_empty(buf_iter))
2276 return 0; 2083 return 0;
2277 } else { 2084 } else {
2278 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2085 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2282,9 +2089,8 @@ int trace_empty(struct trace_iterator *iter)
2282 } 2089 }
2283 2090
2284 for_each_tracing_cpu(cpu) { 2091 for_each_tracing_cpu(cpu) {
2285 buf_iter = trace_buffer_iter(iter, cpu); 2092 if (iter->buffer_iter[cpu]) {
2286 if (buf_iter) { 2093 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
2287 if (!ring_buffer_iter_empty(buf_iter))
2288 return 0; 2094 return 0;
2289 } else { 2095 } else {
2290 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2096 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2333,21 +2139,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2333 return print_trace_fmt(iter); 2139 return print_trace_fmt(iter);
2334} 2140}
2335 2141
2336void trace_latency_header(struct seq_file *m)
2337{
2338 struct trace_iterator *iter = m->private;
2339
2340 /* print nothing if the buffers are empty */
2341 if (trace_empty(iter))
2342 return;
2343
2344 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2345 print_trace_header(m, iter);
2346
2347 if (!(trace_flags & TRACE_ITER_VERBOSE))
2348 print_lat_help_header(m);
2349}
2350
2351void trace_default_header(struct seq_file *m) 2142void trace_default_header(struct seq_file *m)
2352{ 2143{
2353 struct trace_iterator *iter = m->private; 2144 struct trace_iterator *iter = m->private;
@@ -2363,23 +2154,11 @@ void trace_default_header(struct seq_file *m)
2363 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2154 if (!(trace_flags & TRACE_ITER_VERBOSE))
2364 print_lat_help_header(m); 2155 print_lat_help_header(m);
2365 } else { 2156 } else {
2366 if (!(trace_flags & TRACE_ITER_VERBOSE)) { 2157 if (!(trace_flags & TRACE_ITER_VERBOSE))
2367 if (trace_flags & TRACE_ITER_IRQ_INFO) 2158 print_func_help_header(m);
2368 print_func_help_header_irq(iter->tr, m);
2369 else
2370 print_func_help_header(iter->tr, m);
2371 }
2372 } 2159 }
2373} 2160}
2374 2161
2375static void test_ftrace_alive(struct seq_file *m)
2376{
2377 if (!ftrace_is_dead())
2378 return;
2379 seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
2380 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
2381}
2382
2383static int s_show(struct seq_file *m, void *v) 2162static int s_show(struct seq_file *m, void *v)
2384{ 2163{
2385 struct trace_iterator *iter = v; 2164 struct trace_iterator *iter = v;
@@ -2389,7 +2168,6 @@ static int s_show(struct seq_file *m, void *v)
2389 if (iter->tr) { 2168 if (iter->tr) {
2390 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2169 seq_printf(m, "# tracer: %s\n", iter->trace->name);
2391 seq_puts(m, "#\n"); 2170 seq_puts(m, "#\n");
2392 test_ftrace_alive(m);
2393 } 2171 }
2394 if (iter->trace && iter->trace->print_header) 2172 if (iter->trace && iter->trace->print_header)
2395 iter->trace->print_header(m); 2173 iter->trace->print_header(m);
@@ -2433,21 +2211,18 @@ static struct trace_iterator *
2433__tracing_open(struct inode *inode, struct file *file) 2211__tracing_open(struct inode *inode, struct file *file)
2434{ 2212{
2435 long cpu_file = (long) inode->i_private; 2213 long cpu_file = (long) inode->i_private;
2214 void *fail_ret = ERR_PTR(-ENOMEM);
2436 struct trace_iterator *iter; 2215 struct trace_iterator *iter;
2437 int cpu; 2216 struct seq_file *m;
2217 int cpu, ret;
2438 2218
2439 if (tracing_disabled) 2219 if (tracing_disabled)
2440 return ERR_PTR(-ENODEV); 2220 return ERR_PTR(-ENODEV);
2441 2221
2442 iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); 2222 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2443 if (!iter) 2223 if (!iter)
2444 return ERR_PTR(-ENOMEM); 2224 return ERR_PTR(-ENOMEM);
2445 2225
2446 iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
2447 GFP_KERNEL);
2448 if (!iter->buffer_iter)
2449 goto release;
2450
2451 /* 2226 /*
2452 * We make a copy of the current tracer to avoid concurrent 2227 * We make a copy of the current tracer to avoid concurrent
2453 * changes on it while we are reading. 2228 * changes on it while we are reading.
@@ -2479,10 +2254,6 @@ __tracing_open(struct inode *inode, struct file *file)
2479 if (ring_buffer_overruns(iter->tr->buffer)) 2254 if (ring_buffer_overruns(iter->tr->buffer))
2480 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2255 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2481 2256
2482 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
2483 if (trace_clocks[trace_clock_id].in_ns)
2484 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2485
2486 /* stop the trace while dumping */ 2257 /* stop the trace while dumping */
2487 tracing_stop(); 2258 tracing_stop();
2488 2259
@@ -2505,17 +2276,32 @@ __tracing_open(struct inode *inode, struct file *file)
2505 tracing_iter_reset(iter, cpu); 2276 tracing_iter_reset(iter, cpu);
2506 } 2277 }
2507 2278
2279 ret = seq_open(file, &tracer_seq_ops);
2280 if (ret < 0) {
2281 fail_ret = ERR_PTR(ret);
2282 goto fail_buffer;
2283 }
2284
2285 m = file->private_data;
2286 m->private = iter;
2287
2508 mutex_unlock(&trace_types_lock); 2288 mutex_unlock(&trace_types_lock);
2509 2289
2510 return iter; 2290 return iter;
2511 2291
2292 fail_buffer:
2293 for_each_tracing_cpu(cpu) {
2294 if (iter->buffer_iter[cpu])
2295 ring_buffer_read_finish(iter->buffer_iter[cpu]);
2296 }
2297 free_cpumask_var(iter->started);
2298 tracing_start();
2512 fail: 2299 fail:
2513 mutex_unlock(&trace_types_lock); 2300 mutex_unlock(&trace_types_lock);
2514 kfree(iter->trace); 2301 kfree(iter->trace);
2515 kfree(iter->buffer_iter); 2302 kfree(iter);
2516release: 2303
2517 seq_release_private(inode, file); 2304 return fail_ret;
2518 return ERR_PTR(-ENOMEM);
2519} 2305}
2520 2306
2521int tracing_open_generic(struct inode *inode, struct file *filp) 2307int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -2551,11 +2337,11 @@ static int tracing_release(struct inode *inode, struct file *file)
2551 tracing_start(); 2337 tracing_start();
2552 mutex_unlock(&trace_types_lock); 2338 mutex_unlock(&trace_types_lock);
2553 2339
2340 seq_release(inode, file);
2554 mutex_destroy(&iter->mutex); 2341 mutex_destroy(&iter->mutex);
2555 free_cpumask_var(iter->started); 2342 free_cpumask_var(iter->started);
2556 kfree(iter->trace); 2343 kfree(iter->trace);
2557 kfree(iter->buffer_iter); 2344 kfree(iter);
2558 seq_release_private(inode, file);
2559 return 0; 2345 return 0;
2560} 2346}
2561 2347
@@ -2741,12 +2527,10 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2741 if (cpumask_test_cpu(cpu, tracing_cpumask) && 2527 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2742 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2528 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2743 atomic_inc(&global_trace.data[cpu]->disabled); 2529 atomic_inc(&global_trace.data[cpu]->disabled);
2744 ring_buffer_record_disable_cpu(global_trace.buffer, cpu);
2745 } 2530 }
2746 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 2531 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2747 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2532 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2748 atomic_dec(&global_trace.data[cpu]->disabled); 2533 atomic_dec(&global_trace.data[cpu]->disabled);
2749 ring_buffer_record_enable_cpu(global_trace.buffer, cpu);
2750 } 2534 }
2751 } 2535 }
2752 arch_spin_unlock(&ftrace_max_lock); 2536 arch_spin_unlock(&ftrace_max_lock);
@@ -2851,19 +2635,26 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2851 2635
2852 if (mask == TRACE_ITER_OVERWRITE) 2636 if (mask == TRACE_ITER_OVERWRITE)
2853 ring_buffer_change_overwrite(global_trace.buffer, enabled); 2637 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2854
2855 if (mask == TRACE_ITER_PRINTK)
2856 trace_printk_start_stop_comm(enabled);
2857} 2638}
2858 2639
2859static int trace_set_options(char *option) 2640static ssize_t
2641tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2642 size_t cnt, loff_t *ppos)
2860{ 2643{
2644 char buf[64];
2861 char *cmp; 2645 char *cmp;
2862 int neg = 0; 2646 int neg = 0;
2863 int ret = 0; 2647 int ret;
2864 int i; 2648 int i;
2865 2649
2866 cmp = strstrip(option); 2650 if (cnt >= sizeof(buf))
2651 return -EINVAL;
2652
2653 if (copy_from_user(&buf, ubuf, cnt))
2654 return -EFAULT;
2655
2656 buf[cnt] = 0;
2657 cmp = strstrip(buf);
2867 2658
2868 if (strncmp(cmp, "no", 2) == 0) { 2659 if (strncmp(cmp, "no", 2) == 0) {
2869 neg = 1; 2660 neg = 1;
@@ -2882,27 +2673,10 @@ static int trace_set_options(char *option)
2882 mutex_lock(&trace_types_lock); 2673 mutex_lock(&trace_types_lock);
2883 ret = set_tracer_option(current_trace, cmp, neg); 2674 ret = set_tracer_option(current_trace, cmp, neg);
2884 mutex_unlock(&trace_types_lock); 2675 mutex_unlock(&trace_types_lock);
2676 if (ret)
2677 return ret;
2885 } 2678 }
2886 2679
2887 return ret;
2888}
2889
2890static ssize_t
2891tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2892 size_t cnt, loff_t *ppos)
2893{
2894 char buf[64];
2895
2896 if (cnt >= sizeof(buf))
2897 return -EINVAL;
2898
2899 if (copy_from_user(&buf, ubuf, cnt))
2900 return -EFAULT;
2901
2902 buf[cnt] = 0;
2903
2904 trace_set_options(buf);
2905
2906 *ppos += cnt; 2680 *ppos += cnt;
2907 2681
2908 return cnt; 2682 return cnt;
@@ -2927,18 +2701,18 @@ static const char readme_msg[] =
2927 "tracing mini-HOWTO:\n\n" 2701 "tracing mini-HOWTO:\n\n"
2928 "# mount -t debugfs nodev /sys/kernel/debug\n\n" 2702 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2929 "# cat /sys/kernel/debug/tracing/available_tracers\n" 2703 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2930 "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" 2704 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2931 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2705 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2932 "nop\n" 2706 "nop\n"
2933 "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" 2707 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
2934 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2708 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2935 "wakeup\n" 2709 "sched_switch\n"
2936 "# cat /sys/kernel/debug/tracing/trace_options\n" 2710 "# cat /sys/kernel/debug/tracing/trace_options\n"
2937 "noprint-parent nosym-offset nosym-addr noverbose\n" 2711 "noprint-parent nosym-offset nosym-addr noverbose\n"
2938 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 2712 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2939 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" 2713 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
2940 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 2714 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2941 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" 2715 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
2942; 2716;
2943 2717
2944static ssize_t 2718static ssize_t
@@ -3007,6 +2781,56 @@ static const struct file_operations tracing_saved_cmdlines_fops = {
3007}; 2781};
3008 2782
3009static ssize_t 2783static ssize_t
2784tracing_ctrl_read(struct file *filp, char __user *ubuf,
2785 size_t cnt, loff_t *ppos)
2786{
2787 char buf[64];
2788 int r;
2789
2790 r = sprintf(buf, "%u\n", tracer_enabled);
2791 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2792}
2793
2794static ssize_t
2795tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2796 size_t cnt, loff_t *ppos)
2797{
2798 struct trace_array *tr = filp->private_data;
2799 unsigned long val;
2800 int ret;
2801
2802 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2803 if (ret)
2804 return ret;
2805
2806 val = !!val;
2807
2808 mutex_lock(&trace_types_lock);
2809 if (tracer_enabled ^ val) {
2810
2811 /* Only need to warn if this is used to change the state */
2812 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2813
2814 if (val) {
2815 tracer_enabled = 1;
2816 if (current_trace->start)
2817 current_trace->start(tr);
2818 tracing_start();
2819 } else {
2820 tracer_enabled = 0;
2821 tracing_stop();
2822 if (current_trace->stop)
2823 current_trace->stop(tr);
2824 }
2825 }
2826 mutex_unlock(&trace_types_lock);
2827
2828 *ppos += cnt;
2829
2830 return cnt;
2831}
2832
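
The tracing_ctrl_read()/tracing_ctrl_write() pair added back here implements the debugfs file tracing_enabled, which this version keeps even though the write handler itself warns that it is deprecated in favour of tracing_on. A small userspace sketch of toggling it, assuming debugfs is mounted at /sys/kernel/debug and the caller has the required privileges:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int set_tracing_enabled(int on)
{
        int fd = open("/sys/kernel/debug/tracing/tracing_enabled", O_WRONLY);

        if (fd < 0) {
                perror("open tracing_enabled");
                return -1;
        }
        if (write(fd, on ? "1" : "0", 1) != 1)
                perror("write tracing_enabled");
        close(fd);
        return 0;
}

int main(void)
{
        return set_tracing_enabled(1) ? 1 : 0;
}
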
2833static ssize_t
3010tracing_set_trace_read(struct file *filp, char __user *ubuf, 2834tracing_set_trace_read(struct file *filp, char __user *ubuf,
3011 size_t cnt, loff_t *ppos) 2835 size_t cnt, loff_t *ppos)
3012{ 2836{
@@ -3029,39 +2853,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
3029 return t->init(tr); 2853 return t->init(tr);
3030} 2854}
3031 2855
3032static void set_buffer_entries(struct trace_array *tr, unsigned long val) 2856static int __tracing_resize_ring_buffer(unsigned long size)
3033{
3034 int cpu;
3035 for_each_tracing_cpu(cpu)
3036 tr->data[cpu]->entries = val;
3037}
3038
3039/* resize @tr's buffer to the size of @size_tr's entries */
3040static int resize_buffer_duplicate_size(struct trace_array *tr,
3041 struct trace_array *size_tr, int cpu_id)
3042{
3043 int cpu, ret = 0;
3044
3045 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3046 for_each_tracing_cpu(cpu) {
3047 ret = ring_buffer_resize(tr->buffer,
3048 size_tr->data[cpu]->entries, cpu);
3049 if (ret < 0)
3050 break;
3051 tr->data[cpu]->entries = size_tr->data[cpu]->entries;
3052 }
3053 } else {
3054 ret = ring_buffer_resize(tr->buffer,
3055 size_tr->data[cpu_id]->entries, cpu_id);
3056 if (ret == 0)
3057 tr->data[cpu_id]->entries =
3058 size_tr->data[cpu_id]->entries;
3059 }
3060
3061 return ret;
3062}
3063
3064static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3065{ 2857{
3066 int ret; 2858 int ret;
3067 2859
@@ -3072,21 +2864,19 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3072 */ 2864 */
3073 ring_buffer_expanded = 1; 2865 ring_buffer_expanded = 1;
3074 2866
3075 /* May be called before buffers are initialized */ 2867 ret = ring_buffer_resize(global_trace.buffer, size);
3076 if (!global_trace.buffer)
3077 return 0;
3078
3079 ret = ring_buffer_resize(global_trace.buffer, size, cpu);
3080 if (ret < 0) 2868 if (ret < 0)
3081 return ret; 2869 return ret;
3082 2870
3083 if (!current_trace->use_max_tr) 2871 if (!current_trace->use_max_tr)
3084 goto out; 2872 goto out;
3085 2873
3086 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 2874 ret = ring_buffer_resize(max_tr.buffer, size);
3087 if (ret < 0) { 2875 if (ret < 0) {
3088 int r = resize_buffer_duplicate_size(&global_trace, 2876 int r;
3089 &global_trace, cpu); 2877
2878 r = ring_buffer_resize(global_trace.buffer,
2879 global_trace.entries);
3090 if (r < 0) { 2880 if (r < 0) {
3091 /* 2881 /*
3092 * AARGH! We are left with different 2882 * AARGH! We are left with different
@@ -3108,39 +2898,43 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3108 return ret; 2898 return ret;
3109 } 2899 }
3110 2900
3111 if (cpu == RING_BUFFER_ALL_CPUS) 2901 max_tr.entries = size;
3112 set_buffer_entries(&max_tr, size);
3113 else
3114 max_tr.data[cpu]->entries = size;
3115
3116 out: 2902 out:
3117 if (cpu == RING_BUFFER_ALL_CPUS) 2903 global_trace.entries = size;
3118 set_buffer_entries(&global_trace, size);
3119 else
3120 global_trace.data[cpu]->entries = size;
3121 2904
3122 return ret; 2905 return ret;
3123} 2906}
3124 2907
3125static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) 2908static ssize_t tracing_resize_ring_buffer(unsigned long size)
3126{ 2909{
3127 int ret = size; 2910 int cpu, ret = size;
3128 2911
3129 mutex_lock(&trace_types_lock); 2912 mutex_lock(&trace_types_lock);
3130 2913
3131 if (cpu_id != RING_BUFFER_ALL_CPUS) { 2914 tracing_stop();
3132 /* make sure this cpu is enabled in the mask */ 2915
3133 if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { 2916 /* disable all cpu buffers */
3134 ret = -EINVAL; 2917 for_each_tracing_cpu(cpu) {
3135 goto out; 2918 if (global_trace.data[cpu])
3136 } 2919 atomic_inc(&global_trace.data[cpu]->disabled);
2920 if (max_tr.data[cpu])
2921 atomic_inc(&max_tr.data[cpu]->disabled);
3137 } 2922 }
3138 2923
3139 ret = __tracing_resize_ring_buffer(size, cpu_id); 2924 if (size != global_trace.entries)
2925 ret = __tracing_resize_ring_buffer(size);
2926
3140 if (ret < 0) 2927 if (ret < 0)
3141 ret = -ENOMEM; 2928 ret = -ENOMEM;
3142 2929
3143out: 2930 for_each_tracing_cpu(cpu) {
2931 if (global_trace.data[cpu])
2932 atomic_dec(&global_trace.data[cpu]->disabled);
2933 if (max_tr.data[cpu])
2934 atomic_dec(&max_tr.data[cpu]->disabled);
2935 }
2936
2937 tracing_start();
3144 mutex_unlock(&trace_types_lock); 2938 mutex_unlock(&trace_types_lock);
3145 2939
3146 return ret; 2940 return ret;
@@ -3163,8 +2957,7 @@ int tracing_update_buffers(void)
3163 2957
3164 mutex_lock(&trace_types_lock); 2958 mutex_lock(&trace_types_lock);
3165 if (!ring_buffer_expanded) 2959 if (!ring_buffer_expanded)
3166 ret = __tracing_resize_ring_buffer(trace_buf_size, 2960 ret = __tracing_resize_ring_buffer(trace_buf_size);
3167 RING_BUFFER_ALL_CPUS);
3168 mutex_unlock(&trace_types_lock); 2961 mutex_unlock(&trace_types_lock);
3169 2962
3170 return ret; 2963 return ret;
@@ -3188,8 +2981,7 @@ static int tracing_set_tracer(const char *buf)
3188 mutex_lock(&trace_types_lock); 2981 mutex_lock(&trace_types_lock);
3189 2982
3190 if (!ring_buffer_expanded) { 2983 if (!ring_buffer_expanded) {
3191 ret = __tracing_resize_ring_buffer(trace_buf_size, 2984 ret = __tracing_resize_ring_buffer(trace_buf_size);
3192 RING_BUFFER_ALL_CPUS);
3193 if (ret < 0) 2985 if (ret < 0)
3194 goto out; 2986 goto out;
3195 ret = 0; 2987 ret = 0;
@@ -3215,20 +3007,19 @@ static int tracing_set_tracer(const char *buf)
3215 * The max_tr ring buffer has some state (e.g. ring->clock) and 3007 * The max_tr ring buffer has some state (e.g. ring->clock) and
3216 * we want to preserve it. 3008
3217 */ 3009 */
3218 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); 3010 ring_buffer_resize(max_tr.buffer, 1);
3219 set_buffer_entries(&max_tr, 1); 3011 max_tr.entries = 1;
3220 } 3012 }
3221 destroy_trace_option_files(topts); 3013 destroy_trace_option_files(topts);
3222 3014
3223 current_trace = &nop_trace; 3015 current_trace = t;
3224 3016
3225 topts = create_trace_option_files(t); 3017 topts = create_trace_option_files(current_trace);
3226 if (t->use_max_tr) { 3018 if (current_trace->use_max_tr) {
3227 /* we need to make per cpu buffer sizes equivalent */ 3019 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
3228 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3229 RING_BUFFER_ALL_CPUS);
3230 if (ret < 0) 3020 if (ret < 0)
3231 goto out; 3021 goto out;
3022 max_tr.entries = global_trace.entries;
3232 } 3023 }
3233 3024
3234 if (t->init) { 3025 if (t->init) {
@@ -3237,7 +3028,6 @@ static int tracing_set_tracer(const char *buf)
3237 goto out; 3028 goto out;
3238 } 3029 }
3239 3030
3240 current_trace = t;
3241 trace_branch_enable(tr); 3031 trace_branch_enable(tr);
3242 out: 3032 out:
3243 mutex_unlock(&trace_types_lock); 3033 mutex_unlock(&trace_types_lock);
@@ -3350,10 +3140,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3350 if (trace_flags & TRACE_ITER_LATENCY_FMT) 3140 if (trace_flags & TRACE_ITER_LATENCY_FMT)
3351 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3141 iter->iter_flags |= TRACE_FILE_LAT_FMT;
3352 3142
3353 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
3354 if (trace_clocks[trace_clock_id].in_ns)
3355 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3356
3357 iter->cpu_file = cpu_file; 3143 iter->cpu_file = cpu_file;
3358 iter->tr = &global_trace; 3144 iter->tr = &global_trace;
3359 mutex_init(&iter->mutex); 3145 mutex_init(&iter->mutex);
@@ -3414,6 +3200,19 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3414 } 3200 }
3415} 3201}
3416 3202
3203
3204void default_wait_pipe(struct trace_iterator *iter)
3205{
3206 DEFINE_WAIT(wait);
3207
3208 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
3209
3210 if (trace_empty(iter))
3211 schedule();
3212
3213 finish_wait(&trace_wait, &wait);
3214}
3215
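
default_wait_pipe(), restored just above, is the usual sleep-until-data pattern: register on the wait queue with prepare_to_wait(), re-check trace_empty(), and only then schedule(), so a wake-up arriving between the check and the sleep cannot be lost. A userspace analogue with POSIX condition variables (hypothetical names; the while loop is the standard condvar idiom that plays the same role as the prepare/check/sleep sequence):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool have_data;

static void wait_for_data(void)                 /* default_wait_pipe() */
{
        pthread_mutex_lock(&lock);
        while (!have_data)                      /* trace_empty() check */
                pthread_cond_wait(&cond, &lock);        /* schedule() */
        pthread_mutex_unlock(&lock);
}

static void data_arrived(void)                  /* the producer's wake-up */
{
        pthread_mutex_lock(&lock);
        have_data = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
}

static void *producer(void *arg)
{
        (void)arg;
        data_arrived();
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, producer, NULL);
        wait_for_data();
        pthread_join(t, NULL);
        return 0;
}
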
3417/* 3216/*
3418 * This is a make-shift waitqueue. 3217 * This is a make-shift waitqueue.
3419 * A tracer might use this callback on some rare cases: 3218 * A tracer might use this callback on some rare cases:
@@ -3462,7 +3261,7 @@ static int tracing_wait_pipe(struct file *filp)
3462 * 3261 *
3463 * iter->pos will be 0 if we haven't read anything. 3262 * iter->pos will be 0 if we haven't read anything.
3464 */ 3263 */
3465 if (!tracing_is_enabled() && iter->pos) 3264 if (!tracer_enabled && iter->pos)
3466 break; 3265 break;
3467 } 3266 }
3468 3267
@@ -3643,7 +3442,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3643 .pages = pages_def, 3442 .pages = pages_def,
3644 .partial = partial_def, 3443 .partial = partial_def,
3645 .nr_pages = 0, /* This gets updated below. */ 3444 .nr_pages = 0, /* This gets updated below. */
3646 .nr_pages_max = PIPE_DEF_BUFFERS,
3647 .flags = flags, 3445 .flags = flags,
3648 .ops = &tracing_pipe_buf_ops, 3446 .ops = &tracing_pipe_buf_ops,
3649 .spd_release = tracing_spd_release_pipe, 3447 .spd_release = tracing_spd_release_pipe,
@@ -3715,7 +3513,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3715 3513
3716 ret = splice_to_pipe(pipe, &spd); 3514 ret = splice_to_pipe(pipe, &spd);
3717out: 3515out:
3718 splice_shrink_spd(&spd); 3516 splice_shrink_spd(pipe, &spd);
3719 return ret; 3517 return ret;
3720 3518
3721out_err: 3519out_err:
@@ -3723,82 +3521,30 @@ out_err:
3723 goto out; 3521 goto out;
3724} 3522}
3725 3523
3726struct ftrace_entries_info {
3727 struct trace_array *tr;
3728 int cpu;
3729};
3730
3731static int tracing_entries_open(struct inode *inode, struct file *filp)
3732{
3733 struct ftrace_entries_info *info;
3734
3735 if (tracing_disabled)
3736 return -ENODEV;
3737
3738 info = kzalloc(sizeof(*info), GFP_KERNEL);
3739 if (!info)
3740 return -ENOMEM;
3741
3742 info->tr = &global_trace;
3743 info->cpu = (unsigned long)inode->i_private;
3744
3745 filp->private_data = info;
3746
3747 return 0;
3748}
3749
3750static ssize_t 3524static ssize_t
3751tracing_entries_read(struct file *filp, char __user *ubuf, 3525tracing_entries_read(struct file *filp, char __user *ubuf,
3752 size_t cnt, loff_t *ppos) 3526 size_t cnt, loff_t *ppos)
3753{ 3527{
3754 struct ftrace_entries_info *info = filp->private_data; 3528 struct trace_array *tr = filp->private_data;
3755 struct trace_array *tr = info->tr; 3529 char buf[96];
3756 char buf[64]; 3530 int r;
3757 int r = 0;
3758 ssize_t ret;
3759 3531
3760 mutex_lock(&trace_types_lock); 3532 mutex_lock(&trace_types_lock);
3761 3533 if (!ring_buffer_expanded)
3762 if (info->cpu == RING_BUFFER_ALL_CPUS) { 3534 r = sprintf(buf, "%lu (expanded: %lu)\n",
3763 int cpu, buf_size_same; 3535 tr->entries >> 10,
3764 unsigned long size; 3536 trace_buf_size >> 10);
3765 3537 else
3766 size = 0; 3538 r = sprintf(buf, "%lu\n", tr->entries >> 10);
3767 buf_size_same = 1;
3768 /* check if all cpu sizes are same */
3769 for_each_tracing_cpu(cpu) {
3770 /* fill in the size from first enabled cpu */
3771 if (size == 0)
3772 size = tr->data[cpu]->entries;
3773 if (size != tr->data[cpu]->entries) {
3774 buf_size_same = 0;
3775 break;
3776 }
3777 }
3778
3779 if (buf_size_same) {
3780 if (!ring_buffer_expanded)
3781 r = sprintf(buf, "%lu (expanded: %lu)\n",
3782 size >> 10,
3783 trace_buf_size >> 10);
3784 else
3785 r = sprintf(buf, "%lu\n", size >> 10);
3786 } else
3787 r = sprintf(buf, "X\n");
3788 } else
3789 r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10);
3790
3791 mutex_unlock(&trace_types_lock); 3539 mutex_unlock(&trace_types_lock);
3792 3540
3793 ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3541 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3794 return ret;
3795} 3542}
3796 3543
3797static ssize_t 3544static ssize_t
3798tracing_entries_write(struct file *filp, const char __user *ubuf, 3545tracing_entries_write(struct file *filp, const char __user *ubuf,
3799 size_t cnt, loff_t *ppos) 3546 size_t cnt, loff_t *ppos)
3800{ 3547{
3801 struct ftrace_entries_info *info = filp->private_data;
3802 unsigned long val; 3548 unsigned long val;
3803 int ret; 3549 int ret;
3804 3550
@@ -3813,7 +3559,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3813 /* value is in KB */ 3559 /* value is in KB */
3814 val <<= 10; 3560 val <<= 10;
3815 3561
3816 ret = tracing_resize_ring_buffer(val, info->cpu); 3562 ret = tracing_resize_ring_buffer(val);
3817 if (ret < 0) 3563 if (ret < 0)
3818 return ret; 3564 return ret;
3819 3565
@@ -3822,40 +3568,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3822 return cnt; 3568 return cnt;
3823} 3569}
3824 3570
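
tracing_entries_write() above interprets what userspace writes to buffer_size_kb as a size in kilobytes and shifts it left by 10 before resizing the ring buffer; reading the file back may report "(expanded: N)" until the buffers have actually been expanded. A userspace sketch of driving that interface, where the path and the 4096 KB figure are only examples and debugfs is assumed at /sys/kernel/debug:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        char buf[96];
        ssize_t n;
        int fd = open("/sys/kernel/debug/tracing/buffer_size_kb", O_RDWR);

        if (fd < 0) {
                perror("open buffer_size_kb");
                return 1;
        }
        if (write(fd, "4096\n", 5) < 0)         /* request 4096 KB */
                perror("write buffer_size_kb");
        if (lseek(fd, 0, SEEK_SET) == 0 &&
            (n = read(fd, buf, sizeof(buf) - 1)) > 0) {
                buf[n] = '\0';
                printf("buffer_size_kb now reads: %s", buf);
        }
        close(fd);
        return 0;
}
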
3825static int
3826tracing_entries_release(struct inode *inode, struct file *filp)
3827{
3828 struct ftrace_entries_info *info = filp->private_data;
3829
3830 kfree(info);
3831
3832 return 0;
3833}
3834
3835static ssize_t
3836tracing_total_entries_read(struct file *filp, char __user *ubuf,
3837 size_t cnt, loff_t *ppos)
3838{
3839 struct trace_array *tr = filp->private_data;
3840 char buf[64];
3841 int r, cpu;
3842 unsigned long size = 0, expanded_size = 0;
3843
3844 mutex_lock(&trace_types_lock);
3845 for_each_tracing_cpu(cpu) {
3846 size += tr->data[cpu]->entries >> 10;
3847 if (!ring_buffer_expanded)
3848 expanded_size += trace_buf_size >> 10;
3849 }
3850 if (ring_buffer_expanded)
3851 r = sprintf(buf, "%lu\n", size);
3852 else
3853 r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
3854 mutex_unlock(&trace_types_lock);
3855
3856 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3857}
3858
3859static ssize_t 3571static ssize_t
3860tracing_free_buffer_write(struct file *filp, const char __user *ubuf, 3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3861 size_t cnt, loff_t *ppos) 3573 size_t cnt, loff_t *ppos)
@@ -3877,112 +3589,56 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
3877 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 3589 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3878 tracing_off(); 3590 tracing_off();
3879 /* resize the ring buffer to 0 */ 3591 /* resize the ring buffer to 0 */
3880 tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); 3592 tracing_resize_ring_buffer(0);
3881 3593
3882 return 0; 3594 return 0;
3883} 3595}
3884 3596
3597static int mark_printk(const char *fmt, ...)
3598{
3599 int ret;
3600 va_list args;
3601 va_start(args, fmt);
3602 ret = trace_vprintk(0, fmt, args);
3603 va_end(args);
3604 return ret;
3605}
3606
3885static ssize_t 3607static ssize_t
3886tracing_mark_write(struct file *filp, const char __user *ubuf, 3608tracing_mark_write(struct file *filp, const char __user *ubuf,
3887 size_t cnt, loff_t *fpos) 3609 size_t cnt, loff_t *fpos)
3888{ 3610{
3889 unsigned long addr = (unsigned long)ubuf; 3611 char *buf;
3890 struct ring_buffer_event *event; 3612 size_t written;
3891 struct ring_buffer *buffer;
3892 struct print_entry *entry;
3893 unsigned long irq_flags;
3894 struct page *pages[2];
3895 void *map_page[2];
3896 int nr_pages = 1;
3897 ssize_t written;
3898 int offset;
3899 int size;
3900 int len;
3901 int ret;
3902 int i;
3903 3613
3904 if (tracing_disabled) 3614 if (tracing_disabled)
3905 return -EINVAL; 3615 return -EINVAL;
3906 3616
3907 if (!(trace_flags & TRACE_ITER_MARKERS))
3908 return -EINVAL;
3909
3910 if (cnt > TRACE_BUF_SIZE) 3617 if (cnt > TRACE_BUF_SIZE)
3911 cnt = TRACE_BUF_SIZE; 3618 cnt = TRACE_BUF_SIZE;
3912 3619
3913 /* 3620 buf = kmalloc(cnt + 2, GFP_KERNEL);
3914 * Userspace is injecting traces into the kernel trace buffer. 3621 if (buf == NULL)
3915 * We want to be as non intrusive as possible. 3622 return -ENOMEM;
3916 * To do so, we do not want to allocate any special buffers
3917 * or take any locks, but instead write the userspace data
3918 * straight into the ring buffer.
3919 *
3920 * First we need to pin the userspace buffer into memory,
3921 * which it most likely already is, because the caller just referenced it.
3922 * But there's no guarantee that it is. By using get_user_pages_fast()
3923 * and kmap_atomic/kunmap_atomic() we can get access to the
3924 * pages directly. We then write the data directly into the
3925 * ring buffer.
3926 */
3927 BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
3928
3929 /* check if we cross pages */
3930 if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
3931 nr_pages = 2;
3932
3933 offset = addr & (PAGE_SIZE - 1);
3934 addr &= PAGE_MASK;
3935
3936 ret = get_user_pages_fast(addr, nr_pages, 0, pages);
3937 if (ret < nr_pages) {
3938 while (--ret >= 0)
3939 put_page(pages[ret]);
3940 written = -EFAULT;
3941 goto out;
3942 }
3943
3944 for (i = 0; i < nr_pages; i++)
3945 map_page[i] = kmap_atomic(pages[i]);
3946 3623
3947 local_save_flags(irq_flags); 3624 if (copy_from_user(buf, ubuf, cnt)) {
3948 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 3625 kfree(buf);
3949 buffer = global_trace.buffer; 3626 return -EFAULT;
3950 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
3951 irq_flags, preempt_count());
3952 if (!event) {
3953 /* Ring buffer disabled, return as if not open for write */
3954 written = -EBADF;
3955 goto out_unlock;
3956 } 3627 }
3957 3628 if (buf[cnt-1] != '\n') {
3958 entry = ring_buffer_event_data(event); 3629 buf[cnt] = '\n';
3959 entry->ip = _THIS_IP_; 3630 buf[cnt+1] = '\0';
3960
3961 if (nr_pages == 2) {
3962 len = PAGE_SIZE - offset;
3963 memcpy(&entry->buf, map_page[0] + offset, len);
3964 memcpy(&entry->buf[len], map_page[1], cnt - len);
3965 } else
3966 memcpy(&entry->buf, map_page[0] + offset, cnt);
3967
3968 if (entry->buf[cnt - 1] != '\n') {
3969 entry->buf[cnt] = '\n';
3970 entry->buf[cnt + 1] = '\0';
3971 } else 3631 } else
3972 entry->buf[cnt] = '\0'; 3632 buf[cnt] = '\0';
3973
3974 __buffer_unlock_commit(buffer, event);
3975
3976 written = cnt;
3977 3633
3634 written = mark_printk("%s", buf);
3635 kfree(buf);
3978 *fpos += written; 3636 *fpos += written;
3979 3637
3980 out_unlock: 3638 /* don't tell userspace we wrote more - it might confuse them */
3981 for (i = 0; i < nr_pages; i++){ 3639 if (written > cnt)
3982 kunmap_atomic(map_page[i]); 3640 written = cnt;
3983 put_page(pages[i]); 3641
3984 }
3985 out:
3986 return written; 3642 return written;
3987} 3643}
3988 3644
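
tracing_mark_write() services writes to the trace_marker file: the restored version copies the user string into a temporary kernel buffer and feeds it through trace_vprintk(), while the removed version pinned the user pages and wrote them straight into the ring buffer. From userspace the interface is the same either way; a minimal sketch, assuming debugfs at /sys/kernel/debug and sufficient privileges:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *msg = "hello from userspace\n";
        int fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);

        if (fd < 0) {
                perror("open trace_marker");
                return 1;
        }
        if (write(fd, msg, strlen(msg)) < 0)
                perror("write trace_marker");
        close(fd);
        return 0;
}
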
@@ -4032,14 +3688,6 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4032 if (max_tr.buffer) 3688 if (max_tr.buffer)
4033 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 3689 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
4034 3690
4035 /*
4036 * New clock may not be consistent with the previous clock.
4037 * Reset the buffer so that it doesn't have incomparable timestamps.
4038 */
4039 tracing_reset_online_cpus(&global_trace);
4040 if (max_tr.buffer)
4041 tracing_reset_online_cpus(&max_tr);
4042
4043 mutex_unlock(&trace_types_lock); 3691 mutex_unlock(&trace_types_lock);
4044 3692
4045 *fpos += cnt; 3693 *fpos += cnt;
@@ -4061,6 +3709,13 @@ static const struct file_operations tracing_max_lat_fops = {
4061 .llseek = generic_file_llseek, 3709 .llseek = generic_file_llseek,
4062}; 3710};
4063 3711
3712static const struct file_operations tracing_ctrl_fops = {
3713 .open = tracing_open_generic,
3714 .read = tracing_ctrl_read,
3715 .write = tracing_ctrl_write,
3716 .llseek = generic_file_llseek,
3717};
3718
4064static const struct file_operations set_tracer_fops = { 3719static const struct file_operations set_tracer_fops = {
4065 .open = tracing_open_generic, 3720 .open = tracing_open_generic,
4066 .read = tracing_set_trace_read, 3721 .read = tracing_set_trace_read,
@@ -4078,16 +3733,9 @@ static const struct file_operations tracing_pipe_fops = {
4078}; 3733};
4079 3734
4080static const struct file_operations tracing_entries_fops = { 3735static const struct file_operations tracing_entries_fops = {
4081 .open = tracing_entries_open, 3736 .open = tracing_open_generic,
4082 .read = tracing_entries_read, 3737 .read = tracing_entries_read,
4083 .write = tracing_entries_write, 3738 .write = tracing_entries_write,
4084 .release = tracing_entries_release,
4085 .llseek = generic_file_llseek,
4086};
4087
4088static const struct file_operations tracing_total_entries_fops = {
4089 .open = tracing_open_generic,
4090 .read = tracing_total_entries_read,
4091 .llseek = generic_file_llseek, 3739 .llseek = generic_file_llseek,
4092}; 3740};
4093 3741
@@ -4217,6 +3865,12 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
4217 buf->private = 0; 3865 buf->private = 0;
4218} 3866}
4219 3867
3868static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
3869 struct pipe_buffer *buf)
3870{
3871 return 1;
3872}
3873
4220static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, 3874static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
4221 struct pipe_buffer *buf) 3875 struct pipe_buffer *buf)
4222{ 3876{
@@ -4232,7 +3886,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
4232 .unmap = generic_pipe_buf_unmap, 3886 .unmap = generic_pipe_buf_unmap,
4233 .confirm = generic_pipe_buf_confirm, 3887 .confirm = generic_pipe_buf_confirm,
4234 .release = buffer_pipe_buf_release, 3888 .release = buffer_pipe_buf_release,
4235 .steal = generic_pipe_buf_steal, 3889 .steal = buffer_pipe_buf_steal,
4236 .get = buffer_pipe_buf_get, 3890 .get = buffer_pipe_buf_get,
4237}; 3891};
4238 3892
@@ -4264,7 +3918,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4264 struct splice_pipe_desc spd = { 3918 struct splice_pipe_desc spd = {
4265 .pages = pages_def, 3919 .pages = pages_def,
4266 .partial = partial_def, 3920 .partial = partial_def,
4267 .nr_pages_max = PIPE_DEF_BUFFERS,
4268 .flags = flags, 3921 .flags = flags,
4269 .ops = &buffer_pipe_buf_ops, 3922 .ops = &buffer_pipe_buf_ops,
4270 .spd_release = buffer_spd_release, 3923 .spd_release = buffer_spd_release,
@@ -4277,11 +3930,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4277 return -ENOMEM; 3930 return -ENOMEM;
4278 3931
4279 if (*ppos & (PAGE_SIZE - 1)) { 3932 if (*ppos & (PAGE_SIZE - 1)) {
3933 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
4280 ret = -EINVAL; 3934 ret = -EINVAL;
4281 goto out; 3935 goto out;
4282 } 3936 }
4283 3937
4284 if (len & (PAGE_SIZE - 1)) { 3938 if (len & (PAGE_SIZE - 1)) {
3939 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
4285 if (len < PAGE_SIZE) { 3940 if (len < PAGE_SIZE) {
4286 ret = -EINVAL; 3941 ret = -EINVAL;
4287 goto out; 3942 goto out;
@@ -4350,7 +4005,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4350 } 4005 }
4351 4006
4352 ret = splice_to_pipe(pipe, &spd); 4007 ret = splice_to_pipe(pipe, &spd);
4353 splice_shrink_spd(&spd); 4008 splice_shrink_spd(pipe, &spd);
4354out: 4009out:
4355 return ret; 4010 return ret;
4356} 4011}
@@ -4371,8 +4026,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4371 struct trace_array *tr = &global_trace; 4026 struct trace_array *tr = &global_trace;
4372 struct trace_seq *s; 4027 struct trace_seq *s;
4373 unsigned long cnt; 4028 unsigned long cnt;
4374 unsigned long long t;
4375 unsigned long usec_rem;
4376 4029
4377 s = kmalloc(sizeof(*s), GFP_KERNEL); 4030 s = kmalloc(sizeof(*s), GFP_KERNEL);
4378 if (!s) 4031 if (!s)
@@ -4389,31 +4042,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4389 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 4042 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
4390 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 4043 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4391 4044
4392 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4393 trace_seq_printf(s, "bytes: %ld\n", cnt);
4394
4395 if (trace_clocks[trace_clock_id].in_ns) {
4396 /* local or global for trace_clock */
4397 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4398 usec_rem = do_div(t, USEC_PER_SEC);
4399 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4400 t, usec_rem);
4401
4402 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4403 usec_rem = do_div(t, USEC_PER_SEC);
4404 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4405 } else {
4406 /* counter or tsc mode for trace_clock */
4407 trace_seq_printf(s, "oldest event ts: %llu\n",
4408 ring_buffer_oldest_event_ts(tr->buffer, cpu));
4409
4410 trace_seq_printf(s, "now ts: %llu\n",
4411 ring_buffer_time_stamp(tr->buffer, cpu));
4412 }
4413
4414 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
4415 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4416
4417 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4045 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4418 4046
4419 kfree(s); 4047 kfree(s);
@@ -4520,9 +4148,6 @@ static void tracing_init_debugfs_percpu(long cpu)
4520 struct dentry *d_cpu; 4148 struct dentry *d_cpu;
4521 char cpu_dir[30]; /* 30 characters should be more than enough */ 4149 char cpu_dir[30]; /* 30 characters should be more than enough */
4522 4150
4523 if (!d_percpu)
4524 return;
4525
4526 snprintf(cpu_dir, 30, "cpu%ld", cpu); 4151 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4527 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4152 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4528 if (!d_cpu) { 4153 if (!d_cpu) {
@@ -4543,9 +4168,6 @@ static void tracing_init_debugfs_percpu(long cpu)
4543 4168
4544 trace_create_file("stats", 0444, d_cpu, 4169 trace_create_file("stats", 0444, d_cpu,
4545 (void *) cpu, &tracing_stats_fops); 4170 (void *) cpu, &tracing_stats_fops);
4546
4547 trace_create_file("buffer_size_kb", 0444, d_cpu,
4548 (void *) cpu, &tracing_entries_fops);
4549} 4171}
4550 4172
4551#ifdef CONFIG_FTRACE_SELFTEST 4173#ifdef CONFIG_FTRACE_SELFTEST
@@ -4655,7 +4277,7 @@ static const struct file_operations trace_options_core_fops = {
4655}; 4277};
4656 4278
4657struct dentry *trace_create_file(const char *name, 4279struct dentry *trace_create_file(const char *name,
4658 umode_t mode, 4280 mode_t mode,
4659 struct dentry *parent, 4281 struct dentry *parent,
4660 void *data, 4282 void *data,
4661 const struct file_operations *fops) 4283 const struct file_operations *fops)
@@ -4784,64 +4406,6 @@ static __init void create_trace_options_dir(void)
4784 create_trace_option_core_file(trace_options[i], i); 4406 create_trace_option_core_file(trace_options[i], i);
4785} 4407}
4786 4408
4787static ssize_t
4788rb_simple_read(struct file *filp, char __user *ubuf,
4789 size_t cnt, loff_t *ppos)
4790{
4791 struct trace_array *tr = filp->private_data;
4792 struct ring_buffer *buffer = tr->buffer;
4793 char buf[64];
4794 int r;
4795
4796 if (buffer)
4797 r = ring_buffer_record_is_on(buffer);
4798 else
4799 r = 0;
4800
4801 r = sprintf(buf, "%d\n", r);
4802
4803 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
4804}
4805
4806static ssize_t
4807rb_simple_write(struct file *filp, const char __user *ubuf,
4808 size_t cnt, loff_t *ppos)
4809{
4810 struct trace_array *tr = filp->private_data;
4811 struct ring_buffer *buffer = tr->buffer;
4812 unsigned long val;
4813 int ret;
4814
4815 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4816 if (ret)
4817 return ret;
4818
4819 if (buffer) {
4820 mutex_lock(&trace_types_lock);
4821 if (val) {
4822 ring_buffer_record_on(buffer);
4823 if (current_trace->start)
4824 current_trace->start(tr);
4825 } else {
4826 ring_buffer_record_off(buffer);
4827 if (current_trace->stop)
4828 current_trace->stop(tr);
4829 }
4830 mutex_unlock(&trace_types_lock);
4831 }
4832
4833 (*ppos)++;
4834
4835 return cnt;
4836}
4837
4838static const struct file_operations rb_simple_fops = {
4839 .open = tracing_open_generic,
4840 .read = rb_simple_read,
4841 .write = rb_simple_write,
4842 .llseek = default_llseek,
4843};
4844
4845static __init int tracer_init_debugfs(void) 4409static __init int tracer_init_debugfs(void)
4846{ 4410{
4847 struct dentry *d_tracer; 4411 struct dentry *d_tracer;
@@ -4851,6 +4415,9 @@ static __init int tracer_init_debugfs(void)
4851 4415
4852 d_tracer = tracing_init_dentry(); 4416 d_tracer = tracing_init_dentry();
4853 4417
4418 trace_create_file("tracing_enabled", 0644, d_tracer,
4419 &global_trace, &tracing_ctrl_fops);
4420
4854 trace_create_file("trace_options", 0644, d_tracer, 4421 trace_create_file("trace_options", 0644, d_tracer,
4855 NULL, &tracing_iter_fops); 4422 NULL, &tracing_iter_fops);
4856 4423
@@ -4881,10 +4448,7 @@ static __init int tracer_init_debugfs(void)
4881 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); 4448 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
4882 4449
4883 trace_create_file("buffer_size_kb", 0644, d_tracer, 4450 trace_create_file("buffer_size_kb", 0644, d_tracer,
4884 (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); 4451 &global_trace, &tracing_entries_fops);
4885
4886 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
4887 &global_trace, &tracing_total_entries_fops);
4888 4452
4889 trace_create_file("free_buffer", 0644, d_tracer, 4453 trace_create_file("free_buffer", 0644, d_tracer,
4890 &global_trace, &tracing_free_buffer_fops); 4454 &global_trace, &tracing_free_buffer_fops);
@@ -4898,9 +4462,6 @@ static __init int tracer_init_debugfs(void)
4898 trace_create_file("trace_clock", 0644, d_tracer, NULL, 4462 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4899 &trace_clock_fops); 4463 &trace_clock_fops);
4900 4464
4901 trace_create_file("tracing_on", 0644, d_tracer,
4902 &global_trace, &rb_simple_fops);
4903
4904#ifdef CONFIG_DYNAMIC_FTRACE 4465#ifdef CONFIG_DYNAMIC_FTRACE
4905 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4466 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4906 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4467 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -5005,12 +4566,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5005 4566
5006 tracing_off(); 4567 tracing_off();
5007 4568
5008 /* Did function tracer already get disabled? */
5009 if (ftrace_is_dead()) {
5010 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
5011 printk("# MAY BE MISSING FUNCTION EVENTS\n");
5012 }
5013
5014 if (disable_tracing) 4569 if (disable_tracing)
5015 ftrace_kill(); 4570 ftrace_kill();
5016 4571
@@ -5073,7 +4628,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5073 if (ret != TRACE_TYPE_NO_CONSUME) 4628 if (ret != TRACE_TYPE_NO_CONSUME)
5074 trace_consume(&iter); 4629 trace_consume(&iter);
5075 } 4630 }
5076 touch_nmi_watchdog();
5077 4631
5078 trace_printk_seq(&iter.seq); 4632 trace_printk_seq(&iter.seq);
5079 } 4633 }
@@ -5104,7 +4658,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5104{ 4658{
5105 __ftrace_dump(true, oops_dump_mode); 4659 __ftrace_dump(true, oops_dump_mode);
5106} 4660}
5107EXPORT_SYMBOL_GPL(ftrace_dump);
5108 4661
5109__init static int tracer_alloc_buffers(void) 4662__init static int tracer_alloc_buffers(void)
5110{ 4663{
@@ -5120,11 +4673,6 @@ __init static int tracer_alloc_buffers(void)
5120 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4673 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
5121 goto out_free_buffer_mask; 4674 goto out_free_buffer_mask;
5122 4675
5123 /* Only allocate trace_printk buffers if a trace_printk exists */
5124 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
5125 /* Must be called before global_trace.buffer is allocated */
5126 trace_printk_init_buffers();
5127
5128 /* To save memory, keep the ring buffer size to its minimum */ 4676 /* To save memory, keep the ring buffer size to its minimum */
5129 if (ring_buffer_expanded) 4677 if (ring_buffer_expanded)
5130 ring_buf_size = trace_buf_size; 4678 ring_buf_size = trace_buf_size;
@@ -5143,8 +4691,7 @@ __init static int tracer_alloc_buffers(void)
5143 WARN_ON(1); 4691 WARN_ON(1);
5144 goto out_free_cpumask; 4692 goto out_free_cpumask;
5145 } 4693 }
5146 if (global_trace.buffer_disabled) 4694 global_trace.entries = ring_buffer_size(global_trace.buffer);
5147 tracing_off();
5148 4695
5149 4696
5150#ifdef CONFIG_TRACER_MAX_TRACE 4697#ifdef CONFIG_TRACER_MAX_TRACE
@@ -5155,6 +4702,7 @@ __init static int tracer_alloc_buffers(void)
5155 ring_buffer_free(global_trace.buffer); 4702 ring_buffer_free(global_trace.buffer);
5156 goto out_free_cpumask; 4703 goto out_free_cpumask;
5157 } 4704 }
4705 max_tr.entries = 1;
5158#endif 4706#endif
5159 4707
5160 /* Allocate the first page for all buffers */ 4708 /* Allocate the first page for all buffers */
@@ -5163,14 +4711,7 @@ __init static int tracer_alloc_buffers(void)
5163 max_tr.data[i] = &per_cpu(max_tr_data, i); 4711 max_tr.data[i] = &per_cpu(max_tr_data, i);
5164 } 4712 }
5165 4713
5166 set_buffer_entries(&global_trace,
5167 ring_buffer_size(global_trace.buffer, 0));
5168#ifdef CONFIG_TRACER_MAX_TRACE
5169 set_buffer_entries(&max_tr, 1);
5170#endif
5171
5172 trace_init_cmdlines(); 4714 trace_init_cmdlines();
5173 init_irq_work(&trace_work_wakeup, trace_wake_up);
5174 4715
5175 register_tracer(&nop_trace); 4716 register_tracer(&nop_trace);
5176 current_trace = &nop_trace; 4717 current_trace = &nop_trace;
@@ -5182,13 +4723,6 @@ __init static int tracer_alloc_buffers(void)
5182 4723
5183 register_die_notifier(&trace_die_notifier); 4724 register_die_notifier(&trace_die_notifier);
5184 4725
5185 while (trace_boot_options) {
5186 char *option;
5187
5188 option = strsep(&trace_boot_options, ",");
5189 trace_set_options(option);
5190 }
5191
5192 return 0; 4726 return 0;
5193 4727
5194out_free_cpumask: 4728out_free_cpumask:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c75d7988902..616846bcfee 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -56,23 +56,17 @@ enum trace_type {
56#define F_STRUCT(args...) args 56#define F_STRUCT(args...) args
57 57
58#undef FTRACE_ENTRY 58#undef FTRACE_ENTRY
59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ 59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
60 struct struct_name { \ 60 struct struct_name { \
61 struct trace_entry ent; \ 61 struct trace_entry ent; \
62 tstruct \ 62 tstruct \
63 } 63 }
64 64
65#undef TP_ARGS 65#undef TP_ARGS
66#define TP_ARGS(args...) args 66#define TP_ARGS(args...) args
67 67
68#undef FTRACE_ENTRY_DUP 68#undef FTRACE_ENTRY_DUP
69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) 69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
70
71#undef FTRACE_ENTRY_REG
72#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
73 filter, regfn) \
74 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
75 filter)
76 70
77#include "trace_entries.h" 71#include "trace_entries.h"
78 72
@@ -103,11 +97,6 @@ struct kretprobe_trace_entry_head {
103 unsigned long ret_ip; 97 unsigned long ret_ip;
104}; 98};
105 99
106struct uprobe_trace_entry_head {
107 struct trace_entry ent;
108 unsigned long ip;
109};
110
111/* 100/*
112 * trace_flag_type is an enumeration that holds different 101 * trace_flag_type is an enumeration that holds different
113 * states when a trace occurs. These are: 102 * states when a trace occurs. These are:
@@ -136,7 +125,6 @@ struct trace_array_cpu {
136 atomic_t disabled; 125 atomic_t disabled;
137 void *buffer_page; /* ring buffer spare */ 126 void *buffer_page; /* ring buffer spare */
138 127
139 unsigned long entries;
140 unsigned long saved_latency; 128 unsigned long saved_latency;
141 unsigned long critical_start; 129 unsigned long critical_start;
142 unsigned long critical_end; 130 unsigned long critical_end;
@@ -147,7 +135,7 @@ struct trace_array_cpu {
147 unsigned long skipped_entries; 135 unsigned long skipped_entries;
148 cycle_t preempt_timestamp; 136 cycle_t preempt_timestamp;
149 pid_t pid; 137 pid_t pid;
150 kuid_t uid; 138 uid_t uid;
151 char comm[TASK_COMM_LEN]; 139 char comm[TASK_COMM_LEN];
152}; 140};
153 141
@@ -158,8 +146,8 @@ struct trace_array_cpu {
158 */ 146 */
159struct trace_array { 147struct trace_array {
160 struct ring_buffer *buffer; 148 struct ring_buffer *buffer;
149 unsigned long entries;
161 int cpu; 150 int cpu;
162 int buffer_disabled;
163 cycle_t time_start; 151 cycle_t time_start;
164 struct task_struct *waiter; 152 struct task_struct *waiter;
165 struct trace_array_cpu *data[NR_CPUS]; 153 struct trace_array_cpu *data[NR_CPUS];
@@ -285,8 +273,8 @@ struct tracer {
285 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
286 struct tracer *next; 274 struct tracer *next;
287 struct tracer_flags *flags; 275 struct tracer_flags *flags;
288 bool print_max; 276 int print_max;
289 bool use_max_tr; 277 int use_max_tr;
290}; 278};
291 279
292 280
@@ -300,8 +288,6 @@ struct tracer {
300/* for function tracing recursion */ 288/* for function tracing recursion */
301#define TRACE_INTERNAL_BIT (1<<11) 289#define TRACE_INTERNAL_BIT (1<<11)
302#define TRACE_GLOBAL_BIT (1<<12) 290#define TRACE_GLOBAL_BIT (1<<12)
303#define TRACE_CONTROL_BIT (1<<13)
304
305/* 291/*
306 * Abuse of the trace_recursion. 292 * Abuse of the trace_recursion.
307 * As we need a way to maintain state if we are tracing the function 293 * As we need a way to maintain state if we are tracing the function
@@ -317,23 +303,16 @@ struct tracer {
317 303
318#define TRACE_PIPE_ALL_CPU -1 304#define TRACE_PIPE_ALL_CPU -1
319 305
320static inline struct ring_buffer_iter *
321trace_buffer_iter(struct trace_iterator *iter, int cpu)
322{
323 if (iter->buffer_iter && iter->buffer_iter[cpu])
324 return iter->buffer_iter[cpu];
325 return NULL;
326}
327
328int tracer_init(struct tracer *t, struct trace_array *tr); 306int tracer_init(struct tracer *t, struct trace_array *tr);
329int tracing_is_enabled(void); 307int tracing_is_enabled(void);
308void trace_wake_up(void);
330void tracing_reset(struct trace_array *tr, int cpu); 309void tracing_reset(struct trace_array *tr, int cpu);
331void tracing_reset_online_cpus(struct trace_array *tr); 310void tracing_reset_online_cpus(struct trace_array *tr);
332void tracing_reset_current(int cpu); 311void tracing_reset_current(int cpu);
333void tracing_reset_current_online_cpus(void); 312void tracing_reset_current_online_cpus(void);
334int tracing_open_generic(struct inode *inode, struct file *filp); 313int tracing_open_generic(struct inode *inode, struct file *filp);
335struct dentry *trace_create_file(const char *name, 314struct dentry *trace_create_file(const char *name,
336 umode_t mode, 315 mode_t mode,
337 struct dentry *parent, 316 struct dentry *parent,
338 void *data, 317 void *data,
339 const struct file_operations *fops); 318 const struct file_operations *fops);
@@ -348,6 +327,9 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
348 unsigned long len, 327 unsigned long len,
349 unsigned long flags, 328 unsigned long flags,
350 int pc); 329 int pc);
330void trace_buffer_unlock_commit(struct ring_buffer *buffer,
331 struct ring_buffer_event *event,
332 unsigned long flags, int pc);
351 333
352struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 334struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
353 struct trace_array_cpu *data); 335 struct trace_array_cpu *data);
@@ -355,9 +337,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
355struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 337struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
356 int *ent_cpu, u64 *ent_ts); 338 int *ent_cpu, u64 *ent_ts);
357 339
358void __buffer_unlock_commit(struct ring_buffer *buffer,
359 struct ring_buffer_event *event);
360
361int trace_empty(struct trace_iterator *iter); 340int trace_empty(struct trace_iterator *iter);
362 341
363void *trace_find_next_entry_inc(struct trace_iterator *iter); 342void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -366,6 +345,7 @@ void trace_init_global_iter(struct trace_iterator *iter);
366 345
367void tracing_iter_reset(struct trace_iterator *iter, int cpu); 346void tracing_iter_reset(struct trace_iterator *iter, int cpu);
368 347
348void default_wait_pipe(struct trace_iterator *iter);
369void poll_wait_pipe(struct trace_iterator *iter); 349void poll_wait_pipe(struct trace_iterator *iter);
370 350
371void ftrace(struct trace_array *tr, 351void ftrace(struct trace_array *tr,
@@ -390,7 +370,6 @@ void trace_graph_function(struct trace_array *tr,
390 unsigned long ip, 370 unsigned long ip,
391 unsigned long parent_ip, 371 unsigned long parent_ip,
392 unsigned long flags, int pc); 372 unsigned long flags, int pc);
393void trace_latency_header(struct seq_file *m);
394void trace_default_header(struct seq_file *m); 373void trace_default_header(struct seq_file *m);
395void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 374void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
396int trace_empty(struct trace_iterator *iter); 375int trace_empty(struct trace_iterator *iter);
@@ -405,7 +384,12 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr);
405void tracing_stop_sched_switch_record(void); 384void tracing_stop_sched_switch_record(void);
406void tracing_start_sched_switch_record(void); 385void tracing_start_sched_switch_record(void);
407int register_tracer(struct tracer *type); 386int register_tracer(struct tracer *type);
387void unregister_tracer(struct tracer *type);
408int is_tracing_stopped(void); 388int is_tracing_stopped(void);
389enum trace_file_type {
390 TRACE_FILE_LAT_FMT = 1,
391 TRACE_FILE_ANNOTATE = 2,
392};
409 393
410extern cpumask_var_t __read_mostly tracing_buffer_mask; 394extern cpumask_var_t __read_mostly tracing_buffer_mask;
411 395
@@ -465,11 +449,11 @@ extern void trace_find_cmdline(int pid, char comm[]);
465 449
466#ifdef CONFIG_DYNAMIC_FTRACE 450#ifdef CONFIG_DYNAMIC_FTRACE
467extern unsigned long ftrace_update_tot_cnt; 451extern unsigned long ftrace_update_tot_cnt;
468#endif
469#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 452#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
470extern int DYN_FTRACE_TEST_NAME(void); 453extern int DYN_FTRACE_TEST_NAME(void);
471#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 454#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
472extern int DYN_FTRACE_TEST_NAME2(void); 455extern int DYN_FTRACE_TEST_NAME2(void);
456#endif
473 457
474extern int ring_buffer_expanded; 458extern int ring_buffer_expanded;
475extern bool tracing_selftest_disabled; 459extern bool tracing_selftest_disabled;
@@ -595,17 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task)
595 579
596 return test_tsk_trace_trace(task); 580 return test_tsk_trace_trace(task);
597} 581}
598extern int ftrace_is_dead(void);
599#else 582#else
600static inline int ftrace_trace_task(struct task_struct *task) 583static inline int ftrace_trace_task(struct task_struct *task)
601{ 584{
602 return 1; 585 return 1;
603} 586}
604static inline int ftrace_is_dead(void) { return 0; }
605#endif 587#endif
606 588
607int ftrace_event_is_function(struct ftrace_event_call *call);
608
609/* 589/*
610 * struct trace_parser - servers for reading the user input separated by spaces 590 * struct trace_parser - servers for reading the user input separated by spaces
611 * @cont: set if the input is not complete - no final space char was found 591 * @cont: set if the input is not complete - no final space char was found
@@ -672,8 +652,6 @@ enum trace_iterator_flags {
672 TRACE_ITER_RECORD_CMD = 0x100000, 652 TRACE_ITER_RECORD_CMD = 0x100000,
673 TRACE_ITER_OVERWRITE = 0x200000, 653 TRACE_ITER_OVERWRITE = 0x200000,
674 TRACE_ITER_STOP_ON_FREE = 0x400000, 654 TRACE_ITER_STOP_ON_FREE = 0x400000,
675 TRACE_ITER_IRQ_INFO = 0x800000,
676 TRACE_ITER_MARKERS = 0x1000000,
677}; 655};
678 656
679/* 657/*
@@ -783,8 +761,16 @@ struct filter_pred {
783 filter_pred_fn_t fn; 761 filter_pred_fn_t fn;
784 u64 val; 762 u64 val;
785 struct regex regex; 763 struct regex regex;
786 unsigned short *ops; 764 /*
787 struct ftrace_event_field *field; 765 * Leaf nodes use field_name, ops is used by AND and OR
766 * nodes. The field_name is always freed when freeing a pred.
767 * We can overload field_name for ops and have it freed
768 * as well.
769 */
770 union {
771 char *field_name;
772 unsigned short *ops;
773 };
788 int offset; 774 int offset;
789 int not; 775 int not;
790 int op; 776 int op;
@@ -833,24 +819,13 @@ extern struct list_head ftrace_events;
833extern const char *__start___trace_bprintk_fmt[]; 819extern const char *__start___trace_bprintk_fmt[];
834extern const char *__stop___trace_bprintk_fmt[]; 820extern const char *__stop___trace_bprintk_fmt[];
835 821
836void trace_printk_init_buffers(void);
837void trace_printk_start_comm(void);
838
839#undef FTRACE_ENTRY 822#undef FTRACE_ENTRY
840#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 823#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
841 extern struct ftrace_event_call \ 824 extern struct ftrace_event_call \
842 __attribute__((__aligned__(4))) event_##call; 825 __attribute__((__aligned__(4))) event_##call;
843#undef FTRACE_ENTRY_DUP 826#undef FTRACE_ENTRY_DUP
844#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ 827#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
845 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ 828 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
846 filter)
847#include "trace_entries.h" 829#include "trace_entries.h"
848 830
849#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
850int perf_ftrace_event_register(struct ftrace_event_call *call,
851 enum trace_reg type, void *data);
852#else
853#define perf_ftrace_event_register NULL
854#endif
855
856#endif /* _LINUX_KERNEL_TRACE_H */ 831#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 95e96842ed2..8d3538b4ea5 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
77 entry->correct = val == expect; 77 entry->correct = val == expect;
78 78
79 if (!filter_check_discard(call, entry, buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
80 __buffer_unlock_commit(buffer, event); 80 ring_buffer_unlock_commit(buffer, event);
81 81
82 out: 82 out:
83 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
@@ -199,7 +199,7 @@ __init static int init_branch_tracer(void)
199 } 199 }
200 return register_tracer(&branch_trace); 200 return register_tracer(&branch_trace);
201} 201}
202core_initcall(init_branch_tracer); 202device_initcall(init_branch_tracer);
203 203
204#else 204#else
205static inline 205static inline
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cb..6302747a139 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -113,15 +113,3 @@ u64 notrace trace_clock_global(void)
113 113
114 return now; 114 return now;
115} 115}
116
117static atomic64_t trace_counter;
118
119/*
120 * trace_clock_counter(): simply an atomic counter.
121 * Use the trace_counter "counter" for cases where you do not care
122 * about timings, but are interested in strict ordering.
123 */
124u64 notrace trace_clock_counter(void)
125{
126 return atomic64_add_return(1, &trace_counter);
127}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 4108e1250ca..93365907f21 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -55,7 +55,7 @@
55/* 55/*
56 * Function trace entry - function address and parent function address: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY_REG(function, ftrace_entry, 58FTRACE_ENTRY(function, ftrace_entry,
59 59
60 TRACE_FN, 60 TRACE_FN,
61 61
@@ -64,11 +64,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
64 __field( unsigned long, parent_ip ) 64 __field( unsigned long, parent_ip )
65 ), 65 ),
66 66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), 67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
68
69 FILTER_TRACE_FN,
70
71 perf_ftrace_event_register
72); 68);
73 69
74/* Function call entry */ 70/* Function call entry */
@@ -82,9 +78,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
82 __field_desc( int, graph_ent, depth ) 78 __field_desc( int, graph_ent, depth )
83 ), 79 ),
84 80
85 F_printk("--> %lx (%d)", __entry->func, __entry->depth), 81 F_printk("--> %lx (%d)", __entry->func, __entry->depth)
86
87 FILTER_OTHER
88); 82);
89 83
90/* Function return entry */ 84/* Function return entry */
@@ -104,9 +98,7 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
104 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", 98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
105 __entry->func, __entry->depth, 99 __entry->func, __entry->depth,
106 __entry->calltime, __entry->rettime, 100 __entry->calltime, __entry->rettime,
107 __entry->depth), 101 __entry->depth)
108
109 FILTER_OTHER
110); 102);
111 103
112/* 104/*
@@ -135,9 +127,8 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
135 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", 127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
136 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
137 __entry->next_pid, __entry->next_prio, __entry->next_state, 129 __entry->next_pid, __entry->next_prio, __entry->next_state,
138 __entry->next_cpu), 130 __entry->next_cpu
139 131 )
140 FILTER_OTHER
141); 132);
142 133
143/* 134/*
@@ -155,9 +146,8 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
155 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", 146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
156 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
157 __entry->next_pid, __entry->next_prio, __entry->next_state, 148 __entry->next_pid, __entry->next_prio, __entry->next_state,
158 __entry->next_cpu), 149 __entry->next_cpu
159 150 )
160 FILTER_OTHER
161); 151);
162 152
163/* 153/*
@@ -166,12 +156,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
166 156
167#define FTRACE_STACK_ENTRIES 8 157#define FTRACE_STACK_ENTRIES 8
168 158
169#ifndef CONFIG_64BIT
170# define IP_FMT "%08lx"
171#else
172# define IP_FMT "%016lx"
173#endif
174
175FTRACE_ENTRY(kernel_stack, stack_entry, 159FTRACE_ENTRY(kernel_stack, stack_entry,
176 160
177 TRACE_STACK, 161 TRACE_STACK,
@@ -181,14 +165,11 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
181 __dynamic_array(unsigned long, caller ) 165 __dynamic_array(unsigned long, caller )
182 ), 166 ),
183 167
184 F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" 168 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
185 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" 169 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
187 __entry->caller[0], __entry->caller[1], __entry->caller[2], 170 __entry->caller[0], __entry->caller[1], __entry->caller[2],
188 __entry->caller[3], __entry->caller[4], __entry->caller[5], 171 __entry->caller[3], __entry->caller[4], __entry->caller[5],
189 __entry->caller[6], __entry->caller[7]), 172 __entry->caller[6], __entry->caller[7])
190
191 FILTER_OTHER
192); 173);
193 174
194FTRACE_ENTRY(user_stack, userstack_entry, 175FTRACE_ENTRY(user_stack, userstack_entry,
@@ -200,14 +181,11 @@ FTRACE_ENTRY(user_stack, userstack_entry,
200 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
201 ), 182 ),
202 183
203 F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" 184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
204 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" 185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
205 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
206 __entry->caller[0], __entry->caller[1], __entry->caller[2], 186 __entry->caller[0], __entry->caller[1], __entry->caller[2],
207 __entry->caller[3], __entry->caller[4], __entry->caller[5], 187 __entry->caller[3], __entry->caller[4], __entry->caller[5],
208 __entry->caller[6], __entry->caller[7]), 188 __entry->caller[6], __entry->caller[7])
209
210 FILTER_OTHER
211); 189);
212 190
213/* 191/*
@@ -224,9 +202,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
224 ), 202 ),
225 203
226 F_printk("%08lx fmt:%p", 204 F_printk("%08lx fmt:%p",
227 __entry->ip, __entry->fmt), 205 __entry->ip, __entry->fmt)
228
229 FILTER_OTHER
230); 206);
231 207
232FTRACE_ENTRY(print, print_entry, 208FTRACE_ENTRY(print, print_entry,
@@ -239,9 +215,7 @@ FTRACE_ENTRY(print, print_entry,
239 ), 215 ),
240 216
241 F_printk("%08lx %s", 217 F_printk("%08lx %s",
242 __entry->ip, __entry->buf), 218 __entry->ip, __entry->buf)
243
244 FILTER_OTHER
245); 219);
246 220
247FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, 221FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
@@ -260,9 +234,7 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
260 234
261 F_printk("%lx %lx %lx %d %x %x", 235 F_printk("%lx %lx %lx %d %x %x",
262 (unsigned long)__entry->phys, __entry->value, __entry->pc, 236 (unsigned long)__entry->phys, __entry->value, __entry->pc,
263 __entry->map_id, __entry->opcode, __entry->width), 237 __entry->map_id, __entry->opcode, __entry->width)
264
265 FILTER_OTHER
266); 238);
267 239
268FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, 240FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
@@ -280,9 +252,7 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
280 252
281 F_printk("%lx %lx %lx %d %x", 253 F_printk("%lx %lx %lx %d %x",
282 (unsigned long)__entry->phys, __entry->virt, __entry->len, 254 (unsigned long)__entry->phys, __entry->virt, __entry->len,
283 __entry->map_id, __entry->opcode), 255 __entry->map_id, __entry->opcode)
284
285 FILTER_OTHER
286); 256);
287 257
288 258
@@ -302,8 +272,6 @@ FTRACE_ENTRY(branch, trace_branch,
302 272
303 F_printk("%u:%s:%s (%u)", 273 F_printk("%u:%s:%s (%u)",
304 __entry->line, 274 __entry->line,
305 __entry->func, __entry->file, __entry->correct), 275 __entry->func, __entry->file, __entry->correct)
306
307 FILTER_OTHER
308); 276);
309 277
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 84b1e045fab..19a359d5e6d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,11 +24,6 @@ static int total_ref_count;
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event, 24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
30 return -EPERM;
31
32 /* No tracing, just counting, so no obvious leak */ 27 /* No tracing, just counting, so no obvious leak */
33 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) 28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
34 return 0; 29 return 0;
@@ -49,17 +44,23 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
49 return 0; 44 return 0;
50} 45}
51 46
52static int perf_trace_event_reg(struct ftrace_event_call *tp_event, 47static int perf_trace_event_init(struct ftrace_event_call *tp_event,
53 struct perf_event *p_event) 48 struct perf_event *p_event)
54{ 49{
55 struct hlist_head __percpu *list; 50 struct hlist_head __percpu *list;
56 int ret = -ENOMEM; 51 int ret;
57 int cpu; 52 int cpu;
58 53
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
59 p_event->tp_event = tp_event; 58 p_event->tp_event = tp_event;
60 if (tp_event->perf_refcount++ > 0) 59 if (tp_event->perf_refcount++ > 0)
61 return 0; 60 return 0;
62 61
62 ret = -ENOMEM;
63
63 list = alloc_percpu(struct hlist_head); 64 list = alloc_percpu(struct hlist_head);
64 if (!list) 65 if (!list)
65 goto fail; 66 goto fail;
@@ -82,7 +83,7 @@ static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
82 } 83 }
83 } 84 }
84 85
85 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); 86 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
86 if (ret) 87 if (ret)
87 goto fail; 88 goto fail;
88 89
@@ -107,69 +108,6 @@ fail:
107 return ret; 108 return ret;
108} 109}
109 110
110static void perf_trace_event_unreg(struct perf_event *p_event)
111{
112 struct ftrace_event_call *tp_event = p_event->tp_event;
113 int i;
114
115 if (--tp_event->perf_refcount > 0)
116 goto out;
117
118 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
119
120 /*
121 * Ensure our callback won't be called anymore. The buffers
122 * will be freed after that.
123 */
124 tracepoint_synchronize_unregister();
125
126 free_percpu(tp_event->perf_events);
127 tp_event->perf_events = NULL;
128
129 if (!--total_ref_count) {
130 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
131 free_percpu(perf_trace_buf[i]);
132 perf_trace_buf[i] = NULL;
133 }
134 }
135out:
136 module_put(tp_event->mod);
137}
138
139static int perf_trace_event_open(struct perf_event *p_event)
140{
141 struct ftrace_event_call *tp_event = p_event->tp_event;
142 return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
143}
144
145static void perf_trace_event_close(struct perf_event *p_event)
146{
147 struct ftrace_event_call *tp_event = p_event->tp_event;
148 tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
149}
150
151static int perf_trace_event_init(struct ftrace_event_call *tp_event,
152 struct perf_event *p_event)
153{
154 int ret;
155
156 ret = perf_trace_event_perm(tp_event, p_event);
157 if (ret)
158 return ret;
159
160 ret = perf_trace_event_reg(tp_event, p_event);
161 if (ret)
162 return ret;
163
164 ret = perf_trace_event_open(p_event);
165 if (ret) {
166 perf_trace_event_unreg(p_event);
167 return ret;
168 }
169
170 return 0;
171}
172
173int perf_trace_init(struct perf_event *p_event) 111int perf_trace_init(struct perf_event *p_event)
174{ 112{
175 struct ftrace_event_call *tp_event; 113 struct ftrace_event_call *tp_event;
@@ -192,14 +130,6 @@ int perf_trace_init(struct perf_event *p_event)
192 return ret; 130 return ret;
193} 131}
194 132
195void perf_trace_destroy(struct perf_event *p_event)
196{
197 mutex_lock(&event_mutex);
198 perf_trace_event_close(p_event);
199 perf_trace_event_unreg(p_event);
200 mutex_unlock(&event_mutex);
201}
202
203int perf_trace_add(struct perf_event *p_event, int flags) 133int perf_trace_add(struct perf_event *p_event, int flags)
204{ 134{
205 struct ftrace_event_call *tp_event = p_event->tp_event; 135 struct ftrace_event_call *tp_event = p_event->tp_event;
@@ -216,14 +146,43 @@ int perf_trace_add(struct perf_event *p_event, int flags)
216 list = this_cpu_ptr(pcpu_list); 146 list = this_cpu_ptr(pcpu_list);
217 hlist_add_head_rcu(&p_event->hlist_entry, list); 147 hlist_add_head_rcu(&p_event->hlist_entry, list);
218 148
219 return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); 149 return 0;
220} 150}
221 151
222void perf_trace_del(struct perf_event *p_event, int flags) 152void perf_trace_del(struct perf_event *p_event, int flags)
223{ 153{
224 struct ftrace_event_call *tp_event = p_event->tp_event;
225 hlist_del_rcu(&p_event->hlist_entry); 154 hlist_del_rcu(&p_event->hlist_entry);
226 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); 155}
156
157void perf_trace_destroy(struct perf_event *p_event)
158{
159 struct ftrace_event_call *tp_event = p_event->tp_event;
160 int i;
161
162 mutex_lock(&event_mutex);
163 if (--tp_event->perf_refcount > 0)
164 goto out;
165
166 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
167
168 /*
169 * Ensure our callback won't be called anymore. The buffers
170 * will be freed after that.
171 */
172 tracepoint_synchronize_unregister();
173
174 free_percpu(tp_event->perf_events);
175 tp_event->perf_events = NULL;
176
177 if (!--total_ref_count) {
178 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
179 free_percpu(perf_trace_buf[i]);
180 perf_trace_buf[i] = NULL;
181 }
182 }
183out:
184 module_put(tp_event->mod);
185 mutex_unlock(&event_mutex);
227} 186}
228 187
229__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 188__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
@@ -255,87 +214,3 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
255 return raw_data; 214 return raw_data;
256} 215}
257EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 216EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
258
259#ifdef CONFIG_FUNCTION_TRACER
260static void
261perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
262 struct ftrace_ops *ops, struct pt_regs *pt_regs)
263{
264 struct ftrace_entry *entry;
265 struct hlist_head *head;
266 struct pt_regs regs;
267 int rctx;
268
269#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
270 sizeof(u64)) - sizeof(u32))
271
272 BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
273
274 perf_fetch_caller_regs(&regs);
275
276 entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
277 if (!entry)
278 return;
279
280 entry->ip = ip;
281 entry->parent_ip = parent_ip;
282
283 head = this_cpu_ptr(event_function.perf_events);
284 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
285 1, &regs, head, NULL);
286
287#undef ENTRY_SIZE
288}
289
290static int perf_ftrace_function_register(struct perf_event *event)
291{
292 struct ftrace_ops *ops = &event->ftrace_ops;
293
294 ops->flags |= FTRACE_OPS_FL_CONTROL;
295 ops->func = perf_ftrace_function_call;
296 return register_ftrace_function(ops);
297}
298
299static int perf_ftrace_function_unregister(struct perf_event *event)
300{
301 struct ftrace_ops *ops = &event->ftrace_ops;
302 int ret = unregister_ftrace_function(ops);
303 ftrace_free_filter(ops);
304 return ret;
305}
306
307static void perf_ftrace_function_enable(struct perf_event *event)
308{
309 ftrace_function_local_enable(&event->ftrace_ops);
310}
311
312static void perf_ftrace_function_disable(struct perf_event *event)
313{
314 ftrace_function_local_disable(&event->ftrace_ops);
315}
316
317int perf_ftrace_event_register(struct ftrace_event_call *call,
318 enum trace_reg type, void *data)
319{
320 switch (type) {
321 case TRACE_REG_REGISTER:
322 case TRACE_REG_UNREGISTER:
323 break;
324 case TRACE_REG_PERF_REGISTER:
325 case TRACE_REG_PERF_UNREGISTER:
326 return 0;
327 case TRACE_REG_PERF_OPEN:
328 return perf_ftrace_function_register(data);
329 case TRACE_REG_PERF_CLOSE:
330 return perf_ftrace_function_unregister(data);
331 case TRACE_REG_PERF_ADD:
332 perf_ftrace_function_enable(data);
333 return 0;
334 case TRACE_REG_PERF_DEL:
335 perf_ftrace_function_disable(data);
336 return 0;
337 }
338
339 return -EINVAL;
340}
341#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 880073d0b94..c212a7f934e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -147,8 +147,7 @@ int trace_event_raw_init(struct ftrace_event_call *call)
147} 147}
148EXPORT_SYMBOL_GPL(trace_event_raw_init); 148EXPORT_SYMBOL_GPL(trace_event_raw_init);
149 149
150int ftrace_event_reg(struct ftrace_event_call *call, 150int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
151 enum trace_reg type, void *data)
152{ 151{
153 switch (type) { 152 switch (type) {
154 case TRACE_REG_REGISTER: 153 case TRACE_REG_REGISTER:
@@ -171,11 +170,6 @@ int ftrace_event_reg(struct ftrace_event_call *call,
171 call->class->perf_probe, 170 call->class->perf_probe,
172 call); 171 call);
173 return 0; 172 return 0;
174 case TRACE_REG_PERF_OPEN:
175 case TRACE_REG_PERF_CLOSE:
176 case TRACE_REG_PERF_ADD:
177 case TRACE_REG_PERF_DEL:
178 return 0;
179#endif 173#endif
180 } 174 }
181 return 0; 175 return 0;
@@ -215,7 +209,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
215 tracing_stop_cmdline_record(); 209 tracing_stop_cmdline_record();
216 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 210 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
217 } 211 }
218 call->class->reg(call, TRACE_REG_UNREGISTER, NULL); 212 call->class->reg(call, TRACE_REG_UNREGISTER);
219 } 213 }
220 break; 214 break;
221 case 1: 215 case 1:
@@ -224,7 +218,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
224 tracing_start_cmdline_record(); 218 tracing_start_cmdline_record();
225 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 219 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
226 } 220 }
227 ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); 221 ret = call->class->reg(call, TRACE_REG_REGISTER);
228 if (ret) { 222 if (ret) {
229 tracing_stop_cmdline_record(); 223 tracing_stop_cmdline_record();
230 pr_info("event trace: Could not enable event " 224 pr_info("event trace: Could not enable event "
@@ -294,9 +288,6 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
294 if (!call->name || !call->class || !call->class->reg) 288 if (!call->name || !call->class || !call->class->reg)
295 continue; 289 continue;
296 290
297 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
298 continue;
299
300 if (match && 291 if (match &&
301 strcmp(match, call->name) != 0 && 292 strcmp(match, call->name) != 0 &&
302 strcmp(match, call->class->system) != 0) 293 strcmp(match, call->class->system) != 0)
@@ -491,6 +482,19 @@ static void t_stop(struct seq_file *m, void *p)
491 mutex_unlock(&event_mutex); 482 mutex_unlock(&event_mutex);
492} 483}
493 484
485static int
486ftrace_event_seq_open(struct inode *inode, struct file *file)
487{
488 const struct seq_operations *seq_ops;
489
490 if ((file->f_mode & FMODE_WRITE) &&
491 (file->f_flags & O_TRUNC))
492 ftrace_clear_events();
493
494 seq_ops = inode->i_private;
495 return seq_open(file, seq_ops);
496}
497
494static ssize_t 498static ssize_t
495event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 499event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
496 loff_t *ppos) 500 loff_t *ppos)
@@ -967,9 +971,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
967 return r; 971 return r;
968} 972}
969 973
970static int ftrace_event_avail_open(struct inode *inode, struct file *file);
971static int ftrace_event_set_open(struct inode *inode, struct file *file);
972
973static const struct seq_operations show_event_seq_ops = { 974static const struct seq_operations show_event_seq_ops = {
974 .start = t_start, 975 .start = t_start,
975 .next = t_next, 976 .next = t_next,
@@ -985,14 +986,14 @@ static const struct seq_operations show_set_event_seq_ops = {
985}; 986};
986 987
987static const struct file_operations ftrace_avail_fops = { 988static const struct file_operations ftrace_avail_fops = {
988 .open = ftrace_event_avail_open, 989 .open = ftrace_event_seq_open,
989 .read = seq_read, 990 .read = seq_read,
990 .llseek = seq_lseek, 991 .llseek = seq_lseek,
991 .release = seq_release, 992 .release = seq_release,
992}; 993};
993 994
994static const struct file_operations ftrace_set_event_fops = { 995static const struct file_operations ftrace_set_event_fops = {
995 .open = ftrace_event_set_open, 996 .open = ftrace_event_seq_open,
996 .read = seq_read, 997 .read = seq_read,
997 .write = ftrace_event_write, 998 .write = ftrace_event_write,
998 .llseek = seq_lseek, 999 .llseek = seq_lseek,
@@ -1068,26 +1069,6 @@ static struct dentry *event_trace_events_dir(void)
1068 return d_events; 1069 return d_events;
1069} 1070}
1070 1071
1071static int
1072ftrace_event_avail_open(struct inode *inode, struct file *file)
1073{
1074 const struct seq_operations *seq_ops = &show_event_seq_ops;
1075
1076 return seq_open(file, seq_ops);
1077}
1078
1079static int
1080ftrace_event_set_open(struct inode *inode, struct file *file)
1081{
1082 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1083
1084 if ((file->f_mode & FMODE_WRITE) &&
1085 (file->f_flags & O_TRUNC))
1086 ftrace_clear_events();
1087
1088 return seq_open(file, seq_ops);
1089}
1090
1091static struct dentry * 1072static struct dentry *
1092event_subsystem_dir(const char *name, struct dentry *d_events) 1073event_subsystem_dir(const char *name, struct dentry *d_events)
1093{ 1074{
@@ -1177,7 +1158,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1177 return -1; 1158 return -1;
1178 } 1159 }
1179 1160
1180 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) 1161 if (call->class->reg)
1181 trace_create_file("enable", 0644, call->dir, call, 1162 trace_create_file("enable", 0644, call->dir, call,
1182 enable); 1163 enable);
1183 1164
@@ -1209,31 +1190,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1209 return 0; 1190 return 0;
1210} 1191}
1211 1192
1212static void event_remove(struct ftrace_event_call *call)
1213{
1214 ftrace_event_enable_disable(call, 0);
1215 if (call->event.funcs)
1216 __unregister_ftrace_event(&call->event);
1217 list_del(&call->list);
1218}
1219
1220static int event_init(struct ftrace_event_call *call)
1221{
1222 int ret = 0;
1223
1224 if (WARN_ON(!call->name))
1225 return -EINVAL;
1226
1227 if (call->class->raw_init) {
1228 ret = call->class->raw_init(call);
1229 if (ret < 0 && ret != -ENOSYS)
1230 pr_warn("Could not initialize trace events/%s\n",
1231 call->name);
1232 }
1233
1234 return ret;
1235}
1236
1237static int 1193static int
1238__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, 1194__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1239 const struct file_operations *id, 1195 const struct file_operations *id,
@@ -1244,9 +1200,19 @@ __trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1244 struct dentry *d_events; 1200 struct dentry *d_events;
1245 int ret; 1201 int ret;
1246 1202
1247 ret = event_init(call); 1203 /* The linker may leave blanks */
1248 if (ret < 0) 1204 if (!call->name)
1249 return ret; 1205 return -EINVAL;
1206
1207 if (call->class->raw_init) {
1208 ret = call->class->raw_init(call);
1209 if (ret < 0) {
1210 if (ret != -ENOSYS)
1211 pr_warning("Could not initialize trace events/%s\n",
1212 call->name);
1213 return ret;
1214 }
1215 }
1250 1216
1251 d_events = event_trace_events_dir(); 1217 d_events = event_trace_events_dir();
1252 if (!d_events) 1218 if (!d_events)
@@ -1297,10 +1263,13 @@ static void remove_subsystem_dir(const char *name)
1297 */ 1263 */
1298static void __trace_remove_event_call(struct ftrace_event_call *call) 1264static void __trace_remove_event_call(struct ftrace_event_call *call)
1299{ 1265{
1300 event_remove(call); 1266 ftrace_event_enable_disable(call, 0);
1267 if (call->event.funcs)
1268 __unregister_ftrace_event(&call->event);
1269 debugfs_remove_recursive(call->dir);
1270 list_del(&call->list);
1301 trace_destroy_fields(call); 1271 trace_destroy_fields(call);
1302 destroy_preds(call); 1272 destroy_preds(call);
1303 debugfs_remove_recursive(call->dir);
1304 remove_subsystem_dir(call->class->system); 1273 remove_subsystem_dir(call->class->system);
1305} 1274}
1306 1275
@@ -1472,59 +1441,30 @@ static __init int setup_trace_event(char *str)
1472} 1441}
1473__setup("trace_event=", setup_trace_event); 1442__setup("trace_event=", setup_trace_event);
1474 1443
1475static __init int event_trace_enable(void)
1476{
1477 struct ftrace_event_call **iter, *call;
1478 char *buf = bootup_event_buf;
1479 char *token;
1480 int ret;
1481
1482 for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
1483
1484 call = *iter;
1485 ret = event_init(call);
1486 if (!ret)
1487 list_add(&call->list, &ftrace_events);
1488 }
1489
1490 while (true) {
1491 token = strsep(&buf, ",");
1492
1493 if (!token)
1494 break;
1495 if (!*token)
1496 continue;
1497
1498 ret = ftrace_set_clr_event(token, 1);
1499 if (ret)
1500 pr_warn("Failed to enable trace event: %s\n", token);
1501 }
1502
1503 trace_printk_start_comm();
1504
1505 return 0;
1506}
1507
1508static __init int event_trace_init(void) 1444static __init int event_trace_init(void)
1509{ 1445{
1510 struct ftrace_event_call *call; 1446 struct ftrace_event_call **call;
1511 struct dentry *d_tracer; 1447 struct dentry *d_tracer;
1512 struct dentry *entry; 1448 struct dentry *entry;
1513 struct dentry *d_events; 1449 struct dentry *d_events;
1514 int ret; 1450 int ret;
1451 char *buf = bootup_event_buf;
1452 char *token;
1515 1453
1516 d_tracer = tracing_init_dentry(); 1454 d_tracer = tracing_init_dentry();
1517 if (!d_tracer) 1455 if (!d_tracer)
1518 return 0; 1456 return 0;
1519 1457
1520 entry = debugfs_create_file("available_events", 0444, d_tracer, 1458 entry = debugfs_create_file("available_events", 0444, d_tracer,
1521 NULL, &ftrace_avail_fops); 1459 (void *)&show_event_seq_ops,
1460 &ftrace_avail_fops);
1522 if (!entry) 1461 if (!entry)
1523 pr_warning("Could not create debugfs " 1462 pr_warning("Could not create debugfs "
1524 "'available_events' entry\n"); 1463 "'available_events' entry\n");
1525 1464
1526 entry = debugfs_create_file("set_event", 0644, d_tracer, 1465 entry = debugfs_create_file("set_event", 0644, d_tracer,
1527 NULL, &ftrace_set_event_fops); 1466 (void *)&show_set_event_seq_ops,
1467 &ftrace_set_event_fops);
1528 if (!entry) 1468 if (!entry)
1529 pr_warning("Could not create debugfs " 1469 pr_warning("Could not create debugfs "
1530 "'set_event' entry\n"); 1470 "'set_event' entry\n");
@@ -1548,19 +1488,24 @@ static __init int event_trace_init(void)
1548 if (trace_define_common_fields()) 1488 if (trace_define_common_fields())
1549 pr_warning("tracing: Failed to allocate common fields"); 1489 pr_warning("tracing: Failed to allocate common fields");
1550 1490
1551 /* 1491 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1552 * Early initialization already enabled ftrace event. 1492 __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
1553 * Now it's only necessary to create the event directory.
1554 */
1555 list_for_each_entry(call, &ftrace_events, list) {
1556
1557 ret = event_create_dir(call, d_events,
1558 &ftrace_event_id_fops,
1559 &ftrace_enable_fops, 1493 &ftrace_enable_fops,
1560 &ftrace_event_filter_fops, 1494 &ftrace_event_filter_fops,
1561 &ftrace_event_format_fops); 1495 &ftrace_event_format_fops);
1562 if (ret < 0) 1496 }
1563 event_remove(call); 1497
1498 while (true) {
1499 token = strsep(&buf, ",");
1500
1501 if (!token)
1502 break;
1503 if (!*token)
1504 continue;
1505
1506 ret = ftrace_set_clr_event(token, 1);
1507 if (ret)
1508 pr_warning("Failed to enable trace event: %s\n", token);
1564 } 1509 }
1565 1510
1566 ret = register_module_notifier(&trace_module_nb); 1511 ret = register_module_notifier(&trace_module_nb);
@@ -1569,7 +1514,6 @@ static __init int event_trace_init(void)
1569 1514
1570 return 0; 1515 return 0;
1571} 1516}
1572core_initcall(event_trace_enable);
1573fs_initcall(event_trace_init); 1517fs_initcall(event_trace_init);
1574 1518
1575#ifdef CONFIG_FTRACE_STARTUP_TEST 1519#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1693,11 +1637,9 @@ static __init void event_trace_self_tests(void)
1693 event_test_stuff(); 1637 event_test_stuff();
1694 1638
1695 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); 1639 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
1696 if (WARN_ON_ONCE(ret)) { 1640 if (WARN_ON_ONCE(ret))
1697 pr_warning("error disabling system %s\n", 1641 pr_warning("error disabling system %s\n",
1698 system->name); 1642 system->name);
1699 continue;
1700 }
1701 1643
1702 pr_cont("OK\n"); 1644 pr_cont("OK\n");
1703 } 1645 }
@@ -1730,8 +1672,7 @@ static __init void event_trace_self_tests(void)
1730static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); 1672static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1731 1673
1732static void 1674static void
1733function_test_events_call(unsigned long ip, unsigned long parent_ip, 1675function_test_events_call(unsigned long ip, unsigned long parent_ip)
1734 struct ftrace_ops *op, struct pt_regs *pt_regs)
1735{ 1676{
1736 struct ring_buffer_event *event; 1677 struct ring_buffer_event *event;
1737 struct ring_buffer *buffer; 1678 struct ring_buffer *buffer;
@@ -1760,7 +1701,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
1760 entry->ip = ip; 1701 entry->ip = ip;
1761 entry->parent_ip = parent_ip; 1702 entry->parent_ip = parent_ip;
1762 1703
1763 trace_buffer_unlock_commit(buffer, event, flags, pc); 1704 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1764 1705
1765 out: 1706 out:
1766 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1707 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
@@ -1770,7 +1711,6 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
1770static struct ftrace_ops trace_ops __initdata = 1711static struct ftrace_ops trace_ops __initdata =
1771{ 1712{
1772 .func = function_test_events_call, 1713 .func = function_test_events_call,
1773 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
1774}; 1714};
1775 1715
1776static __init void event_trace_self_test_with_function(void) 1716static __init void event_trace_self_test_with_function(void)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e5b0ca8b8d4..bd3c6369f80 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,12 +27,6 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30#define DEFAULT_SYS_FILTER_MESSAGE \
31 "### global filter ###\n" \
32 "# Use this to set filters for multiple events.\n" \
33 "# Only events with the given fields will be affected.\n" \
34 "# If no events are modified, an error message will be displayed here"
35
36enum filter_op_ids 30enum filter_op_ids
37{ 31{
38 OP_OR, 32 OP_OR,
@@ -81,7 +75,6 @@ enum {
81 FILT_ERR_TOO_MANY_PREDS, 75 FILT_ERR_TOO_MANY_PREDS,
82 FILT_ERR_MISSING_FIELD, 76 FILT_ERR_MISSING_FIELD,
83 FILT_ERR_INVALID_FILTER, 77 FILT_ERR_INVALID_FILTER,
84 FILT_ERR_IP_FIELD_ONLY,
85}; 78};
86 79
87static char *err_text[] = { 80static char *err_text[] = {
@@ -97,7 +90,6 @@ static char *err_text[] = {
97 "Too many terms in predicate expression", 90 "Too many terms in predicate expression",
98 "Missing field name and/or value", 91 "Missing field name and/or value",
99 "Meaningless filter expression", 92 "Meaningless filter expression",
100 "Only 'ip' field is supported for function trace",
101}; 93};
102 94
103struct opstack_op { 95struct opstack_op {
@@ -389,63 +381,6 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
389 return pred; 381 return pred;
390} 382}
391 383
392enum walk_return {
393 WALK_PRED_ABORT,
394 WALK_PRED_PARENT,
395 WALK_PRED_DEFAULT,
396};
397
398typedef int (*filter_pred_walkcb_t) (enum move_type move,
399 struct filter_pred *pred,
400 int *err, void *data);
401
402static int walk_pred_tree(struct filter_pred *preds,
403 struct filter_pred *root,
404 filter_pred_walkcb_t cb, void *data)
405{
406 struct filter_pred *pred = root;
407 enum move_type move = MOVE_DOWN;
408 int done = 0;
409
410 if (!preds)
411 return -EINVAL;
412
413 do {
414 int err = 0, ret;
415
416 ret = cb(move, pred, &err, data);
417 if (ret == WALK_PRED_ABORT)
418 return err;
419 if (ret == WALK_PRED_PARENT)
420 goto get_parent;
421
422 switch (move) {
423 case MOVE_DOWN:
424 if (pred->left != FILTER_PRED_INVALID) {
425 pred = &preds[pred->left];
426 continue;
427 }
428 goto get_parent;
429 case MOVE_UP_FROM_LEFT:
430 pred = &preds[pred->right];
431 move = MOVE_DOWN;
432 continue;
433 case MOVE_UP_FROM_RIGHT:
434 get_parent:
435 if (pred == root)
436 break;
437 pred = get_pred_parent(pred, preds,
438 pred->parent,
439 &move);
440 continue;
441 }
442 done = 1;
443 } while (!done);
444
445 /* We are fine. */
446 return 0;
447}
448
449/* 384/*
450 * A series of AND or ORs where found together. Instead of 385 * A series of AND or ORs where found together. Instead of
451 * climbing up and down the tree branches, an array of the 386 * climbing up and down the tree branches, an array of the
@@ -475,91 +410,99 @@ static int process_ops(struct filter_pred *preds,
475 410
476 for (i = 0; i < op->val; i++) { 411 for (i = 0; i < op->val; i++) {
477 pred = &preds[op->ops[i]]; 412 pred = &preds[op->ops[i]];
478 if (!WARN_ON_ONCE(!pred->fn)) 413 match = pred->fn(pred, rec);
479 match = pred->fn(pred, rec);
480 if (!!match == type) 414 if (!!match == type)
481 return match; 415 return match;
482 } 416 }
483 return match; 417 return match;
484} 418}
485 419
486struct filter_match_preds_data {
487 struct filter_pred *preds;
488 int match;
489 void *rec;
490};
491
492static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
493 int *err, void *data)
494{
495 struct filter_match_preds_data *d = data;
496
497 *err = 0;
498 switch (move) {
499 case MOVE_DOWN:
500 /* only AND and OR have children */
501 if (pred->left != FILTER_PRED_INVALID) {
502 /* If ops is set, then it was folded. */
503 if (!pred->ops)
504 return WALK_PRED_DEFAULT;
505 /* We can treat folded ops as a leaf node */
506 d->match = process_ops(d->preds, pred, d->rec);
507 } else {
508 if (!WARN_ON_ONCE(!pred->fn))
509 d->match = pred->fn(pred, d->rec);
510 }
511
512 return WALK_PRED_PARENT;
513 case MOVE_UP_FROM_LEFT:
514 /*
515 * Check for short circuits.
516 *
517 * Optimization: !!match == (pred->op == OP_OR)
518 * is the same as:
519 * if ((match && pred->op == OP_OR) ||
520 * (!match && pred->op == OP_AND))
521 */
522 if (!!d->match == (pred->op == OP_OR))
523 return WALK_PRED_PARENT;
524 break;
525 case MOVE_UP_FROM_RIGHT:
526 break;
527 }
528
529 return WALK_PRED_DEFAULT;
530}
531
532/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
533int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
534{ 422{
423 int match = -1;
424 enum move_type move = MOVE_DOWN;
535 struct filter_pred *preds; 425 struct filter_pred *preds;
426 struct filter_pred *pred;
536 struct filter_pred *root; 427 struct filter_pred *root;
537 struct filter_match_preds_data data = { 428 int n_preds;
538 /* match is currently meaningless */ 429 int done = 0;
539 .match = -1,
540 .rec = rec,
541 };
542 int n_preds, ret;
543 430
544 /* no filter is considered a match */ 431 /* no filter is considered a match */
545 if (!filter) 432 if (!filter)
546 return 1; 433 return 1;
547 434
548 n_preds = filter->n_preds; 435 n_preds = filter->n_preds;
436
549 if (!n_preds) 437 if (!n_preds)
550 return 1; 438 return 1;
551 439
552 /* 440 /*
553 * n_preds, root and filter->preds are protect with preemption disabled. 441 * n_preds, root and filter->preds are protect with preemption disabled.
554 */ 442 */
443 preds = rcu_dereference_sched(filter->preds);
555 root = rcu_dereference_sched(filter->root); 444 root = rcu_dereference_sched(filter->root);
556 if (!root) 445 if (!root)
557 return 1; 446 return 1;
558 447
559 data.preds = preds = rcu_dereference_sched(filter->preds); 448 pred = root;
560 ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); 449
561 WARN_ON(ret); 450 /* match is currently meaningless */
562 return data.match; 451 match = -1;
452
453 do {
454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
500 continue;
501 }
502 done = 1;
503 } while (!done);
504
505 return match;
563} 506}
564EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
565 508
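Both versions of filter_match_preds() above use the same non-recursive walk: predicates carry parent/left/right indices into the preds[] array, and a small move state records whether the walk is descending or returning from a child, so no stack is needed in the tracing hot path. The following stripped-down sketch (hypothetical node layout and names, not the kernel's) shows the shape of that state machine:

/* Sketch only: iterative walk over an index-linked binary tree. */
enum move { DOWN, UP_FROM_LEFT, UP_FROM_RIGHT };

struct node {
        int left, right;        /* child indices, -1 for a leaf */
        int parent;             /* parent index */
        int is_right;           /* true if this node is its parent's right child */
};

static void walk(struct node *nodes, int root, void (*visit_leaf)(struct node *))
{
        enum move move = DOWN;
        int cur = root;

        for (;;) {
                struct node *n = &nodes[cur];

                switch (move) {
                case DOWN:
                        if (n->left >= 0) {     /* interior node: descend left first */
                                cur = n->left;
                                continue;
                        }
                        visit_leaf(n);          /* leaf: evaluate it, then climb */
                        break;
                case UP_FROM_LEFT:              /* left subtree done: descend right */
                        cur = n->right;
                        move = DOWN;
                        continue;
                case UP_FROM_RIGHT:             /* both subtrees done: climb */
                        break;
                }
                if (cur == root)                /* the whole tree has been covered */
                        return;
                move = n->is_right ? UP_FROM_RIGHT : UP_FROM_LEFT;
                cur = n->parent;
        }
}

In the kernel code the parent lookup and the left/right bookkeeping are hidden inside get_pred_parent(), and the leaf visit is pred->fn() (or process_ops() for a folded subtree).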
@@ -654,7 +597,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
654 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
655 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
656 else 599 else
657 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); 600 trace_seq_printf(s, "none\n");
658 mutex_unlock(&event_mutex); 601 mutex_unlock(&event_mutex);
659} 602}
660 603
@@ -685,9 +628,25 @@ find_event_field(struct ftrace_event_call *call, char *name)
685 return __find_event_field(head, name); 628 return __find_event_field(head, name);
686} 629}
687 630
631static void filter_free_pred(struct filter_pred *pred)
632{
633 if (!pred)
634 return;
635
636 kfree(pred->field_name);
637 kfree(pred);
638}
639
640static void filter_clear_pred(struct filter_pred *pred)
641{
642 kfree(pred->field_name);
643 pred->field_name = NULL;
644 pred->regex.len = 0;
645}
646
688static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
689{ 648{
690 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); 649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
691 if (!stack->preds) 650 if (!stack->preds)
692 return -ENOMEM; 651 return -ENOMEM;
693 stack->index = n_preds; 652 stack->index = n_preds;
@@ -730,13 +689,20 @@ __pop_pred_stack(struct pred_stack *stack)
730static int filter_set_pred(struct event_filter *filter, 689static int filter_set_pred(struct event_filter *filter,
731 int idx, 690 int idx,
732 struct pred_stack *stack, 691 struct pred_stack *stack,
733 struct filter_pred *src) 692 struct filter_pred *src,
693 filter_pred_fn_t fn)
734{ 694{
735 struct filter_pred *dest = &filter->preds[idx]; 695 struct filter_pred *dest = &filter->preds[idx];
736 struct filter_pred *left; 696 struct filter_pred *left;
737 struct filter_pred *right; 697 struct filter_pred *right;
738 698
739 *dest = *src; 699 *dest = *src;
700 if (src->field_name) {
701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
702 if (!dest->field_name)
703 return -ENOMEM;
704 }
705 dest->fn = fn;
740 dest->index = idx; 706 dest->index = idx;
741 707
742 if (dest->op == OP_OR || dest->op == OP_AND) { 708 if (dest->op == OP_OR || dest->op == OP_AND) {
@@ -777,7 +743,11 @@ static int filter_set_pred(struct event_filter *filter,
777 743
778static void __free_preds(struct event_filter *filter) 744static void __free_preds(struct event_filter *filter)
779{ 745{
746 int i;
747
780 if (filter->preds) { 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
781 kfree(filter->preds); 751 kfree(filter->preds);
782 filter->preds = NULL; 752 filter->preds = NULL;
783 } 753 }
@@ -828,7 +798,8 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
828 if (filter->preds) 798 if (filter->preds)
829 __free_preds(filter); 799 __free_preds(filter);
830 800
831 filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
832 803
833 if (!filter->preds) 804 if (!filter->preds)
834 return -ENOMEM; 805 return -ENOMEM;
@@ -869,19 +840,23 @@ static void filter_free_subsystem_filters(struct event_subsystem *system)
869 } 840 }
870} 841}
871 842
872static int filter_add_pred(struct filter_parse_state *ps, 843static int filter_add_pred_fn(struct filter_parse_state *ps,
873 struct event_filter *filter, 844 struct ftrace_event_call *call,
874 struct filter_pred *pred, 845 struct event_filter *filter,
875 struct pred_stack *stack) 846 struct filter_pred *pred,
847 struct pred_stack *stack,
848 filter_pred_fn_t fn)
876{ 849{
877 int err; 850 int idx, err;
878 851
879 if (WARN_ON(filter->n_preds == filter->a_preds)) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
880 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
881 return -ENOSPC; 854 return -ENOSPC;
882 } 855 }
883 856
884 err = filter_set_pred(filter, filter->n_preds, stack, pred); 857 idx = filter->n_preds;
858 filter_clear_pred(&filter->preds[idx]);
859 err = filter_set_pred(filter, idx, stack, pred, fn);
885 if (err) 860 if (err)
886 return err; 861 return err;
887 862
@@ -901,11 +876,6 @@ int filter_assign_type(const char *type)
901 return FILTER_OTHER; 876 return FILTER_OTHER;
902} 877}
903 878
904static bool is_function_field(struct ftrace_event_field *field)
905{
906 return field->filter_type == FILTER_TRACE_FN;
907}
908
909static bool is_string_field(struct ftrace_event_field *field) 879static bool is_string_field(struct ftrace_event_field *field)
910{ 880{
911 return field->filter_type == FILTER_DYN_STRING || 881 return field->filter_type == FILTER_DYN_STRING ||
@@ -967,15 +937,31 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
967 return fn; 937 return fn;
968} 938}
969 939
970static int init_pred(struct filter_parse_state *ps, 940static int filter_add_pred(struct filter_parse_state *ps,
971 struct ftrace_event_field *field, 941 struct ftrace_event_call *call,
972 struct filter_pred *pred) 942 struct event_filter *filter,
973 943 struct filter_pred *pred,
944 struct pred_stack *stack,
945 bool dry_run)
974{ 946{
975 filter_pred_fn_t fn = filter_pred_none; 947 struct ftrace_event_field *field;
948 filter_pred_fn_t fn;
976 unsigned long long val; 949 unsigned long long val;
977 int ret; 950 int ret;
978 951
952 fn = pred->fn = filter_pred_none;
953
954 if (pred->op == OP_AND)
955 goto add_pred_fn;
956 else if (pred->op == OP_OR)
957 goto add_pred_fn;
958
959 field = find_event_field(call, pred->field_name);
960 if (!field) {
961 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
962 return -EINVAL;
963 }
964
979 pred->offset = field->offset; 965 pred->offset = field->offset;
980 966
981 if (!is_legal_op(field, pred->op)) { 967 if (!is_legal_op(field, pred->op)) {
@@ -993,16 +979,11 @@ static int init_pred(struct filter_parse_state *ps,
993 fn = filter_pred_strloc; 979 fn = filter_pred_strloc;
994 else 980 else
995 fn = filter_pred_pchar; 981 fn = filter_pred_pchar;
996 } else if (is_function_field(field)) {
997 if (strcmp(field->name, "ip")) {
998 parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
999 return -EINVAL;
1000 }
1001 } else { 982 } else {
1002 if (field->is_signed) 983 if (field->is_signed)
1003 ret = kstrtoll(pred->regex.pattern, 0, &val); 984 ret = strict_strtoll(pred->regex.pattern, 0, &val);
1004 else 985 else
1005 ret = kstrtoull(pred->regex.pattern, 0, &val); 986 ret = strict_strtoull(pred->regex.pattern, 0, &val);
1006 if (ret) { 987 if (ret) {
1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 988 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
1008 return -EINVAL; 989 return -EINVAL;
@@ -1020,7 +1001,9 @@ static int init_pred(struct filter_parse_state *ps,
1020 if (pred->op == OP_NE) 1001 if (pred->op == OP_NE)
1021 pred->not = 1; 1002 pred->not = 1;
1022 1003
1023 pred->fn = fn; 1004add_pred_fn:
1005 if (!dry_run)
1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
1024 return 0; 1007 return 0;
1025} 1008}
1026 1009
@@ -1319,34 +1302,39 @@ parse_operand:
1319 return 0; 1302 return 0;
1320} 1303}
1321 1304
1322static struct filter_pred *create_pred(struct filter_parse_state *ps, 1305static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
1323 struct ftrace_event_call *call,
1324 int op, char *operand1, char *operand2)
1325{ 1306{
1326 struct ftrace_event_field *field; 1307 struct filter_pred *pred;
1327 static struct filter_pred pred;
1328
1329 memset(&pred, 0, sizeof(pred));
1330 pred.op = op;
1331 1308
1332 if (op == OP_AND || op == OP_OR) 1309 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
1333 return &pred; 1310 if (!pred)
1311 return NULL;
1334 1312
1335 if (!operand1 || !operand2) { 1313 pred->field_name = kstrdup(operand1, GFP_KERNEL);
1336 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1314 if (!pred->field_name) {
1315 kfree(pred);
1337 return NULL; 1316 return NULL;
1338 } 1317 }
1339 1318
1340 field = find_event_field(call, operand1); 1319 strcpy(pred->regex.pattern, operand2);
1341 if (!field) { 1320 pred->regex.len = strlen(pred->regex.pattern);
1342 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); 1321
1322 pred->op = op;
1323
1324 return pred;
1325}
1326
1327static struct filter_pred *create_logical_pred(int op)
1328{
1329 struct filter_pred *pred;
1330
1331 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
1332 if (!pred)
1343 return NULL; 1333 return NULL;
1344 }
1345 1334
1346 strcpy(pred.regex.pattern, operand2); 1335 pred->op = op;
1347 pred.regex.len = strlen(pred.regex.pattern); 1336
1348 pred.field = field; 1337 return pred;
1349 return init_pred(ps, field, &pred) ? NULL : &pred;
1350} 1338}
1351 1339
1352static int check_preds(struct filter_parse_state *ps) 1340static int check_preds(struct filter_parse_state *ps)
@@ -1387,23 +1375,6 @@ static int count_preds(struct filter_parse_state *ps)
1387 return n_preds; 1375 return n_preds;
1388} 1376}
1389 1377
1390struct check_pred_data {
1391 int count;
1392 int max;
1393};
1394
1395static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1396 int *err, void *data)
1397{
1398 struct check_pred_data *d = data;
1399
1400 if (WARN_ON(d->count++ > d->max)) {
1401 *err = -EINVAL;
1402 return WALK_PRED_ABORT;
1403 }
1404 return WALK_PRED_DEFAULT;
1405}
1406
1407/* 1378/*
1408 * The tree is walked at filtering of an event. If the tree is not correctly 1379 * The tree is walked at filtering of an event. If the tree is not correctly
1409 * built, it may cause an infinite loop. Check here that the tree does 1380 * built, it may cause an infinite loop. Check here that the tree does
@@ -1412,76 +1383,107 @@ static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1412static int check_pred_tree(struct event_filter *filter, 1383static int check_pred_tree(struct event_filter *filter,
1413 struct filter_pred *root) 1384 struct filter_pred *root)
1414{ 1385{
1415 struct check_pred_data data = { 1386 struct filter_pred *preds;
1416 /* 1387 struct filter_pred *pred;
1417 * The max that we can hit a node is three times. 1388 enum move_type move = MOVE_DOWN;
1418 * Once going down, once coming up from left, and 1389 int count = 0;
1419 * once coming up from right. This is more than enough 1390 int done = 0;
1420 * since leafs are only hit a single time. 1391 int max;
1421 */
1422 .max = 3 * filter->n_preds,
1423 .count = 0,
1424 };
1425
1426 return walk_pred_tree(filter->preds, root,
1427 check_pred_tree_cb, &data);
1428}
1429 1392
1430static int count_leafs_cb(enum move_type move, struct filter_pred *pred, 1393 /*
1431 int *err, void *data) 1394 * The max that we can hit a node is three times.
1432{ 1395 * Once going down, once coming up from left, and
1433 int *count = data; 1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1434 1400
1435 if ((move == MOVE_DOWN) && 1401 preds = filter->preds;
1436 (pred->left == FILTER_PRED_INVALID)) 1402 if (!preds)
1437 (*count)++; 1403 return -EINVAL;
1404 pred = root;
1438 1405
1439 return WALK_PRED_DEFAULT; 1406 do {
1440} 1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1441 1409
1442static int count_leafs(struct filter_pred *preds, struct filter_pred *root) 1410 switch (move) {
1443{ 1411 case MOVE_DOWN:
1444 int count = 0, ret; 1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1445 1435
1446 ret = walk_pred_tree(preds, root, count_leafs_cb, &count); 1436 /* We are fine. */
1447 WARN_ON(ret); 1437 return 0;
1448 return count;
1449} 1438}
1450 1439
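The bound that check_pred_tree() relies on above follows directly from that walk: an interior node is entered at most three times (once going down, once coming back from its left child, once from its right), while a leaf is entered exactly once, so a well-formed tree of n_preds nodes can be entered at most 3 * n_preds times. As a worked example, a filter with 7 predicates arranged as 3 operators over 4 leaves is entered at most 3 * 3 + 4 = 13 times, well under the 3 * 7 = 21 limit, so tripping the WARN_ON() indicates a malformed (cyclic) tree rather than merely a large filter.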
1451struct fold_pred_data { 1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1452 struct filter_pred *root;
1453 int count;
1454 int children;
1455};
1456
1457static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
1458 int *err, void *data)
1459{ 1441{
1460 struct fold_pred_data *d = data; 1442 struct filter_pred *pred;
1461 struct filter_pred *root = d->root; 1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1462 1446
1463 if (move != MOVE_DOWN) 1447 pred = root;
1464 return WALK_PRED_DEFAULT;
1465 if (pred->left != FILTER_PRED_INVALID)
1466 return WALK_PRED_DEFAULT;
1467 1448
1468 if (WARN_ON(d->count == d->children)) { 1449 do {
1469 *err = -EINVAL; 1450 switch (move) {
1470 return WALK_PRED_ABORT; 1451 case MOVE_DOWN:
1471 } 1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1472 1476
1473 pred->index &= ~FILTER_PRED_FOLD; 1477 return count;
1474 root->ops[d->count++] = pred->index;
1475 return WALK_PRED_DEFAULT;
1476} 1478}
1477 1479
1478static int fold_pred(struct filter_pred *preds, struct filter_pred *root) 1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1479{ 1481{
1480 struct fold_pred_data data = { 1482 struct filter_pred *pred;
1481 .root = root, 1483 enum move_type move = MOVE_DOWN;
1482 .count = 0, 1484 int count = 0;
1483 };
1484 int children; 1485 int children;
1486 int done = 0;
1485 1487
1486 /* No need to keep the fold flag */ 1488 /* No need to keep the fold flag */
1487 root->index &= ~FILTER_PRED_FOLD; 1489 root->index &= ~FILTER_PRED_FOLD;
@@ -1494,31 +1496,42 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1494 children = count_leafs(preds, &preds[root->left]); 1496 children = count_leafs(preds, &preds[root->left]);
1495 children += count_leafs(preds, &preds[root->right]); 1497 children += count_leafs(preds, &preds[root->right]);
1496 1498
1497 root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); 1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1498 if (!root->ops) 1500 if (!root->ops)
1499 return -ENOMEM; 1501 return -ENOMEM;
1500 1502
1501 root->val = children; 1503 root->val = children;
1502 data.children = children;
1503 return walk_pred_tree(preds, root, fold_pred_cb, &data);
1504}
1505
1506static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1507 int *err, void *data)
1508{
1509 struct filter_pred *preds = data;
1510 1504
1511 if (move != MOVE_DOWN) 1505 pred = root;
1512 return WALK_PRED_DEFAULT; 1506 do {
1513 if (!(pred->index & FILTER_PRED_FOLD)) 1507 switch (move) {
1514 return WALK_PRED_DEFAULT; 1508 case MOVE_DOWN:
1515 1509 if (pred->left != FILTER_PRED_INVALID) {
1516 *err = fold_pred(preds, pred); 1510 pred = &preds[pred->left];
1517 if (*err) 1511 continue;
1518 return WALK_PRED_ABORT; 1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1519 1533
1520 /* everything below is folded, continue with parent */ 1534 return 0;
1521 return WALK_PRED_PARENT;
1522} 1535}
1523 1536
1524/* 1537/*
@@ -1529,8 +1542,51 @@ static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1529static int fold_pred_tree(struct event_filter *filter, 1542static int fold_pred_tree(struct event_filter *filter,
1530 struct filter_pred *root) 1543 struct filter_pred *root)
1531{ 1544{
1532 return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, 1545 struct filter_pred *preds;
1533 filter->preds); 1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1534} 1590}
1535 1591
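fold_pred() and fold_pred_tree() above compress a subtree whose leaves all hang off the same logical operator into that operator's ops[] index array (with the leaf count kept in ->val), so the filtering hot path can scan the leaves linearly instead of re-walking the subtree. process_ops(), which consumes that array, is not part of this hunk; the sketch below reuses the struct filter_pred fields seen above but is a simplified illustration rather than the kernel's implementation:

/* Sketch only: evaluate a folded OR/AND node by scanning its collected leaves. */
static int eval_folded(struct filter_pred *preds, struct filter_pred *op, void *rec)
{
        int type = (op->op == OP_OR);   /* the value that lets us stop early */
        int match = 0;
        int i;

        for (i = 0; i < op->val; i++) { /* op->val holds the number of leaves */
                struct filter_pred *pred = &preds[op->ops[i]];

                match = pred->fn(pred, rec);
                if (!!match == type)    /* same short-circuit rule as the walk */
                        break;
        }
        return match;
}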
1536static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
@@ -1587,17 +1643,27 @@ static int replace_preds(struct ftrace_event_call *call,
1587 goto fail; 1643 goto fail;
1588 } 1644 }
1589 1645
1590 pred = create_pred(ps, call, elt->op, operand1, operand2); 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
1591 if (!pred) { 1647 pred = create_logical_pred(elt->op);
1648 goto add_pred;
1649 }
1650
1651 if (!operand1 || !operand2) {
1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1592 err = -EINVAL; 1653 err = -EINVAL;
1593 goto fail; 1654 goto fail;
1594 } 1655 }
1595 1656
1596 if (!dry_run) { 1657 pred = create_pred(elt->op, operand1, operand2);
1597 err = filter_add_pred(ps, filter, pred, &stack); 1658add_pred:
1598 if (err) 1659 if (!pred) {
1599 goto fail; 1660 err = -ENOMEM;
1661 goto fail;
1600 } 1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1664 filter_free_pred(pred);
1665 if (err)
1666 goto fail;
1601 1667
1602 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1603 } 1669 }
@@ -1663,9 +1729,7 @@ static int replace_system_preds(struct event_subsystem *system,
1663 */ 1729 */
1664 err = replace_preds(call, NULL, ps, filter_string, true); 1730 err = replace_preds(call, NULL, ps, filter_string, true);
1665 if (err) 1731 if (err)
1666 call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; 1732 goto fail;
1667 else
1668 call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
1669 } 1733 }
1670 1734
1671 list_for_each_entry(call, &ftrace_events, list) { 1735 list_for_each_entry(call, &ftrace_events, list) {
@@ -1674,9 +1738,6 @@ static int replace_system_preds(struct event_subsystem *system,
1674 if (strcmp(call->class->system, system->name) != 0) 1738 if (strcmp(call->class->system, system->name) != 0)
1675 continue; 1739 continue;
1676 1740
1677 if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)
1678 continue;
1679
1680 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1681 if (!filter_item) 1742 if (!filter_item)
1682 goto fail_mem; 1743 goto fail_mem;
@@ -1746,121 +1807,11 @@ static int replace_system_preds(struct event_subsystem *system,
1746 return -ENOMEM; 1807 return -ENOMEM;
1747} 1808}
1748 1809
1749static int create_filter_start(char *filter_str, bool set_str,
1750 struct filter_parse_state **psp,
1751 struct event_filter **filterp)
1752{
1753 struct event_filter *filter;
1754 struct filter_parse_state *ps = NULL;
1755 int err = 0;
1756
1757 WARN_ON_ONCE(*psp || *filterp);
1758
1759 /* allocate everything, and if any fails, free all and fail */
1760 filter = __alloc_filter();
1761 if (filter && set_str)
1762 err = replace_filter_string(filter, filter_str);
1763
1764 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1765
1766 if (!filter || !ps || err) {
1767 kfree(ps);
1768 __free_filter(filter);
1769 return -ENOMEM;
1770 }
1771
1772 /* we're committed to creating a new filter */
1773 *filterp = filter;
1774 *psp = ps;
1775
1776 parse_init(ps, filter_ops, filter_str);
1777 err = filter_parse(ps);
1778 if (err && set_str)
1779 append_filter_err(ps, filter);
1780 return err;
1781}
1782
1783static void create_filter_finish(struct filter_parse_state *ps)
1784{
1785 if (ps) {
1786 filter_opstack_clear(ps);
1787 postfix_clear(ps);
1788 kfree(ps);
1789 }
1790}
1791
1792/**
1793 * create_filter - create a filter for a ftrace_event_call
1794 * @call: ftrace_event_call to create a filter for
1795 * @filter_str: filter string
1796 * @set_str: remember @filter_str and enable detailed error in filter
1797 * @filterp: out param for created filter (always updated on return)
1798 *
1799 * Creates a filter for @call with @filter_str. If @set_str is %true,
1800 * @filter_str is copied and recorded in the new filter.
1801 *
1802 * On success, returns 0 and *@filterp points to the new filter. On
1803 * failure, returns -errno and *@filterp may point to %NULL or to a new
1804 * filter. In the latter case, the returned filter contains error
1805 * information if @set_str is %true and the caller is responsible for
1806 * freeing it.
1807 */
1808static int create_filter(struct ftrace_event_call *call,
1809 char *filter_str, bool set_str,
1810 struct event_filter **filterp)
1811{
1812 struct event_filter *filter = NULL;
1813 struct filter_parse_state *ps = NULL;
1814 int err;
1815
1816 err = create_filter_start(filter_str, set_str, &ps, &filter);
1817 if (!err) {
1818 err = replace_preds(call, filter, ps, filter_str, false);
1819 if (err && set_str)
1820 append_filter_err(ps, filter);
1821 }
1822 create_filter_finish(ps);
1823
1824 *filterp = filter;
1825 return err;
1826}
1827
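The kernel-doc above spells out the create_filter() contract: *filterp is always written back, and on failure it may still point to a filter carrying the parse error text, which the caller must free. A minimal usage sketch (the caller name is invented; locking and the RCU hand-off shown elsewhere in this file are omitted):

/* Sketch only: the expected calling pattern for create_filter(). */
static int example_set_filter(struct ftrace_event_call *call, char *str)
{
        struct event_filter *filter = NULL;
        int err;

        err = create_filter(call, str, true, &filter);
        if (err) {
                /* on error, filter may still hold the error message */
                if (filter)
                        __free_filter(filter);
                return err;
        }

        rcu_assign_pointer(call->filter, filter);
        return 0;
}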
1828/**
1829 * create_system_filter - create a filter for an event_subsystem
1830 * @system: event_subsystem to create a filter for
1831 * @filter_str: filter string
1832 * @filterp: out param for created filter (always updated on return)
1833 *
1834 * Identical to create_filter() except that it creates a subsystem filter
1835 * and always remembers @filter_str.
1836 */
1837static int create_system_filter(struct event_subsystem *system,
1838 char *filter_str, struct event_filter **filterp)
1839{
1840 struct event_filter *filter = NULL;
1841 struct filter_parse_state *ps = NULL;
1842 int err;
1843
1844 err = create_filter_start(filter_str, true, &ps, &filter);
1845 if (!err) {
1846 err = replace_system_preds(system, ps, filter_str);
1847 if (!err) {
1848 /* System filters just show a default message */
1849 kfree(filter->filter_string);
1850 filter->filter_string = NULL;
1851 } else {
1852 append_filter_err(ps, filter);
1853 }
1854 }
1855 create_filter_finish(ps);
1856
1857 *filterp = filter;
1858 return err;
1859}
1860
1861int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1862{ 1811{
1812 struct filter_parse_state *ps;
1863 struct event_filter *filter; 1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1864 int err = 0; 1815 int err = 0;
1865 1816
1866 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
@@ -1877,30 +1828,49 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1877 goto out_unlock; 1828 goto out_unlock;
1878 } 1829 }
1879 1830
1880 err = create_filter(call, filter_string, true, &filter); 1831 err = -ENOMEM;
1832 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1833 if (!ps)
1834 goto out_unlock;
1835
1836 filter = __alloc_filter();
1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1881 1843
1844 parse_init(ps, filter_ops, filter_string);
1845 err = filter_parse(ps);
1846 if (err) {
1847 append_filter_err(ps, filter);
1848 goto out;
1849 }
1850
1851 err = replace_preds(call, filter, ps, filter_string, false);
1852 if (err) {
1853 filter_disable(call);
1854 append_filter_err(ps, filter);
1855 } else
1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1857out:
1882 /* 1858 /*
1883 * Always swap the call filter with the new filter 1859 * Always swap the call filter with the new filter
1884 * even if there was an error. If there was an error 1860 * even if there was an error. If there was an error
1885 * in the filter, we disable the filter and show the error 1861 * in the filter, we disable the filter and show the error
1886 * string 1862 * string
1887 */ 1863 */
1888 if (filter) { 1864 tmp = call->filter;
1889 struct event_filter *tmp = call->filter; 1865 rcu_assign_pointer(call->filter, filter);
1890 1866 if (tmp) {
1891 if (!err) 1867 /* Make sure the call is done with the filter */
1892 call->flags |= TRACE_EVENT_FL_FILTERED; 1868 synchronize_sched();
1893 else 1869 __free_filter(tmp);
1894 filter_disable(call);
1895
1896 rcu_assign_pointer(call->filter, filter);
1897
1898 if (tmp) {
1899 /* Make sure the call is done with the filter */
1900 synchronize_sched();
1901 __free_filter(tmp);
1902 }
1903 } 1870 }
1871 filter_opstack_clear(ps);
1872 postfix_clear(ps);
1873 kfree(ps);
1904out_unlock: 1874out_unlock:
1905 mutex_unlock(&event_mutex); 1875 mutex_unlock(&event_mutex);
1906 1876
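Both the old and the new bodies of apply_event_filter() above finish with the same publish-then-retire sequence: the replacement filter is installed with rcu_assign_pointer(), and the previous one is freed only after synchronize_sched() guarantees that no tracing path (which samples call->filter with preemption disabled) can still be using it. Stripped of the error handling, the pattern looks like this (the helper name is invented for illustration):

/* Sketch only: RCU publish/retire when swapping an event's filter. */
static void swap_filter(struct ftrace_event_call *call, struct event_filter *new_filter)
{
        struct event_filter *old = call->filter;

        rcu_assign_pointer(call->filter, new_filter);   /* publish the new filter */

        if (old) {
                synchronize_sched();    /* wait until no reader can see 'old' */
                __free_filter(old);     /* now it is safe to free */
        }
}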
@@ -1910,6 +1880,7 @@ out_unlock:
1910int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1911 char *filter_string) 1881 char *filter_string)
1912{ 1882{
1883 struct filter_parse_state *ps;
1913 struct event_filter *filter; 1884 struct event_filter *filter;
1914 int err = 0; 1885 int err = 0;
1915 1886
@@ -1933,15 +1904,38 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1933 goto out_unlock; 1904 goto out_unlock;
1934 } 1905 }
1935 1906
1936 err = create_system_filter(system, filter_string, &filter); 1907 err = -ENOMEM;
1937 if (filter) { 1908 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1938 /* 1909 if (!ps)
1939 * No event actually uses the system filter 1910 goto out_unlock;
1940 * we can free it without synchronize_sched(). 1911
1941 */ 1912 filter = __alloc_filter();
1942 __free_filter(system->filter); 1913 if (!filter)
1943 system->filter = filter; 1914 goto out;
1915
1916 replace_filter_string(filter, filter_string);
1917 /*
1918 * No event actually uses the system filter
1919 * we can free it without synchronize_sched().
1920 */
1921 __free_filter(system->filter);
1922 system->filter = filter;
1923
1924 parse_init(ps, filter_ops, filter_string);
1925 err = filter_parse(ps);
1926 if (err) {
1927 append_filter_err(ps, system->filter);
1928 goto out;
1944 } 1929 }
1930
1931 err = replace_system_preds(system, ps, filter_string);
1932 if (err)
1933 append_filter_err(ps, system->filter);
1934
1935out:
1936 filter_opstack_clear(ps);
1937 postfix_clear(ps);
1938 kfree(ps);
1945out_unlock: 1939out_unlock:
1946 mutex_unlock(&event_mutex); 1940 mutex_unlock(&event_mutex);
1947 1941
@@ -1958,178 +1952,56 @@ void ftrace_profile_free_filter(struct perf_event *event)
1958 __free_filter(filter); 1952 __free_filter(filter);
1959} 1953}
1960 1954
1961struct function_filter_data {
1962 struct ftrace_ops *ops;
1963 int first_filter;
1964 int first_notrace;
1965};
1966
1967#ifdef CONFIG_FUNCTION_TRACER
1968static char **
1969ftrace_function_filter_re(char *buf, int len, int *count)
1970{
1971 char *str, *sep, **re;
1972
1973 str = kstrndup(buf, len, GFP_KERNEL);
1974 if (!str)
1975 return NULL;
1976
1977 /*
1978 * The argv_split function takes white space
1979 * as a separator, so convert ',' into spaces.
1980 */
1981 while ((sep = strchr(str, ',')))
1982 *sep = ' ';
1983
1984 re = argv_split(GFP_KERNEL, str, count);
1985 kfree(str);
1986 return re;
1987}
1988
1989static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter,
1990 int reset, char *re, int len)
1991{
1992 int ret;
1993
1994 if (filter)
1995 ret = ftrace_set_filter(ops, re, len, reset);
1996 else
1997 ret = ftrace_set_notrace(ops, re, len, reset);
1998
1999 return ret;
2000}
2001
2002static int __ftrace_function_set_filter(int filter, char *buf, int len,
2003 struct function_filter_data *data)
2004{
2005 int i, re_cnt, ret = -EINVAL;
2006 int *reset;
2007 char **re;
2008
2009 reset = filter ? &data->first_filter : &data->first_notrace;
2010
2011 /*
2012 * The 'ip' field could have multiple filters set, separated
2013 * either by space or comma. We first cut the filter and apply
2014 * all pieces separately.
2015 */
2016 re = ftrace_function_filter_re(buf, len, &re_cnt);
2017 if (!re)
2018 return -EINVAL;
2019
2020 for (i = 0; i < re_cnt; i++) {
2021 ret = ftrace_function_set_regexp(data->ops, filter, *reset,
2022 re[i], strlen(re[i]));
2023 if (ret)
2024 break;
2025
2026 if (*reset)
2027 *reset = 0;
2028 }
2029
2030 argv_free(re);
2031 return ret;
2032}
2033
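The removed helpers above lean on argv_split(): commas in the 'ip' filter value are first rewritten to spaces, the string is split into individual glob patterns, and each pattern is fed to ftrace_set_filter() or ftrace_set_notrace(), resetting the hash only for the first one. A condensed sketch of that flow (the helper name is invented; error handling is trimmed):

/* Sketch only: apply a comma/space-separated pattern list via argv_split(). */
static int apply_patterns(struct ftrace_ops *ops, char *buf)
{
        char *sep, **argv;
        int i, argc, ret = 0;

        while ((sep = strchr(buf, ',')))        /* argv_split() splits on whitespace */
                *sep = ' ';

        argv = argv_split(GFP_KERNEL, buf, &argc);
        if (!argv)
                return -ENOMEM;

        for (i = 0; i < argc && !ret; i++)
                ret = ftrace_set_filter(ops, argv[i], strlen(argv[i]), i == 0);

        argv_free(argv);
        return ret;
}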
2034static int ftrace_function_check_pred(struct filter_pred *pred, int leaf)
2035{
2036 struct ftrace_event_field *field = pred->field;
2037
2038 if (leaf) {
2039 /*
2040 * Check the leaf predicate for function trace, verify:
2041 * - only '==' and '!=' is used
2042 * - the 'ip' field is used
2043 */
2044 if ((pred->op != OP_EQ) && (pred->op != OP_NE))
2045 return -EINVAL;
2046
2047 if (strcmp(field->name, "ip"))
2048 return -EINVAL;
2049 } else {
2050 /*
2051 * Check the non leaf predicate for function trace, verify:
2052 * - only '||' is used
2053 */
2054 if (pred->op != OP_OR)
2055 return -EINVAL;
2056 }
2057
2058 return 0;
2059}
2060
2061static int ftrace_function_set_filter_cb(enum move_type move,
2062 struct filter_pred *pred,
2063 int *err, void *data)
2064{
2065 /* Checking the node is valid for function trace. */
2066 if ((move != MOVE_DOWN) ||
2067 (pred->left != FILTER_PRED_INVALID)) {
2068 *err = ftrace_function_check_pred(pred, 0);
2069 } else {
2070 *err = ftrace_function_check_pred(pred, 1);
2071 if (*err)
2072 return WALK_PRED_ABORT;
2073
2074 *err = __ftrace_function_set_filter(pred->op == OP_EQ,
2075 pred->regex.pattern,
2076 pred->regex.len,
2077 data);
2078 }
2079
2080 return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT;
2081}
2082
2083static int ftrace_function_set_filter(struct perf_event *event,
2084 struct event_filter *filter)
2085{
2086 struct function_filter_data data = {
2087 .first_filter = 1,
2088 .first_notrace = 1,
2089 .ops = &event->ftrace_ops,
2090 };
2091
2092 return walk_pred_tree(filter->preds, filter->root,
2093 ftrace_function_set_filter_cb, &data);
2094}
2095#else
2096static int ftrace_function_set_filter(struct perf_event *event,
2097 struct event_filter *filter)
2098{
2099 return -ENODEV;
2100}
2101#endif /* CONFIG_FUNCTION_TRACER */
2102
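Taken together, ftrace_function_check_pred() and ftrace_function_set_filter_cb() above restrict perf 'ip' filters for function events to OR-combinations of equality tests on the ip field: an expression such as ip == sys_read || ip != sys_write passes (with '==' mapped to ftrace_set_filter() and '!=' to ftrace_set_notrace()), while any '&&', or a comparison on another field, is rejected with -EINVAL.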
2103int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1955int ftrace_profile_set_filter(struct perf_event *event, int event_id,
2104 char *filter_str) 1956 char *filter_str)
2105{ 1957{
2106 int err; 1958 int err;
2107 struct event_filter *filter; 1959 struct event_filter *filter;
2108 struct ftrace_event_call *call; 1960 struct filter_parse_state *ps;
1961 struct ftrace_event_call *call = NULL;
2109 1962
2110 mutex_lock(&event_mutex); 1963 mutex_lock(&event_mutex);
2111 1964
2112 call = event->tp_event; 1965 list_for_each_entry(call, &ftrace_events, list) {
1966 if (call->event.type == event_id)
1967 break;
1968 }
2113 1969
2114 err = -EINVAL; 1970 err = -EINVAL;
2115 if (!call) 1971 if (&call->list == &ftrace_events)
2116 goto out_unlock; 1972 goto out_unlock;
2117 1973
2118 err = -EEXIST; 1974 err = -EEXIST;
2119 if (event->filter) 1975 if (event->filter)
2120 goto out_unlock; 1976 goto out_unlock;
2121 1977
2122 err = create_filter(call, filter_str, false, &filter); 1978 filter = __alloc_filter();
2123 if (err) 1979 if (!filter) {
 1980 err = -ENOMEM;
1981 goto out_unlock;
1982 }
1983
1984 err = -ENOMEM;
1985 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1986 if (!ps)
2124 goto free_filter; 1987 goto free_filter;
2125 1988
2126 if (ftrace_event_is_function(call)) 1989 parse_init(ps, filter_ops, filter_str);
2127 err = ftrace_function_set_filter(event, filter); 1990 err = filter_parse(ps);
2128 else 1991 if (err)
1992 goto free_ps;
1993
1994 err = replace_preds(call, filter, ps, filter_str, false);
1995 if (!err)
2129 event->filter = filter; 1996 event->filter = filter;
2130 1997
1998free_ps:
1999 filter_opstack_clear(ps);
2000 postfix_clear(ps);
2001 kfree(ps);
2002
2131free_filter: 2003free_filter:
2132 if (err || ftrace_event_is_function(call)) 2004 if (err)
2133 __free_filter(filter); 2005 __free_filter(filter);
2134 2006
2135out_unlock: 2007out_unlock:
@@ -2140,179 +2012,3 @@ out_unlock:
2140 2012
2141#endif /* CONFIG_PERF_EVENTS */ 2013#endif /* CONFIG_PERF_EVENTS */
2142 2014
2143#ifdef CONFIG_FTRACE_STARTUP_TEST
2144
2145#include <linux/types.h>
2146#include <linux/tracepoint.h>
2147
2148#define CREATE_TRACE_POINTS
2149#include "trace_events_filter_test.h"
2150
2151#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
2152{ \
2153 .filter = FILTER, \
2154 .rec = { .a = va, .b = vb, .c = vc, .d = vd, \
2155 .e = ve, .f = vf, .g = vg, .h = vh }, \
2156 .match = m, \
2157 .not_visited = nvisit, \
2158}
2159#define YES 1
2160#define NO 0
2161
2162static struct test_filter_data_t {
2163 char *filter;
2164 struct ftrace_raw_ftrace_test_filter rec;
2165 int match;
2166 char *not_visited;
2167} test_filter_data[] = {
2168#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \
2169 "e == 1 && f == 1 && g == 1 && h == 1"
2170 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""),
2171 DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"),
2172 DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""),
2173#undef FILTER
2174#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \
2175 "e == 1 || f == 1 || g == 1 || h == 1"
2176 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2177 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2178 DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"),
2179#undef FILTER
2180#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \
2181 "(e == 1 || f == 1) && (g == 1 || h == 1)"
2182 DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"),
2183 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2184 DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"),
2185 DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"),
2186#undef FILTER
2187#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \
2188 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2189 DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"),
2190 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""),
2191 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2192#undef FILTER
2193#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \
2194 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2195 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"),
2196 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2197 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""),
2198#undef FILTER
2199#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \
2200 "(e == 1 || f == 1)) && (g == 1 || h == 1)"
2201 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"),
2202 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2203 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"),
2204#undef FILTER
2205#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \
2206 "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))"
2207 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"),
2208 DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2209 DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""),
2210#undef FILTER
2211#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \
2212 "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))"
2213 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"),
2214 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2215 DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"),
2216};
2217
2218#undef DATA_REC
2219#undef FILTER
2220#undef YES
2221#undef NO
2222
2223#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
2224
2225static int test_pred_visited;
2226
2227static int test_pred_visited_fn(struct filter_pred *pred, void *event)
2228{
2229 struct ftrace_event_field *field = pred->field;
2230
2231 test_pred_visited = 1;
2232 printk(KERN_INFO "\npred visited %s\n", field->name);
2233 return 1;
2234}
2235
2236static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
2237 int *err, void *data)
2238{
2239 char *fields = data;
2240
2241 if ((move == MOVE_DOWN) &&
2242 (pred->left == FILTER_PRED_INVALID)) {
2243 struct ftrace_event_field *field = pred->field;
2244
2245 if (!field) {
2246 WARN(1, "all leafs should have field defined");
2247 return WALK_PRED_DEFAULT;
2248 }
2249 if (!strchr(fields, *field->name))
2250 return WALK_PRED_DEFAULT;
2251
2252 WARN_ON(!pred->fn);
2253 pred->fn = test_pred_visited_fn;
2254 }
2255 return WALK_PRED_DEFAULT;
2256}
2257
2258static __init int ftrace_test_event_filter(void)
2259{
2260 int i;
2261
2262 printk(KERN_INFO "Testing ftrace filter: ");
2263
2264 for (i = 0; i < DATA_CNT; i++) {
2265 struct event_filter *filter = NULL;
2266 struct test_filter_data_t *d = &test_filter_data[i];
2267 int err;
2268
2269 err = create_filter(&event_ftrace_test_filter, d->filter,
2270 false, &filter);
2271 if (err) {
2272 printk(KERN_INFO
2273 "Failed to get filter for '%s', err %d\n",
2274 d->filter, err);
2275 __free_filter(filter);
2276 break;
2277 }
2278
2279 /*
2280 * The preemption disabling is not really needed for self
2281 * tests, but the rcu dereference will complain without it.
2282 */
2283 preempt_disable();
2284 if (*d->not_visited)
2285 walk_pred_tree(filter->preds, filter->root,
2286 test_walk_pred_cb,
2287 d->not_visited);
2288
2289 test_pred_visited = 0;
2290 err = filter_match_preds(filter, &d->rec);
2291 preempt_enable();
2292
2293 __free_filter(filter);
2294
2295 if (test_pred_visited) {
2296 printk(KERN_INFO
2297 "Failed, unwanted pred visited for filter %s\n",
2298 d->filter);
2299 break;
2300 }
2301
2302 if (err != d->match) {
2303 printk(KERN_INFO
2304 "Failed to match filter '%s', expected %d\n",
2305 d->filter, d->match);
2306 break;
2307 }
2308 }
2309
2310 if (i == DATA_CNT)
2311 printk(KERN_CONT "OK\n");
2312
2313 return 0;
2314}
2315
2316late_initcall(ftrace_test_event_filter);
2317
2318#endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
deleted file mode 100644
index bfd4dba0d60..00000000000
--- a/kernel/trace/trace_events_filter_test.h
+++ /dev/null
@@ -1,50 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM test
3
4#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_TEST_H
6
7#include <linux/tracepoint.h>
8
9TRACE_EVENT(ftrace_test_filter,
10
11 TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h),
12
13 TP_ARGS(a, b, c, d, e, f, g, h),
14
15 TP_STRUCT__entry(
16 __field(int, a)
17 __field(int, b)
18 __field(int, c)
19 __field(int, d)
20 __field(int, e)
21 __field(int, f)
22 __field(int, g)
23 __field(int, h)
24 ),
25
26 TP_fast_assign(
27 __entry->a = a;
28 __entry->b = b;
29 __entry->c = c;
30 __entry->d = d;
31 __entry->e = e;
32 __entry->f = f;
33 __entry->g = g;
34 __entry->h = h;
35 ),
36
37 TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d",
38 __entry->a, __entry->b, __entry->c, __entry->d,
39 __entry->e, __entry->f, __entry->g, __entry->h)
40);
41
42#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */
43
44#undef TRACE_INCLUDE_PATH
45#undef TRACE_INCLUDE_FILE
46#define TRACE_INCLUDE_PATH .
47#define TRACE_INCLUDE_FILE trace_events_filter_test
48
49/* This part must be outside protection */
50#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e039906b037..bbeec31e0ae 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -18,16 +18,6 @@
18#undef TRACE_SYSTEM 18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace 19#define TRACE_SYSTEM ftrace
20 20
21/*
22 * The FTRACE_ENTRY_REG macro allows ftrace entry to define register
23 * function and thus become accessible via perf.
24 */
25#undef FTRACE_ENTRY_REG
26#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
27 filter, regfn) \
28 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
29 filter)
30
31/* not needed for this file */ 21/* not needed for this file */
32#undef __field_struct 22#undef __field_struct
33#define __field_struct(type, item) 23#define __field_struct(type, item)
@@ -54,22 +44,21 @@
54#define F_printk(fmt, args...) fmt, args 44#define F_printk(fmt, args...) fmt, args
55 45
56#undef FTRACE_ENTRY 46#undef FTRACE_ENTRY
57#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ 47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
58struct ____ftrace_##name { \ 48struct ____ftrace_##name { \
59 tstruct \ 49 tstruct \
60}; \ 50}; \
61static void __always_unused ____ftrace_check_##name(void) \ 51static void __always_unused ____ftrace_check_##name(void) \
62{ \ 52{ \
63 struct ____ftrace_##name *__entry = NULL; \ 53 struct ____ftrace_##name *__entry = NULL; \
64 \ 54 \
65 /* force compile-time check on F_printk() */ \ 55 /* force compile-time check on F_printk() */ \
66 printk(print); \ 56 printk(print); \
67} 57}
68 58
69#undef FTRACE_ENTRY_DUP 59#undef FTRACE_ENTRY_DUP
70#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ 60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
71 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ 61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
72 filter)
73 62
74#include "trace_entries.h" 63#include "trace_entries.h"
75 64
@@ -78,7 +67,7 @@ static void __always_unused ____ftrace_check_##name(void) \
78 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
79 offsetof(typeof(field), item), \ 68 offsetof(typeof(field), item), \
80 sizeof(field.item), \ 69 sizeof(field.item), \
81 is_signed_type(type), filter_type); \ 70 is_signed_type(type), FILTER_OTHER); \
82 if (ret) \ 71 if (ret) \
83 return ret; 72 return ret;
84 73
@@ -88,7 +77,7 @@ static void __always_unused ____ftrace_check_##name(void) \
88 offsetof(typeof(field), \ 77 offsetof(typeof(field), \
89 container.item), \ 78 container.item), \
90 sizeof(field.container.item), \ 79 sizeof(field.container.item), \
91 is_signed_type(type), filter_type); \ 80 is_signed_type(type), FILTER_OTHER); \
92 if (ret) \ 81 if (ret) \
93 return ret; 82 return ret;
94 83
@@ -102,7 +91,7 @@ static void __always_unused ____ftrace_check_##name(void) \
102 ret = trace_define_field(event_call, event_storage, #item, \ 91 ret = trace_define_field(event_call, event_storage, #item, \
103 offsetof(typeof(field), item), \ 92 offsetof(typeof(field), item), \
104 sizeof(field.item), \ 93 sizeof(field.item), \
105 is_signed_type(type), filter_type); \ 94 is_signed_type(type), FILTER_OTHER); \
106 mutex_unlock(&event_storage_mutex); \ 95 mutex_unlock(&event_storage_mutex); \
107 if (ret) \ 96 if (ret) \
108 return ret; \ 97 return ret; \
@@ -115,7 +104,7 @@ static void __always_unused ____ftrace_check_##name(void) \
115 offsetof(typeof(field), \ 104 offsetof(typeof(field), \
116 container.item), \ 105 container.item), \
117 sizeof(field.container.item), \ 106 sizeof(field.container.item), \
118 is_signed_type(type), filter_type); \ 107 is_signed_type(type), FILTER_OTHER); \
119 if (ret) \ 108 if (ret) \
120 return ret; 109 return ret;
121 110
@@ -123,18 +112,17 @@ static void __always_unused ____ftrace_check_##name(void) \
123#define __dynamic_array(type, item) \ 112#define __dynamic_array(type, item) \
124 ret = trace_define_field(event_call, #type, #item, \ 113 ret = trace_define_field(event_call, #type, #item, \
125 offsetof(typeof(field), item), \ 114 offsetof(typeof(field), item), \
126 0, is_signed_type(type), filter_type);\ 115 0, is_signed_type(type), FILTER_OTHER);\
127 if (ret) \ 116 if (ret) \
128 return ret; 117 return ret;
129 118
130#undef FTRACE_ENTRY 119#undef FTRACE_ENTRY
131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ 120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
132int \ 121int \
133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ 122ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
134{ \ 123{ \
135 struct struct_name field; \ 124 struct struct_name field; \
136 int ret; \ 125 int ret; \
137 int filter_type = filter; \
138 \ 126 \
139 tstruct; \ 127 tstruct; \
140 \ 128 \
@@ -162,17 +150,15 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
162#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
163 151
164#undef F_printk 152#undef F_printk
165#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) 153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
166 154
167#undef FTRACE_ENTRY_REG 155#undef FTRACE_ENTRY
168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ 156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
169 regfn) \
170 \ 157 \
171struct ftrace_event_class event_class_ftrace_##call = { \ 158struct ftrace_event_class event_class_ftrace_##call = { \
172 .system = __stringify(TRACE_SYSTEM), \ 159 .system = __stringify(TRACE_SYSTEM), \
173 .define_fields = ftrace_define_fields_##call, \ 160 .define_fields = ftrace_define_fields_##call, \
174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
175 .reg = regfn, \
176}; \ 162}; \
177 \ 163 \
178struct ftrace_event_call __used event_##call = { \ 164struct ftrace_event_call __used event_##call = { \
@@ -180,19 +166,8 @@ struct ftrace_event_call __used event_##call = { \
180 .event.type = etype, \ 166 .event.type = etype, \
181 .class = &event_class_ftrace_##call, \ 167 .class = &event_class_ftrace_##call, \
182 .print_fmt = print, \ 168 .print_fmt = print, \
183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
184}; \ 169}; \
185struct ftrace_event_call __used \ 170struct ftrace_event_call __used \
186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
187 172
188#undef FTRACE_ENTRY
189#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \
190 FTRACE_ENTRY_REG(call, struct_name, etype, \
191 PARAMS(tstruct), PARAMS(print), filter, NULL)
192
193int ftrace_event_is_function(struct ftrace_event_call *call)
194{
195 return call == &event_function;
196}
197
198#include "trace_entries.h" 173#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8e3ad8082ab..c7b0c6a7db0 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 William Lee Irwin III
11 */ 11 */
12#include <linux/ring_buffer.h> 12#include <linux/ring_buffer.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -48,8 +48,7 @@ static void function_trace_start(struct trace_array *tr)
48} 48}
49 49
50static void 50static void
51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, 51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
52 struct ftrace_ops *op, struct pt_regs *pt_regs)
53{ 52{
54 struct trace_array *tr = func_trace; 53 struct trace_array *tr = func_trace;
55 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
@@ -75,17 +74,8 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
75 preempt_enable_notrace(); 74 preempt_enable_notrace();
76} 75}
77 76
78/* Our option */
79enum {
80 TRACE_FUNC_OPT_STACK = 0x1,
81};
82
83static struct tracer_flags func_flags;
84
85static void 77static void
86function_trace_call(unsigned long ip, unsigned long parent_ip, 78function_trace_call(unsigned long ip, unsigned long parent_ip)
87 struct ftrace_ops *op, struct pt_regs *pt_regs)
88
89{ 79{
90 struct trace_array *tr = func_trace; 80 struct trace_array *tr = func_trace;
91 struct trace_array_cpu *data; 81 struct trace_array_cpu *data;
@@ -116,8 +106,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
116} 106}
117 107
118static void 108static void
119function_stack_trace_call(unsigned long ip, unsigned long parent_ip, 109function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
120 struct ftrace_ops *op, struct pt_regs *pt_regs)
121{ 110{
122 struct trace_array *tr = func_trace; 111 struct trace_array *tr = func_trace;
123 struct trace_array_cpu *data; 112 struct trace_array_cpu *data;
@@ -160,13 +149,18 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
160static struct ftrace_ops trace_ops __read_mostly = 149static struct ftrace_ops trace_ops __read_mostly =
161{ 150{
162 .func = function_trace_call, 151 .func = function_trace_call,
163 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, 152 .flags = FTRACE_OPS_FL_GLOBAL,
164}; 153};
165 154
166static struct ftrace_ops trace_stack_ops __read_mostly = 155static struct ftrace_ops trace_stack_ops __read_mostly =
167{ 156{
168 .func = function_stack_trace_call, 157 .func = function_stack_trace_call,
169 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, 158 .flags = FTRACE_OPS_FL_GLOBAL,
159};
160
161/* Our two options */
162enum {
163 TRACE_FUNC_OPT_STACK = 0x1,
170}; 164};
171 165
172static struct tracer_opt func_opts[] = { 166static struct tracer_opt func_opts[] = {
@@ -210,11 +204,10 @@ static void tracing_stop_function_trace(void)
210 204
211static int func_set_flag(u32 old_flags, u32 bit, int set) 205static int func_set_flag(u32 old_flags, u32 bit, int set)
212{ 206{
213 switch (bit) { 207 if (bit == TRACE_FUNC_OPT_STACK) {
214 case TRACE_FUNC_OPT_STACK:
215 /* do nothing if already set */ 208 /* do nothing if already set */
216 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) 209 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
217 break; 210 return 0;
218 211
219 if (set) { 212 if (set) {
220 unregister_ftrace_function(&trace_ops); 213 unregister_ftrace_function(&trace_ops);
@@ -224,12 +217,10 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
224 register_ftrace_function(&trace_ops); 217 register_ftrace_function(&trace_ops);
225 } 218 }
226 219
227 break; 220 return 0;
228 default:
229 return -EINVAL;
230 } 221 }
231 222
232 return 0; 223 return -EINVAL;
233} 224}
234 225
235static struct tracer function_trace __read_mostly = 226static struct tracer function_trace __read_mostly =
@@ -366,7 +357,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
366 * We use the callback data field (which is a pointer) 357 * We use the callback data field (which is a pointer)
367 * as our counter. 358 * as our counter.
368 */ 359 */
369 ret = kstrtoul(number, 0, (unsigned long *)&count); 360 ret = strict_strtoul(number, 0, (unsigned long *)&count);
370 if (ret) 361 if (ret)
371 return ret; 362 return ret;
372 363
@@ -411,4 +402,5 @@ static __init int init_function_trace(void)
411 init_func_cmd_traceon(); 402 init_func_cmd_traceon();
412 return register_tracer(&function_trace); 403 return register_tracer(&function_trace);
413} 404}
414core_initcall(init_function_trace); 405device_initcall(init_function_trace);
406
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4edb4b74eb7..a7d2a4c653d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -143,7 +143,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
143 return; 143 return;
144 } 144 }
145 145
146#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) 146#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
147 /* 147 /*
148 * The arch may choose to record the frame pointer used 148 * The arch may choose to record the frame pointer used
149 * and check it here to make sure that it is what we expect it 149 * and check it here to make sure that it is what we expect it
@@ -154,9 +154,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
154 * 154 *
155 * Currently, x86_32 with optimize for size (-Os) makes the latest 155 * Currently, x86_32 with optimize for size (-Os) makes the latest
156 * gcc do the above. 156 * gcc do the above.
157 *
158 * Note, -mfentry does not use frame pointers, and this test
159 * is not needed if CC_USING_FENTRY is set.
160 */ 157 */
161 if (unlikely(current->ret_stack[index].fp != frame_pointer)) { 158 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
162 ftrace_graph_stop(); 159 ftrace_graph_stop();
@@ -223,7 +220,7 @@ int __trace_graph_entry(struct trace_array *tr,
223 entry = ring_buffer_event_data(event); 220 entry = ring_buffer_event_data(event);
224 entry->graph_ent = *trace; 221 entry->graph_ent = *trace;
225 if (!filter_current_check_discard(buffer, call, entry, event)) 222 if (!filter_current_check_discard(buffer, call, entry, event))
226 __buffer_unlock_commit(buffer, event); 223 ring_buffer_unlock_commit(buffer, event);
227 224
228 return 1; 225 return 1;
229} 226}
@@ -327,7 +324,7 @@ void __trace_graph_return(struct trace_array *tr,
327 entry = ring_buffer_event_data(event); 324 entry = ring_buffer_event_data(event);
328 entry->ret = *trace; 325 entry->ret = *trace;
329 if (!filter_current_check_discard(buffer, call, entry, event)) 326 if (!filter_current_check_discard(buffer, call, entry, event))
330 __buffer_unlock_commit(buffer, event); 327 ring_buffer_unlock_commit(buffer, event);
331} 328}
332 329
333void trace_graph_return(struct ftrace_graph_ret *trace) 330void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -541,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter,
541 next = &data->ret; 538 next = &data->ret;
542 } else { 539 } else {
543 540
544 ring_iter = trace_buffer_iter(iter, iter->cpu); 541 ring_iter = iter->buffer_iter[iter->cpu];
545 542
546 /* First peek to compare current entry and the next one */ 543 /* First peek to compare current entry and the next one */
547 if (ring_iter) 544 if (ring_iter)
@@ -1474,4 +1471,4 @@ static __init int init_graph_trace(void)
1474 return register_tracer(&graph_trace); 1471 return register_tracer(&graph_trace);
1475} 1472}
1476 1473
1477core_initcall(init_graph_trace); 1474device_initcall(init_graph_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 713a2cac488..667aa8cc0cf 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -7,7 +7,7 @@
7 * From code in the latency_tracer, that is: 7 * From code in the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 William Lee Irwin III
11 */ 11 */
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly;
23 23
24static DEFINE_PER_CPU(int, tracing_cpu); 24static DEFINE_PER_CPU(int, tracing_cpu);
25 25
26static DEFINE_RAW_SPINLOCK(max_trace_lock); 26static DEFINE_SPINLOCK(max_trace_lock);
27 27
28enum { 28enum {
29 TRACER_IRQS_OFF = (1 << 1), 29 TRACER_IRQS_OFF = (1 << 1),
@@ -136,8 +136,7 @@ static int func_prolog_dec(struct trace_array *tr,
136 * irqsoff uses its own tracer function to keep the overhead down: 136 * irqsoff uses its own tracer function to keep the overhead down:
137 */ 137 */
138static void 138static void
139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, 139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
140 struct ftrace_ops *op, struct pt_regs *pt_regs)
141{ 140{
142 struct trace_array *tr = irqsoff_trace; 141 struct trace_array *tr = irqsoff_trace;
143 struct trace_array_cpu *data; 142 struct trace_array_cpu *data;
@@ -154,7 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
154static struct ftrace_ops trace_ops __read_mostly = 153static struct ftrace_ops trace_ops __read_mostly =
155{ 154{
156 .func = irqsoff_tracer_call, 155 .func = irqsoff_tracer_call,
157 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, 156 .flags = FTRACE_OPS_FL_GLOBAL,
158}; 157};
159#endif /* CONFIG_FUNCTION_TRACER */ 158#endif /* CONFIG_FUNCTION_TRACER */
160 159
@@ -281,20 +280,9 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
281} 280}
282 281
283static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } 282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
283static void irqsoff_print_header(struct seq_file *s) { }
284static void irqsoff_trace_open(struct trace_iterator *iter) { } 284static void irqsoff_trace_open(struct trace_iterator *iter) { }
285static void irqsoff_trace_close(struct trace_iterator *iter) { } 285static void irqsoff_trace_close(struct trace_iterator *iter) { }
286
287#ifdef CONFIG_FUNCTION_TRACER
288static void irqsoff_print_header(struct seq_file *s)
289{
290 trace_default_header(s);
291}
292#else
293static void irqsoff_print_header(struct seq_file *s)
294{
295 trace_latency_header(s);
296}
297#endif /* CONFIG_FUNCTION_TRACER */
298#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
299 287
300/* 288/*
@@ -333,7 +321,7 @@ check_critical_timing(struct trace_array *tr,
333 if (!report_latency(delta)) 321 if (!report_latency(delta))
334 goto out; 322 goto out;
335 323
336 raw_spin_lock_irqsave(&max_trace_lock, flags); 324 spin_lock_irqsave(&max_trace_lock, flags);
337 325
338 /* check if we are still the max latency */ 326 /* check if we are still the max latency */
339 if (!report_latency(delta)) 327 if (!report_latency(delta))
@@ -356,7 +344,7 @@ check_critical_timing(struct trace_array *tr,
356 max_sequence++; 344 max_sequence++;
357 345
358out_unlock: 346out_unlock:
359 raw_spin_unlock_irqrestore(&max_trace_lock, flags); 347 spin_unlock_irqrestore(&max_trace_lock, flags);
360 348
361out: 349out:
362 data->critical_sequence = max_sequence; 350 data->critical_sequence = max_sequence;
@@ -517,13 +505,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
517#ifdef CONFIG_PREEMPT_TRACER 505#ifdef CONFIG_PREEMPT_TRACER
518void trace_preempt_on(unsigned long a0, unsigned long a1) 506void trace_preempt_on(unsigned long a0, unsigned long a1)
519{ 507{
520 if (preempt_trace() && !irq_trace()) 508 if (preempt_trace())
521 stop_critical_timing(a0, a1); 509 stop_critical_timing(a0, a1);
522} 510}
523 511
524void trace_preempt_off(unsigned long a0, unsigned long a1) 512void trace_preempt_off(unsigned long a0, unsigned long a1)
525{ 513{
526 if (preempt_trace() && !irq_trace()) 514 if (preempt_trace())
527 start_critical_timing(a0, a1); 515 start_critical_timing(a0, a1);
528} 516}
529#endif /* CONFIG_PREEMPT_TRACER */ 517#endif /* CONFIG_PREEMPT_TRACER */
@@ -604,7 +592,7 @@ static struct tracer irqsoff_tracer __read_mostly =
604 .reset = irqsoff_tracer_reset, 592 .reset = irqsoff_tracer_reset,
605 .start = irqsoff_tracer_start, 593 .start = irqsoff_tracer_start,
606 .stop = irqsoff_tracer_stop, 594 .stop = irqsoff_tracer_stop,
607 .print_max = true, 595 .print_max = 1,
608 .print_header = irqsoff_print_header, 596 .print_header = irqsoff_print_header,
609 .print_line = irqsoff_print_line, 597 .print_line = irqsoff_print_line,
610 .flags = &tracer_flags, 598 .flags = &tracer_flags,
@@ -614,7 +602,7 @@ static struct tracer irqsoff_tracer __read_mostly =
614#endif 602#endif
615 .open = irqsoff_trace_open, 603 .open = irqsoff_trace_open,
616 .close = irqsoff_trace_close, 604 .close = irqsoff_trace_close,
617 .use_max_tr = true, 605 .use_max_tr = 1,
618}; 606};
619# define register_irqsoff(trace) register_tracer(&trace) 607# define register_irqsoff(trace) register_tracer(&trace)
620#else 608#else
@@ -637,7 +625,7 @@ static struct tracer preemptoff_tracer __read_mostly =
637 .reset = irqsoff_tracer_reset, 625 .reset = irqsoff_tracer_reset,
638 .start = irqsoff_tracer_start, 626 .start = irqsoff_tracer_start,
639 .stop = irqsoff_tracer_stop, 627 .stop = irqsoff_tracer_stop,
640 .print_max = true, 628 .print_max = 1,
641 .print_header = irqsoff_print_header, 629 .print_header = irqsoff_print_header,
642 .print_line = irqsoff_print_line, 630 .print_line = irqsoff_print_line,
643 .flags = &tracer_flags, 631 .flags = &tracer_flags,
@@ -647,7 +635,7 @@ static struct tracer preemptoff_tracer __read_mostly =
647#endif 635#endif
648 .open = irqsoff_trace_open, 636 .open = irqsoff_trace_open,
649 .close = irqsoff_trace_close, 637 .close = irqsoff_trace_close,
650 .use_max_tr = true, 638 .use_max_tr = 1,
651}; 639};
652# define register_preemptoff(trace) register_tracer(&trace) 640# define register_preemptoff(trace) register_tracer(&trace)
653#else 641#else
@@ -672,7 +660,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
672 .reset = irqsoff_tracer_reset, 660 .reset = irqsoff_tracer_reset,
673 .start = irqsoff_tracer_start, 661 .start = irqsoff_tracer_start,
674 .stop = irqsoff_tracer_stop, 662 .stop = irqsoff_tracer_stop,
675 .print_max = true, 663 .print_max = 1,
676 .print_header = irqsoff_print_header, 664 .print_header = irqsoff_print_header,
677 .print_line = irqsoff_print_line, 665 .print_line = irqsoff_print_line,
678 .flags = &tracer_flags, 666 .flags = &tracer_flags,
@@ -682,7 +670,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
682#endif 670#endif
683 .open = irqsoff_trace_open, 671 .open = irqsoff_trace_open,
684 .close = irqsoff_trace_close, 672 .close = irqsoff_trace_close,
685 .use_max_tr = true, 673 .use_max_tr = 1,
686}; 674};
687 675
688# define register_preemptirqsoff(trace) register_tracer(&trace) 676# define register_preemptirqsoff(trace) register_tracer(&trace)
@@ -698,4 +686,4 @@ __init static int init_irqsoff_tracer(void)
698 686
699 return 0; 687 return 0;
700} 688}
701core_initcall(init_irqsoff_tracer); 689device_initcall(init_irqsoff_tracer);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1865d5f7653..00d527c945a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -19,15 +19,547 @@
19 19
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <linux/limits.h>
34#include <asm/bitsperlong.h>
35
36#include "trace.h"
37#include "trace_output.h"
38
39#define MAX_TRACE_ARGS 128
40#define MAX_ARGSTR_LEN 63
41#define MAX_EVENT_NAME_LEN 64
42#define MAX_STRING_SIZE PATH_MAX
43#define KPROBE_EVENT_SYSTEM "kprobes"
22 44
23#include "trace_probe.h" 45/* Reserved field names */
46#define FIELD_STRING_IP "__probe_ip"
47#define FIELD_STRING_RETIP "__probe_ret_ip"
48#define FIELD_STRING_FUNC "__probe_func"
49
50const char *reserved_field_names[] = {
51 "common_type",
52 "common_flags",
53 "common_preempt_count",
54 "common_pid",
55 "common_tgid",
56 FIELD_STRING_IP,
57 FIELD_STRING_RETIP,
58 FIELD_STRING_FUNC,
59};
24 60
25#define KPROBE_EVENT_SYSTEM "kprobes" 61/* Printing function type */
62typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
63 void *);
64#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
65#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
66
67/* Printing in basic type function template */
68#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
69static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
70 const char *name, \
71 void *data, void *ent)\
72{ \
73 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
74} \
75static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
76
77DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
78DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
80DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
82DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
83DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
84DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
85
86/* data_rloc: data relative location, compatible with u32 */
87#define make_data_rloc(len, roffs) \
88 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
89#define get_rloc_len(dl) ((u32)(dl) >> 16)
90#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
91
92static inline void *get_rloc_data(u32 *dl)
93{
94 return (u8 *)dl + get_rloc_offs(*dl);
95}
96
97/* For data_loc conversion */
98static inline void *get_loc_data(u32 *dl, void *ent)
99{
100 return (u8 *)ent + get_rloc_offs(*dl);
101}
102
103/*
104 * Convert data_rloc to data_loc:
105 * data_rloc stores the offset from data_rloc itself, but data_loc
106 * stores the offset from event entry.
107 */
108#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
109
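For orientation while reading the macros above: a data_rloc is a single u32 holding the dynamic-data length in its upper 16 bits and the offset in its lower 16 bits. A minimal userspace sketch (the stdint types and the main() harness are mine; the packing itself is copied from the macros above) that round-trips the encoding:

    #include <stdint.h>
    #include <assert.h>

    #define make_data_rloc(len, roffs) \
            (((uint32_t)(len) << 16) | ((uint32_t)(roffs) & 0xffff))
    #define get_rloc_len(dl)        ((uint32_t)(dl) >> 16)
    #define get_rloc_offs(dl)       ((uint32_t)(dl) & 0xffff)

    int main(void)
    {
            /* a 5-byte string stored 0x20 bytes past the rloc word */
            uint32_t dl = make_data_rloc(5, 0x20);

            assert(get_rloc_len(dl) == 5);
            assert(get_rloc_offs(dl) == 0x20);
            return 0;
    }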
110/* For defining macros, define string/string_size types */
111typedef u32 string;
112typedef u32 string_size;
113
114/* Print type function for string type */
115static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
116 const char *name,
117 void *data, void *ent)
118{
119 int len = *(u32 *)data >> 16;
120
121 if (!len)
122 return trace_seq_printf(s, " %s=(fault)", name);
123 else
124 return trace_seq_printf(s, " %s=\"%s\"", name,
125 (const char *)get_loc_data(data, ent));
126}
127static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
128
129/* Data fetch function type */
130typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
131
132struct fetch_param {
133 fetch_func_t fn;
134 void *data;
135};
136
137static __kprobes void call_fetch(struct fetch_param *fprm,
138 struct pt_regs *regs, void *dest)
139{
140 return fprm->fn(regs, fprm->data, dest);
141}
142
143#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
144/*
145 * Define macro for basic types - we don't need to define s* types, because
146 * we have to care only about bitwidth at recording time.
147 */
148#define DEFINE_BASIC_FETCH_FUNCS(method) \
149DEFINE_FETCH_##method(u8) \
150DEFINE_FETCH_##method(u16) \
151DEFINE_FETCH_##method(u32) \
152DEFINE_FETCH_##method(u64)
153
154#define CHECK_FETCH_FUNCS(method, fn) \
155 (((FETCH_FUNC_NAME(method, u8) == fn) || \
156 (FETCH_FUNC_NAME(method, u16) == fn) || \
157 (FETCH_FUNC_NAME(method, u32) == fn) || \
158 (FETCH_FUNC_NAME(method, u64) == fn) || \
159 (FETCH_FUNC_NAME(method, string) == fn) || \
160 (FETCH_FUNC_NAME(method, string_size) == fn)) \
161 && (fn != NULL))
162
163/* Data fetch function templates */
164#define DEFINE_FETCH_reg(type) \
165static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
166 void *offset, void *dest) \
167{ \
168 *(type *)dest = (type)regs_get_register(regs, \
169 (unsigned int)((unsigned long)offset)); \
170}
171DEFINE_BASIC_FETCH_FUNCS(reg)
172/* No string on the register */
173#define fetch_reg_string NULL
174#define fetch_reg_string_size NULL
175
176#define DEFINE_FETCH_stack(type) \
177static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
178 void *offset, void *dest) \
179{ \
180 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
181 (unsigned int)((unsigned long)offset)); \
182}
183DEFINE_BASIC_FETCH_FUNCS(stack)
184/* No string on the stack entry */
185#define fetch_stack_string NULL
186#define fetch_stack_string_size NULL
187
188#define DEFINE_FETCH_retval(type) \
189static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
190 void *dummy, void *dest) \
191{ \
192 *(type *)dest = (type)regs_return_value(regs); \
193}
194DEFINE_BASIC_FETCH_FUNCS(retval)
195/* No string on the retval */
196#define fetch_retval_string NULL
197#define fetch_retval_string_size NULL
198
199#define DEFINE_FETCH_memory(type) \
200static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
201 void *addr, void *dest) \
202{ \
203 type retval; \
204 if (probe_kernel_address(addr, retval)) \
205 *(type *)dest = 0; \
206 else \
207 *(type *)dest = retval; \
208}
209DEFINE_BASIC_FETCH_FUNCS(memory)
210/*
211 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
212 * length and relative data location.
213 */
214static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
215 void *addr, void *dest)
216{
217 long ret;
218 int maxlen = get_rloc_len(*(u32 *)dest);
219 u8 *dst = get_rloc_data(dest);
220 u8 *src = addr;
221 mm_segment_t old_fs = get_fs();
222 if (!maxlen)
223 return;
224 /*
225 * Try to get string again, since the string can be changed while
226 * probing.
227 */
228 set_fs(KERNEL_DS);
229 pagefault_disable();
230 do
231 ret = __copy_from_user_inatomic(dst++, src++, 1);
232 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
233 dst[-1] = '\0';
234 pagefault_enable();
235 set_fs(old_fs);
236
237 if (ret < 0) { /* Failed to fetch string */
238 ((u8 *)get_rloc_data(dest))[0] = '\0';
239 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
240 } else
241 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
242 get_rloc_offs(*(u32 *)dest));
243}
244/* Return the length of string -- including null terminal byte */
245static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
246 void *addr, void *dest)
247{
248 int ret, len = 0;
249 u8 c;
250 mm_segment_t old_fs = get_fs();
251
252 set_fs(KERNEL_DS);
253 pagefault_disable();
254 do {
255 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
256 len++;
257 } while (c && ret == 0 && len < MAX_STRING_SIZE);
258 pagefault_enable();
259 set_fs(old_fs);
260
261 if (ret < 0) /* Failed to check the length */
262 *(u32 *)dest = 0;
263 else
264 *(u32 *)dest = len;
265}
266
267/* Memory fetching by symbol */
268struct symbol_cache {
269 char *symbol;
270 long offset;
271 unsigned long addr;
272};
273
274static unsigned long update_symbol_cache(struct symbol_cache *sc)
275{
276 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
277 if (sc->addr)
278 sc->addr += sc->offset;
279 return sc->addr;
280}
281
282static void free_symbol_cache(struct symbol_cache *sc)
283{
284 kfree(sc->symbol);
285 kfree(sc);
286}
287
288static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
289{
290 struct symbol_cache *sc;
291
292 if (!sym || strlen(sym) == 0)
293 return NULL;
294 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
295 if (!sc)
296 return NULL;
297
298 sc->symbol = kstrdup(sym, GFP_KERNEL);
299 if (!sc->symbol) {
300 kfree(sc);
301 return NULL;
302 }
303 sc->offset = offset;
304
305 update_symbol_cache(sc);
306 return sc;
307}
308
309#define DEFINE_FETCH_symbol(type) \
310static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
311 void *data, void *dest) \
312{ \
313 struct symbol_cache *sc = data; \
314 if (sc->addr) \
315 fetch_memory_##type(regs, (void *)sc->addr, dest); \
316 else \
317 *(type *)dest = 0; \
318}
319DEFINE_BASIC_FETCH_FUNCS(symbol)
320DEFINE_FETCH_symbol(string)
321DEFINE_FETCH_symbol(string_size)
322
323/* Dereference memory access function */
324struct deref_fetch_param {
325 struct fetch_param orig;
326 long offset;
327};
328
329#define DEFINE_FETCH_deref(type) \
330static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
331 void *data, void *dest) \
332{ \
333 struct deref_fetch_param *dprm = data; \
334 unsigned long addr; \
335 call_fetch(&dprm->orig, regs, &addr); \
336 if (addr) { \
337 addr += dprm->offset; \
338 fetch_memory_##type(regs, (void *)addr, dest); \
339 } else \
340 *(type *)dest = 0; \
341}
342DEFINE_BASIC_FETCH_FUNCS(deref)
343DEFINE_FETCH_deref(string)
344DEFINE_FETCH_deref(string_size)
345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
355{
356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
357 free_deref_fetch_param(data->orig.data);
358 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
359 free_symbol_cache(data->orig.data);
360 kfree(data);
361}
362
363/* Bitfield fetch function */
364struct bitfield_fetch_param {
365 struct fetch_param orig;
366 unsigned char hi_shift;
367 unsigned char low_shift;
368};
369
370#define DEFINE_FETCH_bitfield(type) \
371static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
372 void *data, void *dest) \
373{ \
374 struct bitfield_fetch_param *bprm = data; \
375 type buf = 0; \
376 call_fetch(&bprm->orig, regs, &buf); \
377 if (buf) { \
378 buf <<= bprm->hi_shift; \
379 buf >>= bprm->low_shift; \
380 } \
381 *(type *)dest = buf; \
382}
383DEFINE_BASIC_FETCH_FUNCS(bitfield)
384#define fetch_bitfield_string NULL
385#define fetch_bitfield_string_size NULL
386
387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
402{
403 /*
404 * Don't check the bitfield itself, because this must be the
405 * last fetch function.
406 */
407 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
408 free_deref_fetch_param(data->orig.data);
409 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
410 free_symbol_cache(data->orig.data);
411 kfree(data);
412}
413
414/* Default (unsigned long) fetch type */
415#define __DEFAULT_FETCH_TYPE(t) u##t
416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
417#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
418#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
419
420/* Fetch types */
421enum {
422 FETCH_MTD_reg = 0,
423 FETCH_MTD_stack,
424 FETCH_MTD_retval,
425 FETCH_MTD_memory,
426 FETCH_MTD_symbol,
427 FETCH_MTD_deref,
428 FETCH_MTD_bitfield,
429 FETCH_MTD_END,
430};
431
432#define ASSIGN_FETCH_FUNC(method, type) \
433 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
434
435#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
436 {.name = _name, \
437 .size = _size, \
438 .is_signed = sign, \
439 .print = PRINT_TYPE_FUNC_NAME(ptype), \
440 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
441 .fmttype = _fmttype, \
442 .fetch = { \
443ASSIGN_FETCH_FUNC(reg, ftype), \
444ASSIGN_FETCH_FUNC(stack, ftype), \
445ASSIGN_FETCH_FUNC(retval, ftype), \
446ASSIGN_FETCH_FUNC(memory, ftype), \
447ASSIGN_FETCH_FUNC(symbol, ftype), \
448ASSIGN_FETCH_FUNC(deref, ftype), \
449ASSIGN_FETCH_FUNC(bitfield, ftype), \
450 } \
451 }
452
453#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
454 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
455
456#define FETCH_TYPE_STRING 0
457#define FETCH_TYPE_STRSIZE 1
458
459/* Fetch type information table */
460static const struct fetch_type {
461 const char *name; /* Name of type */
462 size_t size; /* Byte size of type */
463 int is_signed; /* Signed flag */
464 print_type_func_t print; /* Print functions */
 465	const char		*fmt;		/* Format string */
466 const char *fmttype; /* Name in format file */
467 /* Fetch functions */
468 fetch_func_t fetch[FETCH_MTD_END];
469} fetch_type_table[] = {
470 /* Special types */
471 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
472 sizeof(u32), 1, "__data_loc char[]"),
473 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
474 string_size, sizeof(u32), 0, "u32"),
475 /* Basic types */
476 ASSIGN_FETCH_TYPE(u8, u8, 0),
477 ASSIGN_FETCH_TYPE(u16, u16, 0),
478 ASSIGN_FETCH_TYPE(u32, u32, 0),
479 ASSIGN_FETCH_TYPE(u64, u64, 0),
480 ASSIGN_FETCH_TYPE(s8, u8, 1),
481 ASSIGN_FETCH_TYPE(s16, u16, 1),
482 ASSIGN_FETCH_TYPE(s32, u32, 1),
483 ASSIGN_FETCH_TYPE(s64, u64, 1),
484};
485
486static const struct fetch_type *find_fetch_type(const char *type)
487{
488 int i;
489
490 if (!type)
491 type = DEFAULT_FETCH_TYPE_STR;
492
493 /* Special case: bitfield */
494 if (*type == 'b') {
495 unsigned long bs;
496 type = strchr(type, '/');
497 if (!type)
498 goto fail;
499 type++;
500 if (strict_strtoul(type, 0, &bs))
501 goto fail;
502 switch (bs) {
503 case 8:
504 return find_fetch_type("u8");
505 case 16:
506 return find_fetch_type("u16");
507 case 32:
508 return find_fetch_type("u32");
509 case 64:
510 return find_fetch_type("u64");
511 default:
512 goto fail;
513 }
514 }
515
516 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
517 if (strcmp(type, fetch_type_table[i].name) == 0)
518 return &fetch_type_table[i];
519fail:
520 return NULL;
521}
522
523/* Special function : only accept unsigned long */
524static __kprobes void fetch_stack_address(struct pt_regs *regs,
525 void *dummy, void *dest)
526{
527 *(unsigned long *)dest = kernel_stack_pointer(regs);
528}
529
530static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
531 fetch_func_t orig_fn)
532{
533 int i;
534
535 if (type != &fetch_type_table[FETCH_TYPE_STRING])
536 return NULL; /* Only string type needs size function */
537 for (i = 0; i < FETCH_MTD_END; i++)
538 if (type->fetch[i] == orig_fn)
539 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
540
541 WARN_ON(1); /* This should not happen */
542 return NULL;
543}
26 544
27/** 545/**
28 * Kprobe event core functions 546 * Kprobe event core functions
29 */ 547 */
30 548
549struct probe_arg {
550 struct fetch_param fetch;
551 struct fetch_param fetch_size;
552 unsigned int offset; /* Offset from argument entry */
553 const char *name; /* Name of this argument */
554 const char *comm; /* Command of this argument */
555 const struct fetch_type *type; /* Type of this argument */
556};
557
558/* Flags for trace_probe */
559#define TP_FLAG_TRACE 1
560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
562
31struct trace_probe { 563struct trace_probe {
32 struct list_head list; 564 struct list_head list;
33 struct kretprobe rp; /* Use rp.kp for kprobe use */ 565 struct kretprobe rp; /* Use rp.kp for kprobe use */
@@ -99,6 +631,18 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
99static int kretprobe_dispatcher(struct kretprobe_instance *ri, 631static int kretprobe_dispatcher(struct kretprobe_instance *ri,
100 struct pt_regs *regs); 632 struct pt_regs *regs);
101 633
634/* Check the name is good for event/group/fields */
635static int is_good_name(const char *name)
636{
637 if (!isalpha(*name) && *name != '_')
638 return 0;
639 while (*++name != '\0') {
640 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
641 return 0;
642 }
643 return 1;
644}
645
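As a quick illustration of the naming rule above: event, group and field names must look like C identifiers, i.e. a letter or '_' followed by letters, digits or '_'. A small userspace sketch (a standalone copy of the same logic, with example names that are purely illustrative):

    #include <ctype.h>
    #include <assert.h>

    /* userspace copy of the rule implemented by is_good_name() above */
    static int is_good_name(const char *name)
    {
            if (!isalpha((unsigned char)*name) && *name != '_')
                    return 0;
            while (*++name != '\0') {
                    if (!isalpha((unsigned char)*name) &&
                        !isdigit((unsigned char)*name) && *name != '_')
                            return 0;
            }
            return 1;
    }

    int main(void)
    {
            assert(is_good_name("myprobe1"));       /* accepted */
            assert(is_good_name("_open"));          /* accepted */
            assert(!is_good_name("1probe"));        /* rejected: starts with a digit */
            assert(!is_good_name("my-probe"));      /* rejected: '-' is not allowed */
            return 0;
    }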
102/* 646/*
103 * Allocate new trace_probe and initialize it (including kprobes). 647 * Allocate new trace_probe and initialize it (including kprobes).
104 */ 648 */
@@ -107,7 +651,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
107 void *addr, 651 void *addr,
108 const char *symbol, 652 const char *symbol,
109 unsigned long offs, 653 unsigned long offs,
110 int nargs, bool is_return) 654 int nargs, int is_return)
111{ 655{
112 struct trace_probe *tp; 656 struct trace_probe *tp;
113 int ret = -ENOMEM; 657 int ret = -ENOMEM;
@@ -158,12 +702,34 @@ error:
158 return ERR_PTR(ret); 702 return ERR_PTR(ret);
159} 703}
160 704
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
715static void free_probe_arg(struct probe_arg *arg)
716{
717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
718 free_bitfield_fetch_param(arg->fetch.data);
719 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
720 free_deref_fetch_param(arg->fetch.data);
721 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
722 free_symbol_cache(arg->fetch.data);
723 kfree(arg->name);
724 kfree(arg->comm);
725}
726
161static void free_trace_probe(struct trace_probe *tp) 727static void free_trace_probe(struct trace_probe *tp)
162{ 728{
163 int i; 729 int i;
164 730
165 for (i = 0; i < tp->nr_args; i++) 731 for (i = 0; i < tp->nr_args; i++)
166 traceprobe_free_probe_arg(&tp->args[i]); 732 free_probe_arg(&tp->args[i]);
167 733
168 kfree(tp->call.class->system); 734 kfree(tp->call.class->system);
169 kfree(tp->call.name); 735 kfree(tp->call.name);
@@ -221,7 +787,7 @@ static int __register_trace_probe(struct trace_probe *tp)
221 return -EINVAL; 787 return -EINVAL;
222 788
223 for (i = 0; i < tp->nr_args; i++) 789 for (i = 0; i < tp->nr_args; i++)
224 traceprobe_update_arg(&tp->args[i]); 790 update_probe_arg(&tp->args[i]);
225 791
226 /* Set/clear disabled flag according to tp->flag */ 792 /* Set/clear disabled flag according to tp->flag */
227 if (trace_probe_is_enabled(tp)) 793 if (trace_probe_is_enabled(tp))
@@ -353,6 +919,227 @@ static struct notifier_block trace_probe_module_nb = {
353 .priority = 1 /* Invoked after kprobe module callback */ 919 .priority = 1 /* Invoked after kprobe module callback */
354}; 920};
355 921
922/* Split symbol and offset. */
923static int split_symbol_offset(char *symbol, unsigned long *offset)
924{
925 char *tmp;
926 int ret;
927
928 if (!offset)
929 return -EINVAL;
930
931 tmp = strchr(symbol, '+');
932 if (tmp) {
933 /* skip sign because strict_strtol doesn't accept '+' */
934 ret = strict_strtoul(tmp + 1, 0, offset);
935 if (ret)
936 return ret;
937 *tmp = '\0';
938 } else
939 *offset = 0;
940 return 0;
941}
942
943#define PARAM_MAX_ARGS 16
944#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
945
946static int parse_probe_vars(char *arg, const struct fetch_type *t,
947 struct fetch_param *f, int is_return)
948{
949 int ret = 0;
950 unsigned long param;
951
952 if (strcmp(arg, "retval") == 0) {
953 if (is_return)
954 f->fn = t->fetch[FETCH_MTD_retval];
955 else
956 ret = -EINVAL;
957 } else if (strncmp(arg, "stack", 5) == 0) {
958 if (arg[5] == '\0') {
959 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
960 f->fn = fetch_stack_address;
961 else
962 ret = -EINVAL;
963 } else if (isdigit(arg[5])) {
964 ret = strict_strtoul(arg + 5, 10, &param);
965 if (ret || param > PARAM_MAX_STACK)
966 ret = -EINVAL;
967 else {
968 f->fn = t->fetch[FETCH_MTD_stack];
969 f->data = (void *)param;
970 }
971 } else
972 ret = -EINVAL;
973 } else
974 ret = -EINVAL;
975 return ret;
976}
977
978/* Recursive argument parser */
979static int __parse_probe_arg(char *arg, const struct fetch_type *t,
980 struct fetch_param *f, int is_return)
981{
982 int ret = 0;
983 unsigned long param;
984 long offset;
985 char *tmp;
986
987 switch (arg[0]) {
988 case '$':
989 ret = parse_probe_vars(arg + 1, t, f, is_return);
990 break;
991 case '%': /* named register */
992 ret = regs_query_register_offset(arg + 1);
993 if (ret >= 0) {
994 f->fn = t->fetch[FETCH_MTD_reg];
995 f->data = (void *)(unsigned long)ret;
996 ret = 0;
997 }
998 break;
999 case '@': /* memory or symbol */
1000 if (isdigit(arg[1])) {
1001 ret = strict_strtoul(arg + 1, 0, &param);
1002 if (ret)
1003 break;
1004 f->fn = t->fetch[FETCH_MTD_memory];
1005 f->data = (void *)param;
1006 } else {
1007 ret = split_symbol_offset(arg + 1, &offset);
1008 if (ret)
1009 break;
1010 f->data = alloc_symbol_cache(arg + 1, offset);
1011 if (f->data)
1012 f->fn = t->fetch[FETCH_MTD_symbol];
1013 }
1014 break;
1015 case '+': /* deref memory */
1016 arg++; /* Skip '+', because strict_strtol() rejects it. */
1017 case '-':
1018 tmp = strchr(arg, '(');
1019 if (!tmp)
1020 break;
1021 *tmp = '\0';
1022 ret = strict_strtol(arg, 0, &offset);
1023 if (ret)
1024 break;
1025 arg = tmp + 1;
1026 tmp = strrchr(arg, ')');
1027 if (tmp) {
1028 struct deref_fetch_param *dprm;
1029 const struct fetch_type *t2 = find_fetch_type(NULL);
1030 *tmp = '\0';
1031 dprm = kzalloc(sizeof(struct deref_fetch_param),
1032 GFP_KERNEL);
1033 if (!dprm)
1034 return -ENOMEM;
1035 dprm->offset = offset;
1036 ret = __parse_probe_arg(arg, t2, &dprm->orig,
1037 is_return);
1038 if (ret)
1039 kfree(dprm);
1040 else {
1041 f->fn = t->fetch[FETCH_MTD_deref];
1042 f->data = (void *)dprm;
1043 }
1044 }
1045 break;
1046 }
1047 if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
1048 pr_info("%s type has no corresponding fetch method.\n",
1049 t->name);
1050 ret = -EINVAL;
1051 }
1052 return ret;
1053}
1054
1055#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
1056
1057/* Bitfield type needs to be parsed into a fetch function */
1058static int __parse_bitfield_probe_arg(const char *bf,
1059 const struct fetch_type *t,
1060 struct fetch_param *f)
1061{
1062 struct bitfield_fetch_param *bprm;
1063 unsigned long bw, bo;
1064 char *tail;
1065
1066 if (*bf != 'b')
1067 return 0;
1068
1069 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1070 if (!bprm)
1071 return -ENOMEM;
1072 bprm->orig = *f;
1073 f->fn = t->fetch[FETCH_MTD_bitfield];
1074 f->data = (void *)bprm;
1075
1076 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
1077 if (bw == 0 || *tail != '@')
1078 return -EINVAL;
1079
1080 bf = tail + 1;
1081 bo = simple_strtoul(bf, &tail, 0);
1082 if (tail == bf || *tail != '/')
1083 return -EINVAL;
1084
1085 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
1086 bprm->low_shift = bprm->hi_shift + bo;
1087 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
1088}
1089
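A worked example of the shift arithmetic above, assuming the documented bitfield type syntax b<bit-width>@<bit-offset>/<container-size>: for "b4@8/32" the container is 32 bits, so hi_shift = 32 - (4 + 8) = 20 and low_shift = 20 + 8 = 28, which isolates bits 8..11 of the fetched word. A standalone sketch of just that arithmetic:

    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
            /* "b4@8/32": 4-bit field at bit offset 8 in a 32-bit container */
            uint32_t buf = 0x00000a00;              /* the field holds 0xa */
            unsigned bw = 4, bo = 8, bits = 32;
            unsigned hi_shift  = bits - (bw + bo);  /* 20 */
            unsigned low_shift = hi_shift + bo;     /* 28 */

            buf <<= hi_shift;                       /* drop bits above the field */
            buf >>= low_shift;                      /* drop bits below the field */
            assert(buf == 0xa);
            return 0;
    }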
1090/* String length checking wrapper */
1091static int parse_probe_arg(char *arg, struct trace_probe *tp,
1092 struct probe_arg *parg, int is_return)
1093{
1094 const char *t;
1095 int ret;
1096
1097 if (strlen(arg) > MAX_ARGSTR_LEN) {
1098 pr_info("Argument is too long.: %s\n", arg);
1099 return -ENOSPC;
1100 }
1101 parg->comm = kstrdup(arg, GFP_KERNEL);
1102 if (!parg->comm) {
1103 pr_info("Failed to allocate memory for command '%s'.\n", arg);
1104 return -ENOMEM;
1105 }
1106 t = strchr(parg->comm, ':');
1107 if (t) {
1108 arg[t - parg->comm] = '\0';
1109 t++;
1110 }
1111 parg->type = find_fetch_type(t);
1112 if (!parg->type) {
1113 pr_info("Unsupported type: %s\n", t);
1114 return -EINVAL;
1115 }
1116 parg->offset = tp->size;
1117 tp->size += parg->type->size;
1118 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
1119 if (ret >= 0 && t != NULL)
1120 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
1121 if (ret >= 0) {
1122 parg->fetch_size.fn = get_fetch_size_function(parg->type,
1123 parg->fetch.fn);
1124 parg->fetch_size.data = parg->fetch.data;
1125 }
1126 return ret;
1127}
1128
1129/* Return 1 if name is reserved or already used by another argument */
1130static int conflict_field_name(const char *name,
1131 struct probe_arg *args, int narg)
1132{
1133 int i;
1134 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
1135 if (strcmp(reserved_field_names[i], name) == 0)
1136 return 1;
1137 for (i = 0; i < narg; i++)
1138 if (strcmp(args[i].name, name) == 0)
1139 return 1;
1140 return 0;
1141}
1142
356static int create_trace_probe(int argc, char **argv) 1143static int create_trace_probe(int argc, char **argv)
357{ 1144{
358 /* 1145 /*
@@ -375,7 +1162,7 @@ static int create_trace_probe(int argc, char **argv)
375 */ 1162 */
376 struct trace_probe *tp; 1163 struct trace_probe *tp;
377 int i, ret = 0; 1164 int i, ret = 0;
378 bool is_return = false, is_delete = false; 1165 int is_return = 0, is_delete = 0;
379 char *symbol = NULL, *event = NULL, *group = NULL; 1166 char *symbol = NULL, *event = NULL, *group = NULL;
380 char *arg; 1167 char *arg;
381 unsigned long offset = 0; 1168 unsigned long offset = 0;
@@ -384,11 +1171,11 @@ static int create_trace_probe(int argc, char **argv)
384 1171
385 /* argc must be >= 1 */ 1172 /* argc must be >= 1 */
386 if (argv[0][0] == 'p') 1173 if (argv[0][0] == 'p')
387 is_return = false; 1174 is_return = 0;
388 else if (argv[0][0] == 'r') 1175 else if (argv[0][0] == 'r')
389 is_return = true; 1176 is_return = 1;
390 else if (argv[0][0] == '-') 1177 else if (argv[0][0] == '-')
391 is_delete = true; 1178 is_delete = 1;
392 else { 1179 else {
393 pr_info("Probe definition must be started with 'p', 'r' or" 1180 pr_info("Probe definition must be started with 'p', 'r' or"
394 " '-'.\n"); 1181 " '-'.\n");
@@ -444,7 +1231,7 @@ static int create_trace_probe(int argc, char **argv)
444 return -EINVAL; 1231 return -EINVAL;
445 } 1232 }
446 /* an address specified */ 1233 /* an address specified */
447 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); 1234 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
448 if (ret) { 1235 if (ret) {
449 pr_info("Failed to parse address.\n"); 1236 pr_info("Failed to parse address.\n");
450 return ret; 1237 return ret;
@@ -453,7 +1240,7 @@ static int create_trace_probe(int argc, char **argv)
453 /* a symbol specified */ 1240 /* a symbol specified */
454 symbol = argv[1]; 1241 symbol = argv[1];
455 /* TODO: support .init module functions */ 1242 /* TODO: support .init module functions */
456 ret = traceprobe_split_symbol_offset(symbol, &offset); 1243 ret = split_symbol_offset(symbol, &offset);
457 if (ret) { 1244 if (ret) {
458 pr_info("Failed to parse symbol.\n"); 1245 pr_info("Failed to parse symbol.\n");
459 return ret; 1246 return ret;
@@ -515,8 +1302,7 @@ static int create_trace_probe(int argc, char **argv)
515 goto error; 1302 goto error;
516 } 1303 }
517 1304
518 if (traceprobe_conflict_field_name(tp->args[i].name, 1305 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
519 tp->args, i)) {
520 pr_info("Argument[%d] name '%s' conflicts with " 1306 pr_info("Argument[%d] name '%s' conflicts with "
521 "another field.\n", i, argv[i]); 1307 "another field.\n", i, argv[i]);
522 ret = -EINVAL; 1308 ret = -EINVAL;
@@ -524,8 +1310,7 @@ static int create_trace_probe(int argc, char **argv)
524 } 1310 }
525 1311
526 /* Parse fetch argument */ 1312 /* Parse fetch argument */
527 ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], 1313 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
528 is_return, true);
529 if (ret) { 1314 if (ret) {
530 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 1315 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
531 goto error; 1316 goto error;
@@ -627,11 +1412,70 @@ static int probes_open(struct inode *inode, struct file *file)
627 return seq_open(file, &probes_seq_op); 1412 return seq_open(file, &probes_seq_op);
628} 1413}
629 1414
1415static int command_trace_probe(const char *buf)
1416{
1417 char **argv;
1418 int argc = 0, ret = 0;
1419
1420 argv = argv_split(GFP_KERNEL, buf, &argc);
1421 if (!argv)
1422 return -ENOMEM;
1423
1424 if (argc)
1425 ret = create_trace_probe(argc, argv);
1426
1427 argv_free(argv);
1428 return ret;
1429}
1430
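command_trace_probe() is what ultimately services writes to the kprobe_events control file, and the strings it accepts are the same ones the self-test further down feeds it: %REG for registers, @ADDR or @SYM[+offs] for memory, $stack/$stackN/$retval for the special variables handled earlier, and +|-OFFS(ARG) for a dereference. Typical usage from userspace would look roughly like this (probe names are illustrative, the path assumes debugfs is mounted at /sys/kernel/debug, syntax as in Documentation/trace/kprobetrace.txt):

    echo 'p:myprobe do_sys_open $stack $stack0 +0($stack)' > /sys/kernel/debug/tracing/kprobe_events
    echo 'r:myretprobe do_sys_open $retval' >> /sys/kernel/debug/tracing/kprobe_events
    echo '-:myprobe' >> /sys/kernel/debug/tracing/kprobe_events    # delete the probe again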
1431#define WRITE_BUFSIZE 4096
1432
630static ssize_t probes_write(struct file *file, const char __user *buffer, 1433static ssize_t probes_write(struct file *file, const char __user *buffer,
631 size_t count, loff_t *ppos) 1434 size_t count, loff_t *ppos)
632{ 1435{
633 return traceprobe_probes_write(file, buffer, count, ppos, 1436 char *kbuf, *tmp;
634 create_trace_probe); 1437 int ret;
1438 size_t done;
1439 size_t size;
1440
1441 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
1442 if (!kbuf)
1443 return -ENOMEM;
1444
1445 ret = done = 0;
1446 while (done < count) {
1447 size = count - done;
1448 if (size >= WRITE_BUFSIZE)
1449 size = WRITE_BUFSIZE - 1;
1450 if (copy_from_user(kbuf, buffer + done, size)) {
1451 ret = -EFAULT;
1452 goto out;
1453 }
1454 kbuf[size] = '\0';
1455 tmp = strchr(kbuf, '\n');
1456 if (tmp) {
1457 *tmp = '\0';
1458 size = tmp - kbuf + 1;
1459 } else if (done + size < count) {
1460 pr_warning("Line length is too long: "
1461 "Should be less than %d.", WRITE_BUFSIZE);
1462 ret = -EINVAL;
1463 goto out;
1464 }
1465 done += size;
1466 /* Remove comments */
1467 tmp = strchr(kbuf, '#');
1468 if (tmp)
1469 *tmp = '\0';
1470
1471 ret = command_trace_probe(kbuf);
1472 if (ret)
1473 goto out;
1474 }
1475 ret = done;
1476out:
1477 kfree(kbuf);
1478 return ret;
635} 1479}
636 1480
637static const struct file_operations kprobe_events_ops = { 1481static const struct file_operations kprobe_events_ops = {
@@ -751,8 +1595,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1595 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
752 1596
753 if (!filter_current_check_discard(buffer, call, entry, event)) 1597 if (!filter_current_check_discard(buffer, call, entry, event))
754 trace_buffer_unlock_commit_regs(buffer, event, 1598 trace_nowake_buffer_unlock_commit_regs(buffer, event,
755 irq_flags, pc, regs); 1599 irq_flags, pc, regs);
756} 1600}
757 1601
758/* Kretprobe handler */ 1602/* Kretprobe handler */
@@ -784,8 +1628,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1628 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
785 1629
786 if (!filter_current_check_discard(buffer, call, entry, event)) 1630 if (!filter_current_check_discard(buffer, call, entry, event))
787 trace_buffer_unlock_commit_regs(buffer, event, 1631 trace_nowake_buffer_unlock_commit_regs(buffer, event,
788 irq_flags, pc, regs); 1632 irq_flags, pc, regs);
789} 1633}
790 1634
791/* Event entry printers */ 1635/* Event entry printers */
@@ -867,6 +1711,16 @@ partial:
867 return TRACE_TYPE_PARTIAL_LINE; 1711 return TRACE_TYPE_PARTIAL_LINE;
868} 1712}
869 1713
1714#undef DEFINE_FIELD
1715#define DEFINE_FIELD(type, item, name, is_signed) \
1716 do { \
1717 ret = trace_define_field(event_call, #type, name, \
1718 offsetof(typeof(field), item), \
1719 sizeof(field.item), is_signed, \
1720 FILTER_OTHER); \
1721 if (ret) \
1722 return ret; \
1723 } while (0)
870 1724
871static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 1725static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
872{ 1726{
@@ -1002,8 +1856,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1002 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1856 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1003 1857
1004 head = this_cpu_ptr(call->perf_events); 1858 head = this_cpu_ptr(call->perf_events);
1005 perf_trace_buf_submit(entry, size, rctx, 1859 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
1006 entry->ip, 1, regs, head, NULL);
1007} 1860}
1008 1861
1009/* Kretprobe profile handler */ 1862/* Kretprobe profile handler */
@@ -1034,14 +1887,12 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1034 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1887 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1035 1888
1036 head = this_cpu_ptr(call->perf_events); 1889 head = this_cpu_ptr(call->perf_events);
1037 perf_trace_buf_submit(entry, size, rctx, 1890 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1038 entry->ret_ip, 1, regs, head, NULL);
1039} 1891}
1040#endif /* CONFIG_PERF_EVENTS */ 1892#endif /* CONFIG_PERF_EVENTS */
1041 1893
1042static __kprobes 1894static __kprobes
1043int kprobe_register(struct ftrace_event_call *event, 1895int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1044 enum trace_reg type, void *data)
1045{ 1896{
1046 struct trace_probe *tp = (struct trace_probe *)event->data; 1897 struct trace_probe *tp = (struct trace_probe *)event->data;
1047 1898
@@ -1058,11 +1909,6 @@ int kprobe_register(struct ftrace_event_call *event,
1058 case TRACE_REG_PERF_UNREGISTER: 1909 case TRACE_REG_PERF_UNREGISTER:
1059 disable_trace_probe(tp, TP_FLAG_PROFILE); 1910 disable_trace_probe(tp, TP_FLAG_PROFILE);
1060 return 0; 1911 return 0;
1061 case TRACE_REG_PERF_OPEN:
1062 case TRACE_REG_PERF_CLOSE:
1063 case TRACE_REG_PERF_ADD:
1064 case TRACE_REG_PERF_DEL:
1065 return 0;
1066#endif 1912#endif
1067 } 1913 }
1068 return 0; 1914 return 0;
@@ -1199,9 +2045,8 @@ static __init int kprobe_trace_self_tests_init(void)
1199 2045
1200 pr_info("Testing kprobe tracing: "); 2046 pr_info("Testing kprobe tracing: ");
1201 2047
1202 ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " 2048 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1203 "$stack $stack0 +0($stack)", 2049 "$stack $stack0 +0($stack)");
1204 create_trace_probe);
1205 if (WARN_ON_ONCE(ret)) { 2050 if (WARN_ON_ONCE(ret)) {
1206 pr_warning("error on probing function entry.\n"); 2051 pr_warning("error on probing function entry.\n");
1207 warn++; 2052 warn++;
@@ -1215,8 +2060,8 @@ static __init int kprobe_trace_self_tests_init(void)
1215 enable_trace_probe(tp, TP_FLAG_TRACE); 2060 enable_trace_probe(tp, TP_FLAG_TRACE);
1216 } 2061 }
1217 2062
1218 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " 2063 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1219 "$retval", create_trace_probe); 2064 "$retval");
1220 if (WARN_ON_ONCE(ret)) { 2065 if (WARN_ON_ONCE(ret)) {
1221 pr_warning("error on probing function return.\n"); 2066 pr_warning("error on probing function return.\n");
1222 warn++; 2067 warn++;
@@ -1250,13 +2095,13 @@ static __init int kprobe_trace_self_tests_init(void)
1250 } else 2095 } else
1251 disable_trace_probe(tp, TP_FLAG_TRACE); 2096 disable_trace_probe(tp, TP_FLAG_TRACE);
1252 2097
1253 ret = traceprobe_command("-:testprobe", create_trace_probe); 2098 ret = command_trace_probe("-:testprobe");
1254 if (WARN_ON_ONCE(ret)) { 2099 if (WARN_ON_ONCE(ret)) {
1255 pr_warning("error on deleting a probe.\n"); 2100 pr_warning("error on deleting a probe.\n");
1256 warn++; 2101 warn++;
1257 } 2102 }
1258 2103
1259 ret = traceprobe_command("-:testprobe2", create_trace_probe); 2104 ret = command_trace_probe("-:testprobe2");
1260 if (WARN_ON_ONCE(ret)) { 2105 if (WARN_ON_ONCE(ret)) {
1261 pr_warning("error on deleting a probe.\n"); 2106 pr_warning("error on deleting a probe.\n");
1262 warn++; 2107 warn++;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 194d79602dc..51999309a6c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
264 return ret; 264 return ret;
265} 265}
266 266
267int trace_seq_path(struct trace_seq *s, const struct path *path) 267int trace_seq_path(struct trace_seq *s, struct path *path)
268{ 268{
269 unsigned char *p; 269 unsigned char *p;
270 270
@@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
300 unsigned long mask; 300 unsigned long mask;
301 const char *str; 301 const char *str;
302 const char *ret = p->buffer + p->len; 302 const char *ret = p->buffer + p->len;
303 int i, first = 1; 303 int i;
304 304
305 for (i = 0; flag_array[i].name && flags; i++) { 305 for (i = 0; flag_array[i].name && flags; i++) {
306 306
@@ -310,16 +310,14 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
310 310
311 str = flag_array[i].name; 311 str = flag_array[i].name;
312 flags &= ~mask; 312 flags &= ~mask;
313 if (!first && delim) 313 if (p->len && delim)
314 trace_seq_puts(p, delim); 314 trace_seq_puts(p, delim);
315 else
316 first = 0;
317 trace_seq_puts(p, str); 315 trace_seq_puts(p, str);
318 } 316 }
319 317
320 /* check for left over flags */ 318 /* check for left over flags */
321 if (flags) { 319 if (flags) {
322 if (!first && delim) 320 if (p->len && delim)
323 trace_seq_puts(p, delim); 321 trace_seq_puts(p, delim);
324 trace_seq_printf(p, "0x%lx", flags); 322 trace_seq_printf(p, "0x%lx", flags);
325 } 323 }
@@ -346,7 +344,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
346 break; 344 break;
347 } 345 }
348 346
349 if (ret == (const char *)(p->buffer + p->len)) 347 if (!p->len)
350 trace_seq_printf(p, "0x%lx", val); 348 trace_seq_printf(p, "0x%lx", val);
351 349
352 trace_seq_putc(p, 0); 350 trace_seq_putc(p, 0);
@@ -372,7 +370,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
372 break; 370 break;
373 } 371 }
374 372
375 if (ret == (const char *)(p->buffer + p->len)) 373 if (!p->len)
376 trace_seq_printf(p, "0x%llx", val); 374 trace_seq_printf(p, "0x%llx", val);
377 375
378 trace_seq_putc(p, 0); 376 trace_seq_putc(p, 0);
@@ -610,113 +608,68 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
610 return trace_print_lat_fmt(s, entry); 608 return trace_print_lat_fmt(s, entry);
611} 609}
612 610
613static unsigned long preempt_mark_thresh_us = 100; 611static unsigned long preempt_mark_thresh = 100;
614 612
615static int 613static int
616lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) 614lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
615 unsigned long rel_usecs)
617{ 616{
618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; 617 return trace_seq_printf(s, " %4lldus%c: ", abs_usecs,
619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; 618 rel_usecs > preempt_mark_thresh ? '!' :
620 unsigned long long abs_ts = iter->ts - iter->tr->time_start; 619 rel_usecs > 1 ? '+' : ' ');
621 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq;
623
624 if (in_ns) {
625 abs_ts = ns2usecs(abs_ts);
626 rel_ts = ns2usecs(rel_ts);
627 }
628
629 if (verbose && in_ns) {
630 unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC);
631 unsigned long abs_msec = (unsigned long)abs_ts;
632 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
633 unsigned long rel_msec = (unsigned long)rel_ts;
634
635 return trace_seq_printf(
636 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
637 ns2usecs(iter->ts),
638 abs_msec, abs_usec,
639 rel_msec, rel_usec);
640 } else if (verbose && !in_ns) {
641 return trace_seq_printf(
642 s, "[%016llx] %lld (+%lld): ",
643 iter->ts, abs_ts, rel_ts);
644 } else if (!verbose && in_ns) {
645 return trace_seq_printf(
646 s, " %4lldus%c: ",
647 abs_ts,
648 rel_ts > preempt_mark_thresh_us ? '!' :
649 rel_ts > 1 ? '+' : ' ');
650 } else { /* !verbose && !in_ns */
651 return trace_seq_printf(s, " %4lld: ", abs_ts);
652 }
653} 620}
654 621
655int trace_print_context(struct trace_iterator *iter) 622int trace_print_context(struct trace_iterator *iter)
656{ 623{
657 struct trace_seq *s = &iter->seq; 624 struct trace_seq *s = &iter->seq;
658 struct trace_entry *entry = iter->ent; 625 struct trace_entry *entry = iter->ent;
659 unsigned long long t; 626 unsigned long long t = ns2usecs(iter->ts);
660 unsigned long secs, usec_rem; 627 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
628 unsigned long secs = (unsigned long)t;
661 char comm[TASK_COMM_LEN]; 629 char comm[TASK_COMM_LEN];
662 int ret;
663 630
664 trace_find_cmdline(entry->pid, comm); 631 trace_find_cmdline(entry->pid, comm);
665 632
666 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", 633 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
667 comm, entry->pid, iter->cpu); 634 comm, entry->pid, iter->cpu, secs, usec_rem);
668 if (!ret)
669 return 0;
670
671 if (trace_flags & TRACE_ITER_IRQ_INFO) {
672 ret = trace_print_lat_fmt(s, entry);
673 if (!ret)
674 return 0;
675 }
676
677 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
678 t = ns2usecs(iter->ts);
679 usec_rem = do_div(t, USEC_PER_SEC);
680 secs = (unsigned long)t;
681 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
682 } else
683 return trace_seq_printf(s, " %12llu: ", iter->ts);
684} 635}
685 636
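The post-patch trace_print_context() (the second column in this view) formats the timestamp by converting nanoseconds to microseconds and splitting off whole seconds with do_div(). A userspace sketch of the same arithmetic (the sample timestamp is made up):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t ts_ns = 1234567890123ULL;       /* made-up trace timestamp */
            uint64_t t = ts_ns / 1000;               /* ns2usecs() */
            unsigned long usec_rem = t % 1000000;    /* do_div(t, USEC_PER_SEC) remainder */
            unsigned long secs = (unsigned long)(t / 1000000);

            printf("%5lu.%06lu\n", secs, usec_rem);  /* prints " 1234.567890" */
            return 0;
    }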
686int trace_print_lat_context(struct trace_iterator *iter) 637int trace_print_lat_context(struct trace_iterator *iter)
687{ 638{
688 u64 next_ts; 639 u64 next_ts;
689 int ret; 640 int ret;
690 /* trace_find_next_entry will reset ent_size */
691 int ent_size = iter->ent_size;
692 struct trace_seq *s = &iter->seq; 641 struct trace_seq *s = &iter->seq;
693 struct trace_entry *entry = iter->ent, 642 struct trace_entry *entry = iter->ent,
694 *next_entry = trace_find_next_entry(iter, NULL, 643 *next_entry = trace_find_next_entry(iter, NULL,
695 &next_ts); 644 &next_ts);
696 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); 645 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
697 646 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
698 /* Restore the original ent_size */ 647 unsigned long rel_usecs;
699 iter->ent_size = ent_size;
700 648
701 if (!next_entry) 649 if (!next_entry)
702 next_ts = iter->ts; 650 next_ts = iter->ts;
651 rel_usecs = ns2usecs(next_ts - iter->ts);
703 652
704 if (verbose) { 653 if (verbose) {
705 char comm[TASK_COMM_LEN]; 654 char comm[TASK_COMM_LEN];
706 655
707 trace_find_cmdline(entry->pid, comm); 656 trace_find_cmdline(entry->pid, comm);
708 657
709 ret = trace_seq_printf( 658 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]"
710 s, "%16s %5d %3d %d %08x %08lx ", 659 " %ld.%03ldms (+%ld.%03ldms): ", comm,
711 comm, entry->pid, iter->cpu, entry->flags, 660 entry->pid, iter->cpu, entry->flags,
712 entry->preempt_count, iter->idx); 661 entry->preempt_count, iter->idx,
662 ns2usecs(iter->ts),
663 abs_usecs / USEC_PER_MSEC,
664 abs_usecs % USEC_PER_MSEC,
665 rel_usecs / USEC_PER_MSEC,
666 rel_usecs % USEC_PER_MSEC);
713 } else { 667 } else {
714 ret = lat_print_generic(s, entry, iter->cpu); 668 ret = lat_print_generic(s, entry, iter->cpu);
669 if (ret)
670 ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
715 } 671 }
716 672
717 if (ret)
718 ret = lat_print_timestamp(iter, next_ts);
719
720 return ret; 673 return ret;
721} 674}
722 675
@@ -1353,4 +1306,4 @@ __init static int init_events(void)
1353 1306
1354 return 0; 1307 return 0;
1355} 1308}
1356early_initcall(init_events); 1309device_initcall(init_events);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index a9077c1b4ad..1f06468a10d 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -51,10 +51,6 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
51 const char **iter; 51 const char **iter;
52 char *fmt; 52 char *fmt;
53 53
54 /* allocate the trace_printk per cpu buffers */
55 if (start != end)
56 trace_printk_init_buffers();
57
58 mutex_lock(&btrace_mutex); 54 mutex_lock(&btrace_mutex);
59 for (iter = start; iter < end; iter++) { 55 for (iter = start; iter < end; iter++) {
60 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); 56 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
@@ -63,19 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
63 continue; 59 continue;
64 } 60 }
65 61
66 fmt = NULL;
67 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); 62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
68 if (tb_fmt) { 63 if (tb_fmt)
69 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); 64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
70 if (fmt) { 65 if (tb_fmt && fmt) {
71 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
72 strcpy(fmt, *iter); 67 strcpy(fmt, *iter);
73 tb_fmt->fmt = fmt; 68 tb_fmt->fmt = fmt;
74 } else 69 *iter = tb_fmt->fmt;
75 kfree(tb_fmt); 70 } else {
71 kfree(tb_fmt);
72 *iter = NULL;
76 } 73 }
77 *iter = fmt;
78
79 } 74 }
80 mutex_unlock(&btrace_mutex); 75 mutex_unlock(&btrace_mutex);
81} 76}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
deleted file mode 100644
index 412e959709b..00000000000
--- a/kernel/trace/trace_probe.c
+++ /dev/null
@@ -1,839 +0,0 @@
1/*
2 * Common code for probe-based Dynamic events.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * This code was copied from kernel/trace/trace_kprobe.c written by
18 * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
19 *
20 * Updates to make this generic:
21 * Copyright (C) IBM Corporation, 2010-2011
22 * Author: Srikar Dronamraju
23 */
24
25#include "trace_probe.h"
26
27const char *reserved_field_names[] = {
28 "common_type",
29 "common_flags",
30 "common_preempt_count",
31 "common_pid",
32 "common_tgid",
33 FIELD_STRING_IP,
34 FIELD_STRING_RETIP,
35 FIELD_STRING_FUNC,
36};
37
38/* Printing function type */
39#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
40#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
41
42/* Printing in basic type function template */
43#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
44static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
45 const char *name, \
46 void *data, void *ent)\
47{ \
48 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
49} \
50static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
51
52DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
53DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
54DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
55DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
56DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
57DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
58DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
59DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
60
61static inline void *get_rloc_data(u32 *dl)
62{
63 return (u8 *)dl + get_rloc_offs(*dl);
64}
65
66/* For data_loc conversion */
67static inline void *get_loc_data(u32 *dl, void *ent)
68{
69 return (u8 *)ent + get_rloc_offs(*dl);
70}
71
72/* For defining macros, define string/string_size types */
73typedef u32 string;
74typedef u32 string_size;
75
76/* Print type function for string type */
77static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
78 const char *name,
79 void *data, void *ent)
80{
81 int len = *(u32 *)data >> 16;
82
83 if (!len)
84 return trace_seq_printf(s, " %s=(fault)", name);
85 else
86 return trace_seq_printf(s, " %s=\"%s\"", name,
87 (const char *)get_loc_data(data, ent));
88}
89
90static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
91
92#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
93/*
94 * Define macro for basic types - we don't need to define s* types, because
95 * we have to care only about bitwidth at recording time.
96 */
97#define DEFINE_BASIC_FETCH_FUNCS(method) \
98DEFINE_FETCH_##method(u8) \
99DEFINE_FETCH_##method(u16) \
100DEFINE_FETCH_##method(u32) \
101DEFINE_FETCH_##method(u64)
102
103#define CHECK_FETCH_FUNCS(method, fn) \
104 (((FETCH_FUNC_NAME(method, u8) == fn) || \
105 (FETCH_FUNC_NAME(method, u16) == fn) || \
106 (FETCH_FUNC_NAME(method, u32) == fn) || \
107 (FETCH_FUNC_NAME(method, u64) == fn) || \
108 (FETCH_FUNC_NAME(method, string) == fn) || \
109 (FETCH_FUNC_NAME(method, string_size) == fn)) \
110 && (fn != NULL))
111
112/* Data fetch function templates */
113#define DEFINE_FETCH_reg(type) \
114static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
115 void *offset, void *dest) \
116{ \
117 *(type *)dest = (type)regs_get_register(regs, \
118 (unsigned int)((unsigned long)offset)); \
119}
120DEFINE_BASIC_FETCH_FUNCS(reg)
121/* No string on the register */
122#define fetch_reg_string NULL
123#define fetch_reg_string_size NULL
124
125#define DEFINE_FETCH_stack(type) \
126static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
127 void *offset, void *dest) \
128{ \
129 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
130 (unsigned int)((unsigned long)offset)); \
131}
132DEFINE_BASIC_FETCH_FUNCS(stack)
133/* No string on the stack entry */
134#define fetch_stack_string NULL
135#define fetch_stack_string_size NULL
136
137#define DEFINE_FETCH_retval(type) \
138static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
139 void *dummy, void *dest) \
140{ \
141 *(type *)dest = (type)regs_return_value(regs); \
142}
143DEFINE_BASIC_FETCH_FUNCS(retval)
144/* No string on the retval */
145#define fetch_retval_string NULL
146#define fetch_retval_string_size NULL
147
148#define DEFINE_FETCH_memory(type) \
149static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
150 void *addr, void *dest) \
151{ \
152 type retval; \
153 if (probe_kernel_address(addr, retval)) \
154 *(type *)dest = 0; \
155 else \
156 *(type *)dest = retval; \
157}
158DEFINE_BASIC_FETCH_FUNCS(memory)
159/*
160 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
161 * length and relative data location.
162 */
163static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
164 void *addr, void *dest)
165{
166 long ret;
167 int maxlen = get_rloc_len(*(u32 *)dest);
168 u8 *dst = get_rloc_data(dest);
169 u8 *src = addr;
170 mm_segment_t old_fs = get_fs();
171
172 if (!maxlen)
173 return;
174
175 /*
176 * Try to fetch the string again (it was already read once to get its
177 * size), since it can change while we are probing.
178 */
179 set_fs(KERNEL_DS);
180 pagefault_disable();
181
182 do
183 ret = __copy_from_user_inatomic(dst++, src++, 1);
184 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
185
186 dst[-1] = '\0';
187 pagefault_enable();
188 set_fs(old_fs);
189
190 if (ret < 0) { /* Failed to fetch string */
191 ((u8 *)get_rloc_data(dest))[0] = '\0';
192 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
193 } else {
194 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
195 get_rloc_offs(*(u32 *)dest));
196 }
197}
198
199/* Return the length of the string, including the terminating null byte */
200static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
201 void *addr, void *dest)
202{
203 mm_segment_t old_fs;
204 int ret, len = 0;
205 u8 c;
206
207 old_fs = get_fs();
208 set_fs(KERNEL_DS);
209 pagefault_disable();
210
211 do {
212 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
213 len++;
214 } while (c && ret == 0 && len < MAX_STRING_SIZE);
215
216 pagefault_enable();
217 set_fs(old_fs);
218
219 if (ret < 0) /* Failed to check the length */
220 *(u32 *)dest = 0;
221 else
222 *(u32 *)dest = len;
223}
224
225/* Memory fetching by symbol */
226struct symbol_cache {
227 char *symbol;
228 long offset;
229 unsigned long addr;
230};
231
232static unsigned long update_symbol_cache(struct symbol_cache *sc)
233{
234 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
235
236 if (sc->addr)
237 sc->addr += sc->offset;
238
239 return sc->addr;
240}
241
242static void free_symbol_cache(struct symbol_cache *sc)
243{
244 kfree(sc->symbol);
245 kfree(sc);
246}
247
248static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
249{
250 struct symbol_cache *sc;
251
252 if (!sym || strlen(sym) == 0)
253 return NULL;
254
255 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
256 if (!sc)
257 return NULL;
258
259 sc->symbol = kstrdup(sym, GFP_KERNEL);
260 if (!sc->symbol) {
261 kfree(sc);
262 return NULL;
263 }
264 sc->offset = offset;
265 update_symbol_cache(sc);
266
267 return sc;
268}
269
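/*
 * Illustrative usage sketch (hypothetical symbol, error handling trimmed,
 * not part of the original file): the address is resolved once at parse
 * time and can be refreshed later with update_symbol_cache(), e.g. when
 * new symbols appear after a module load.
 */
static __maybe_unused int example_symbol_fetch_setup(struct fetch_param *f,
						     const struct fetch_type *t)
{
	struct symbol_cache *sc = alloc_symbol_cache("jiffies", 0); /* "@jiffies+0" */

	if (!sc)
		return -ENOMEM;

	f->fn = t->fetch[FETCH_MTD_symbol];	/* fetch_symbol_<width> */
	f->data = sc;				/* released via free_symbol_cache() */
	return 0;
}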
270#define DEFINE_FETCH_symbol(type) \
271static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
272 void *data, void *dest) \
273{ \
274 struct symbol_cache *sc = data; \
275 if (sc->addr) \
276 fetch_memory_##type(regs, (void *)sc->addr, dest); \
277 else \
278 *(type *)dest = 0; \
279}
280DEFINE_BASIC_FETCH_FUNCS(symbol)
281DEFINE_FETCH_symbol(string)
282DEFINE_FETCH_symbol(string_size)
283
284/* Dereference memory access function */
285struct deref_fetch_param {
286 struct fetch_param orig;
287 long offset;
288};
289
290#define DEFINE_FETCH_deref(type) \
291static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
292 void *data, void *dest) \
293{ \
294 struct deref_fetch_param *dprm = data; \
295 unsigned long addr; \
296 call_fetch(&dprm->orig, regs, &addr); \
297 if (addr) { \
298 addr += dprm->offset; \
299 fetch_memory_##type(regs, (void *)addr, dest); \
300 } else \
301 *(type *)dest = 0; \
302}
303DEFINE_BASIC_FETCH_FUNCS(deref)
304DEFINE_FETCH_deref(string)
305DEFINE_FETCH_deref(string_size)
306
307static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
308{
309 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
310 update_deref_fetch_param(data->orig.data);
311 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
312 update_symbol_cache(data->orig.data);
313}
314
315static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
316{
317 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
318 free_deref_fetch_param(data->orig.data);
319 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
320 free_symbol_cache(data->orig.data);
321 kfree(data);
322}
323
324/* Bitfield fetch function */
325struct bitfield_fetch_param {
326 struct fetch_param orig;
327 unsigned char hi_shift;
328 unsigned char low_shift;
329};
330
331#define DEFINE_FETCH_bitfield(type) \
332static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
333 void *data, void *dest) \
334{ \
335 struct bitfield_fetch_param *bprm = data; \
336 type buf = 0; \
337 call_fetch(&bprm->orig, regs, &buf); \
338 if (buf) { \
339 buf <<= bprm->hi_shift; \
340 buf >>= bprm->low_shift; \
341 } \
342 *(type *)dest = buf; \
343}
344
345DEFINE_BASIC_FETCH_FUNCS(bitfield)
346#define fetch_bitfield_string NULL
347#define fetch_bitfield_string_size NULL
348
349static __kprobes void
350update_bitfield_fetch_param(struct bitfield_fetch_param *data)
351{
352 /*
353 * Don't check the bitfield itself, because this must be the
354 * last fetch function.
355 */
356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
357 update_deref_fetch_param(data->orig.data);
358 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
359 update_symbol_cache(data->orig.data);
360}
361
362static __kprobes void
363free_bitfield_fetch_param(struct bitfield_fetch_param *data)
364{
365 /*
366 * Don't check the bitfield itself, because this must be the
367 * last fetch function.
368 */
369 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
370 free_deref_fetch_param(data->orig.data);
371 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
372 free_symbol_cache(data->orig.data);
373
374 kfree(data);
375}
376
377/* Default (unsigned long) fetch type */
378#define __DEFAULT_FETCH_TYPE(t) u##t
379#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
380#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
381#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
382
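/*
 * Illustrative expansion on a 64-bit kernel (BITS_PER_LONG == 64):
 *
 *	DEFAULT_FETCH_TYPE     -> _DEFAULT_FETCH_TYPE(64) -> u64
 *	DEFAULT_FETCH_TYPE_STR -> "u64"
 *
 * so an argument written without an explicit ":type" suffix is fetched and
 * printed as a register-sized unsigned value (see find_fetch_type() below).
 */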
383#define ASSIGN_FETCH_FUNC(method, type) \
384 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
385
386#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
387 {.name = _name, \
388 .size = _size, \
389 .is_signed = sign, \
390 .print = PRINT_TYPE_FUNC_NAME(ptype), \
391 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
392 .fmttype = _fmttype, \
393 .fetch = { \
394ASSIGN_FETCH_FUNC(reg, ftype), \
395ASSIGN_FETCH_FUNC(stack, ftype), \
396ASSIGN_FETCH_FUNC(retval, ftype), \
397ASSIGN_FETCH_FUNC(memory, ftype), \
398ASSIGN_FETCH_FUNC(symbol, ftype), \
399ASSIGN_FETCH_FUNC(deref, ftype), \
400ASSIGN_FETCH_FUNC(bitfield, ftype), \
401 } \
402 }
403
404#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
405 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
406
407#define FETCH_TYPE_STRING 0
408#define FETCH_TYPE_STRSIZE 1
409
410/* Fetch type information table */
411static const struct fetch_type fetch_type_table[] = {
412 /* Special types */
413 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
414 sizeof(u32), 1, "__data_loc char[]"),
415 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
416 string_size, sizeof(u32), 0, "u32"),
417 /* Basic types */
418 ASSIGN_FETCH_TYPE(u8, u8, 0),
419 ASSIGN_FETCH_TYPE(u16, u16, 0),
420 ASSIGN_FETCH_TYPE(u32, u32, 0),
421 ASSIGN_FETCH_TYPE(u64, u64, 0),
422 ASSIGN_FETCH_TYPE(s8, u8, 1),
423 ASSIGN_FETCH_TYPE(s16, u16, 1),
424 ASSIGN_FETCH_TYPE(s32, u32, 1),
425 ASSIGN_FETCH_TYPE(s64, u64, 1),
426};
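/*
 * Illustrative expansion of one entry above, ASSIGN_FETCH_TYPE(s16, u16, 1):
 *
 *	{ .name = "s16", .size = sizeof(u16), .is_signed = 1,
 *	  .print = PRINT_TYPE_FUNC_NAME(s16), .fmt = PRINT_TYPE_FMT_NAME(s16),
 *	  .fmttype = "s16",
 *	  .fetch = {
 *		[FETCH_MTD_reg]      = fetch_reg_u16,
 *		[FETCH_MTD_stack]    = fetch_stack_u16,
 *		[FETCH_MTD_retval]   = fetch_retval_u16,
 *		[FETCH_MTD_memory]   = fetch_memory_u16,
 *		[FETCH_MTD_symbol]   = fetch_symbol_u16,
 *		[FETCH_MTD_deref]    = fetch_deref_u16,
 *		[FETCH_MTD_bitfield] = fetch_bitfield_u16,
 *	  } }
 *
 * i.e. the value is recorded with u16 width but printed through the s16
 * print function with its "%d" format.
 */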
427
428static const struct fetch_type *find_fetch_type(const char *type)
429{
430 int i;
431
432 if (!type)
433 type = DEFAULT_FETCH_TYPE_STR;
434
435 /* Special case: bitfield */
436 if (*type == 'b') {
437 unsigned long bs;
438
439 type = strchr(type, '/');
440 if (!type)
441 goto fail;
442
443 type++;
444 if (kstrtoul(type, 0, &bs))
445 goto fail;
446
447 switch (bs) {
448 case 8:
449 return find_fetch_type("u8");
450 case 16:
451 return find_fetch_type("u16");
452 case 32:
453 return find_fetch_type("u32");
454 case 64:
455 return find_fetch_type("u64");
456 default:
457 goto fail;
458 }
459 }
460
461 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
462 if (strcmp(type, fetch_type_table[i].name) == 0)
463 return &fetch_type_table[i];
464
465fail:
466 return NULL;
467}
468
469/* Special function: only accepts unsigned long */
470static __kprobes void fetch_stack_address(struct pt_regs *regs,
471 void *dummy, void *dest)
472{
473 *(unsigned long *)dest = kernel_stack_pointer(regs);
474}
475
476static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
477 fetch_func_t orig_fn)
478{
479 int i;
480
481 if (type != &fetch_type_table[FETCH_TYPE_STRING])
482 return NULL; /* Only string type needs size function */
483
484 for (i = 0; i < FETCH_MTD_END; i++)
485 if (type->fetch[i] == orig_fn)
486 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
487
488 WARN_ON(1); /* This should not happen */
489
490 return NULL;
491}
492
493/* Split symbol and offset. */
494int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
495{
496 char *tmp;
497 int ret;
498
499 if (!offset)
500 return -EINVAL;
501
502 tmp = strchr(symbol, '+');
503 if (tmp) {
504 /* skip sign because kstrtoul doesn't accept '+' */
505 ret = kstrtoul(tmp + 1, 0, offset);
506 if (ret)
507 return ret;
508
509 *tmp = '\0';
510 } else
511 *offset = 0;
512
513 return 0;
514}
515
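/*
 * Illustrative behaviour (hypothetical inputs):
 *
 *	"vfs_read+0x10" -> symbol "vfs_read", *offset = 0x10, returns 0
 *	"do_fork"       -> symbol "do_fork",  *offset = 0,    returns 0
 *	"foo+bar"       -> kstrtoul() fails and its error is returned
 *
 * Note that the '+' is overwritten with '\0' in place, so the caller must
 * pass a writable buffer.
 */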
516#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
517
518static int parse_probe_vars(char *arg, const struct fetch_type *t,
519 struct fetch_param *f, bool is_return)
520{
521 int ret = 0;
522 unsigned long param;
523
524 if (strcmp(arg, "retval") == 0) {
525 if (is_return)
526 f->fn = t->fetch[FETCH_MTD_retval];
527 else
528 ret = -EINVAL;
529 } else if (strncmp(arg, "stack", 5) == 0) {
530 if (arg[5] == '\0') {
531 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
532 f->fn = fetch_stack_address;
533 else
534 ret = -EINVAL;
535 } else if (isdigit(arg[5])) {
536 ret = kstrtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK)
538 ret = -EINVAL;
539 else {
540 f->fn = t->fetch[FETCH_MTD_stack];
541 f->data = (void *)param;
542 }
543 } else
544 ret = -EINVAL;
545 } else
546 ret = -EINVAL;
547
548 return ret;
549}
550
551/* Recursive argument parser */
552static int parse_probe_arg(char *arg, const struct fetch_type *t,
553 struct fetch_param *f, bool is_return, bool is_kprobe)
554{
555 unsigned long param;
556 long offset;
557 char *tmp;
558 int ret;
559
560 ret = 0;
561
562	/* For now, uprobe_events supports only register arguments */
563 if (!is_kprobe && arg[0] != '%')
564 return -EINVAL;
565
566 switch (arg[0]) {
567 case '$':
568 ret = parse_probe_vars(arg + 1, t, f, is_return);
569 break;
570
571 case '%': /* named register */
572 ret = regs_query_register_offset(arg + 1);
573 if (ret >= 0) {
574 f->fn = t->fetch[FETCH_MTD_reg];
575 f->data = (void *)(unsigned long)ret;
576 ret = 0;
577 }
578 break;
579
580 case '@': /* memory or symbol */
581 if (isdigit(arg[1])) {
582 ret = kstrtoul(arg + 1, 0, &param);
583 if (ret)
584 break;
585
586 f->fn = t->fetch[FETCH_MTD_memory];
587 f->data = (void *)param;
588 } else {
589 ret = traceprobe_split_symbol_offset(arg + 1, &offset);
590 if (ret)
591 break;
592
593 f->data = alloc_symbol_cache(arg + 1, offset);
594 if (f->data)
595 f->fn = t->fetch[FETCH_MTD_symbol];
596 }
597 break;
598
599 case '+': /* deref memory */
600 arg++; /* Skip '+', because kstrtol() rejects it. */
601 case '-':
602 tmp = strchr(arg, '(');
603 if (!tmp)
604 break;
605
606 *tmp = '\0';
607 ret = kstrtol(arg, 0, &offset);
608
609 if (ret)
610 break;
611
612 arg = tmp + 1;
613 tmp = strrchr(arg, ')');
614
615 if (tmp) {
616 struct deref_fetch_param *dprm;
617 const struct fetch_type *t2;
618
619 t2 = find_fetch_type(NULL);
620 *tmp = '\0';
621 dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
622
623 if (!dprm)
624 return -ENOMEM;
625
626 dprm->offset = offset;
627 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
628 is_kprobe);
629 if (ret)
630 kfree(dprm);
631 else {
632 f->fn = t->fetch[FETCH_MTD_deref];
633 f->data = (void *)dprm;
634 }
635 }
636 break;
637 }
638	if (!ret && !f->fn) {	/* Parsed, but no fetch method was found */
639 pr_info("%s type has no corresponding fetch method.\n", t->name);
640 ret = -EINVAL;
641 }
642
643 return ret;
644}
645
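/*
 * Illustrative sketch only (not part of the original file): how a caller
 * could drive the parser above for one kprobe-style argument string such
 * as "+8(%sp):u32" or "@jiffies:s64".  The leading character selects the
 * fetch method: '$' probe variables, '%' register, '@' memory address or
 * symbol, '+'/'-' dereference with offset.
 */
static __maybe_unused int example_parse_one_arg(char *arg, bool is_return,
						bool is_kprobe)
{
	struct fetch_param f = {};
	const struct fetch_type *t;
	char *type_str = strchr(arg, ':');

	if (type_str)
		*type_str++ = '\0';		/* split "value:type" */

	t = find_fetch_type(type_str);		/* NULL -> DEFAULT_FETCH_TYPE */
	if (!t)
		return -EINVAL;

	return parse_probe_arg(arg, t, &f, is_return, is_kprobe);
}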
646#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
647
648/* Bitfield type needs to be parsed into a fetch function */
649static int __parse_bitfield_probe_arg(const char *bf,
650 const struct fetch_type *t,
651 struct fetch_param *f)
652{
653 struct bitfield_fetch_param *bprm;
654 unsigned long bw, bo;
655 char *tail;
656
657 if (*bf != 'b')
658 return 0;
659
660 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
661 if (!bprm)
662 return -ENOMEM;
663
664 bprm->orig = *f;
665 f->fn = t->fetch[FETCH_MTD_bitfield];
666 f->data = (void *)bprm;
667 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
668
669 if (bw == 0 || *tail != '@')
670 return -EINVAL;
671
672 bf = tail + 1;
673 bo = simple_strtoul(bf, &tail, 0);
674
675 if (tail == bf || *tail != '/')
676 return -EINVAL;
677
678 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
679 bprm->low_shift = bprm->hi_shift + bo;
680
681 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
682}
683
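/*
 * Worked example of the shift arithmetic above for "b4@2/32" - a 4-bit
 * field at bit offset 2 (from the least significant bit) inside a 32-bit
 * container, so t->size == 4 and BYTES_TO_BITS(t->size) == 32:
 *
 *	bw = 4, bo = 2
 *	hi_shift  = 32 - (4 + 2) = 26
 *	low_shift = 26 + 2       = 28
 *
 * fetch_bitfield_u32() then computes (val << 26) >> 28: the left shift
 * discards the 26 bits above the field and the right shift discards the
 * 2 bits below it, leaving the 4-bit field right-aligned in the result.
 */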
684/* String length checking wrapper */
685int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
686 struct probe_arg *parg, bool is_return, bool is_kprobe)
687{
688 const char *t;
689 int ret;
690
691 if (strlen(arg) > MAX_ARGSTR_LEN) {
692		pr_info("Argument is too long: %s\n", arg);
693 return -ENOSPC;
694 }
695 parg->comm = kstrdup(arg, GFP_KERNEL);
696 if (!parg->comm) {
697 pr_info("Failed to allocate memory for command '%s'.\n", arg);
698 return -ENOMEM;
699 }
700 t = strchr(parg->comm, ':');
701 if (t) {
702 arg[t - parg->comm] = '\0';
703 t++;
704 }
705 parg->type = find_fetch_type(t);
706 if (!parg->type) {
707 pr_info("Unsupported type: %s\n", t);
708 return -EINVAL;
709 }
710 parg->offset = *size;
711 *size += parg->type->size;
712 ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe);
713
714 if (ret >= 0 && t != NULL)
715 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
716
717 if (ret >= 0) {
718 parg->fetch_size.fn = get_fetch_size_function(parg->type,
719 parg->fetch.fn);
720 parg->fetch_size.data = parg->fetch.data;
721 }
722
723 return ret;
724}
725
726/* Return 1 if name is reserved or already used by another argument */
727int traceprobe_conflict_field_name(const char *name,
728 struct probe_arg *args, int narg)
729{
730 int i;
731
732 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
733 if (strcmp(reserved_field_names[i], name) == 0)
734 return 1;
735
736 for (i = 0; i < narg; i++)
737 if (strcmp(args[i].name, name) == 0)
738 return 1;
739
740 return 0;
741}
742
743void traceprobe_update_arg(struct probe_arg *arg)
744{
745 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
746 update_bitfield_fetch_param(arg->fetch.data);
747 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
748 update_deref_fetch_param(arg->fetch.data);
749 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
750 update_symbol_cache(arg->fetch.data);
751}
752
753void traceprobe_free_probe_arg(struct probe_arg *arg)
754{
755 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
756 free_bitfield_fetch_param(arg->fetch.data);
757 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
758 free_deref_fetch_param(arg->fetch.data);
759 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
760 free_symbol_cache(arg->fetch.data);
761
762 kfree(arg->name);
763 kfree(arg->comm);
764}
765
766int traceprobe_command(const char *buf, int (*createfn)(int, char **))
767{
768 char **argv;
769 int argc, ret;
770
771 argc = 0;
772 ret = 0;
773 argv = argv_split(GFP_KERNEL, buf, &argc);
774 if (!argv)
775 return -ENOMEM;
776
777 if (argc)
778 ret = createfn(argc, argv);
779
780 argv_free(argv);
781
782 return ret;
783}
784
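/*
 * Illustrative usage sketch (hypothetical callback and command string, not
 * part of the original file): traceprobe_command() splits one definition
 * line into an argv[] and hands it to the creation callback, e.g. a line a
 * user would write to the kprobe_events file.
 */
static int example_createfn(int argc, char **argv)
{
	int i;

	for (i = 0; i < argc; i++)
		pr_info("arg[%d] = %s\n", i, argv[i]);

	return 0;
}

static __maybe_unused int example_run_command(void)
{
	/* "p:myevent do_sys_open dfd=%ax" -> argc == 3 */
	return traceprobe_command("p:myevent do_sys_open dfd=%ax",
				  example_createfn);
}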
785#define WRITE_BUFSIZE 4096
786
787ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
788 size_t count, loff_t *ppos,
789 int (*createfn)(int, char **))
790{
791 char *kbuf, *tmp;
792 int ret = 0;
793 size_t done = 0;
794 size_t size;
795
796 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
797 if (!kbuf)
798 return -ENOMEM;
799
800 while (done < count) {
801 size = count - done;
802
803 if (size >= WRITE_BUFSIZE)
804 size = WRITE_BUFSIZE - 1;
805
806 if (copy_from_user(kbuf, buffer + done, size)) {
807 ret = -EFAULT;
808 goto out;
809 }
810 kbuf[size] = '\0';
811 tmp = strchr(kbuf, '\n');
812
813 if (tmp) {
814 *tmp = '\0';
815 size = tmp - kbuf + 1;
816 } else if (done + size < count) {
817 pr_warning("Line length is too long: "
818 "Should be less than %d.", WRITE_BUFSIZE);
819 ret = -EINVAL;
820 goto out;
821 }
822 done += size;
823 /* Remove comments */
824 tmp = strchr(kbuf, '#');
825
826 if (tmp)
827 *tmp = '\0';
828
829 ret = traceprobe_command(kbuf, createfn);
830 if (ret)
831 goto out;
832 }
833 ret = done;
834
835out:
836 kfree(kbuf);
837
838 return ret;
839}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
deleted file mode 100644
index 93370867781..00000000000
--- a/kernel/trace/trace_probe.h
+++ /dev/null
@@ -1,161 +0,0 @@
1/*
2 * Common header file for probe-based Dynamic events.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * This code was copied from kernel/trace/trace_kprobe.h written by
18 * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
19 *
20 * Updates to make this generic:
21 * Copyright (C) IBM Corporation, 2010-2011
22 * Author: Srikar Dronamraju
23 */
24
25#include <linux/seq_file.h>
26#include <linux/slab.h>
27#include <linux/smp.h>
28#include <linux/debugfs.h>
29#include <linux/types.h>
30#include <linux/string.h>
31#include <linux/ctype.h>
32#include <linux/ptrace.h>
33#include <linux/perf_event.h>
34#include <linux/kprobes.h>
35#include <linux/stringify.h>
36#include <linux/limits.h>
37#include <linux/uaccess.h>
38#include <asm/bitsperlong.h>
39
40#include "trace.h"
41#include "trace_output.h"
42
43#define MAX_TRACE_ARGS 128
44#define MAX_ARGSTR_LEN 63
45#define MAX_EVENT_NAME_LEN 64
46#define MAX_STRING_SIZE PATH_MAX
47
48/* Reserved field names */
49#define FIELD_STRING_IP "__probe_ip"
50#define FIELD_STRING_RETIP "__probe_ret_ip"
51#define FIELD_STRING_FUNC "__probe_func"
52
53#undef DEFINE_FIELD
54#define DEFINE_FIELD(type, item, name, is_signed) \
55 do { \
56 ret = trace_define_field(event_call, #type, name, \
57 offsetof(typeof(field), item), \
58 sizeof(field.item), is_signed, \
59 FILTER_OTHER); \
60 if (ret) \
61 return ret; \
62 } while (0)
63
64
65/* Flags for trace_probe */
66#define TP_FLAG_TRACE 1
67#define TP_FLAG_PROFILE 2
68#define TP_FLAG_REGISTERED 4
69#define TP_FLAG_UPROBE 8
70
71
72/* data_rloc: data relative location, compatible with u32 */
73#define make_data_rloc(len, roffs) \
74 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
75#define get_rloc_len(dl) ((u32)(dl) >> 16)
76#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
77
78/*
79 * Convert data_rloc to data_loc:
80 * data_rloc stores the offset from data_rloc itself, but data_loc
81 * stores the offset from event entry.
82 */
83#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
84
85/* Data fetch function type */
86typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
87/* Printing function type */
88typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *);
89
90/* Fetch types */
91enum {
92 FETCH_MTD_reg = 0,
93 FETCH_MTD_stack,
94 FETCH_MTD_retval,
95 FETCH_MTD_memory,
96 FETCH_MTD_symbol,
97 FETCH_MTD_deref,
98 FETCH_MTD_bitfield,
99 FETCH_MTD_END,
100};
101
102/* Fetch type information table */
103struct fetch_type {
104 const char *name; /* Name of type */
105 size_t size; /* Byte size of type */
106 int is_signed; /* Signed flag */
107 print_type_func_t print; /* Print function */
108 const char *fmt; /* Format string */
109 const char *fmttype; /* Name in format file */
110 /* Fetch functions */
111 fetch_func_t fetch[FETCH_MTD_END];
112};
113
114struct fetch_param {
115 fetch_func_t fn;
116 void *data;
117};
118
119struct probe_arg {
120 struct fetch_param fetch;
121 struct fetch_param fetch_size;
122 unsigned int offset; /* Offset from argument entry */
123 const char *name; /* Name of this argument */
124 const char *comm; /* Command of this argument */
125 const struct fetch_type *type; /* Type of this argument */
126};
127
128static inline __kprobes void call_fetch(struct fetch_param *fprm,
129 struct pt_regs *regs, void *dest)
130{
131 return fprm->fn(regs, fprm->data, dest);
132}
133
134/* Check the name is good for event/group/fields */
135static inline int is_good_name(const char *name)
136{
137 if (!isalpha(*name) && *name != '_')
138 return 0;
139 while (*++name != '\0') {
140 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
141 return 0;
142 }
143 return 1;
144}
145
146extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
147 struct probe_arg *parg, bool is_return, bool is_kprobe);
148
149extern int traceprobe_conflict_field_name(const char *name,
150 struct probe_arg *args, int narg);
151
152extern void traceprobe_update_arg(struct probe_arg *arg);
153extern void traceprobe_free_probe_arg(struct probe_arg *arg);
154
155extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
156
157extern ssize_t traceprobe_probes_write(struct file *file,
158 const char __user *buffer, size_t count, loff_t *ppos,
159 int (*createfn)(int, char**));
160
161extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3374c792ccd..7e62c0a1845 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -102,7 +102,9 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
102 entry->next_cpu = task_cpu(wakee); 102 entry->next_cpu = task_cpu(wakee);
103 103
104 if (!filter_check_discard(call, entry, buffer, event)) 104 if (!filter_check_discard(call, entry, buffer, event))
105 trace_buffer_unlock_commit(buffer, event, flags, pc); 105 ring_buffer_unlock_commit(buffer, event);
106 ftrace_trace_stack(tr->buffer, flags, 6, pc);
107 ftrace_trace_userstack(tr->buffer, flags, pc);
106} 108}
107 109
108static void 110static void
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9fe45fcefca..e4a70c0c71b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 William Lee Irwin III
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
@@ -108,8 +108,7 @@ out_enable:
108 * wakeup uses its own tracer function to keep the overhead down: 108 * wakeup uses its own tracer function to keep the overhead down:
109 */ 109 */
110static void 110static void
111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, 111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
112 struct ftrace_ops *op, struct pt_regs *pt_regs)
113{ 112{
114 struct trace_array *tr = wakeup_trace; 113 struct trace_array *tr = wakeup_trace;
115 struct trace_array_cpu *data; 114 struct trace_array_cpu *data;
@@ -130,7 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
130static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
131{ 130{
132 .func = wakeup_tracer_call, 131 .func = wakeup_tracer_call,
133 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, 132 .flags = FTRACE_OPS_FL_GLOBAL,
134}; 133};
135#endif /* CONFIG_FUNCTION_TRACER */ 134#endif /* CONFIG_FUNCTION_TRACER */
136 135
@@ -281,20 +280,9 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
281} 280}
282 281
283static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } 282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
283static void wakeup_print_header(struct seq_file *s) { }
284static void wakeup_trace_open(struct trace_iterator *iter) { } 284static void wakeup_trace_open(struct trace_iterator *iter) { }
285static void wakeup_trace_close(struct trace_iterator *iter) { } 285static void wakeup_trace_close(struct trace_iterator *iter) { }
286
287#ifdef CONFIG_FUNCTION_TRACER
288static void wakeup_print_header(struct seq_file *s)
289{
290 trace_default_header(s);
291}
292#else
293static void wakeup_print_header(struct seq_file *s)
294{
295 trace_latency_header(s);
296}
297#endif /* CONFIG_FUNCTION_TRACER */
298#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
299 287
300/* 288/*
@@ -589,7 +577,7 @@ static struct tracer wakeup_tracer __read_mostly =
589 .reset = wakeup_tracer_reset, 577 .reset = wakeup_tracer_reset,
590 .start = wakeup_tracer_start, 578 .start = wakeup_tracer_start,
591 .stop = wakeup_tracer_stop, 579 .stop = wakeup_tracer_stop,
592 .print_max = true, 580 .print_max = 1,
593 .print_header = wakeup_print_header, 581 .print_header = wakeup_print_header,
594 .print_line = wakeup_print_line, 582 .print_line = wakeup_print_line,
595 .flags = &tracer_flags, 583 .flags = &tracer_flags,
@@ -599,7 +587,7 @@ static struct tracer wakeup_tracer __read_mostly =
599#endif 587#endif
600 .open = wakeup_trace_open, 588 .open = wakeup_trace_open,
601 .close = wakeup_trace_close, 589 .close = wakeup_trace_close,
602 .use_max_tr = true, 590 .use_max_tr = 1,
603}; 591};
604 592
605static struct tracer wakeup_rt_tracer __read_mostly = 593static struct tracer wakeup_rt_tracer __read_mostly =
@@ -610,7 +598,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
610 .start = wakeup_tracer_start, 598 .start = wakeup_tracer_start,
611 .stop = wakeup_tracer_stop, 599 .stop = wakeup_tracer_stop,
612 .wait_pipe = poll_wait_pipe, 600 .wait_pipe = poll_wait_pipe,
613 .print_max = true, 601 .print_max = 1,
614 .print_header = wakeup_print_header, 602 .print_header = wakeup_print_header,
615 .print_line = wakeup_print_line, 603 .print_line = wakeup_print_line,
616 .flags = &tracer_flags, 604 .flags = &tracer_flags,
@@ -620,7 +608,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
620#endif 608#endif
621 .open = wakeup_trace_open, 609 .open = wakeup_trace_open,
622 .close = wakeup_trace_close, 610 .close = wakeup_trace_close,
623 .use_max_tr = true, 611 .use_max_tr = 1,
624}; 612};
625 613
626__init static int init_wakeup_tracer(void) 614__init static int init_wakeup_tracer(void)
@@ -637,4 +625,4 @@ __init static int init_wakeup_tracer(void)
637 625
638 return 0; 626 return 0;
639} 627}
640core_initcall(init_wakeup_tracer); 628device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 47623169a81..288541f977f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -103,67 +103,54 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
103 103
104static int trace_selftest_test_probe1_cnt; 104static int trace_selftest_test_probe1_cnt;
105static void trace_selftest_test_probe1_func(unsigned long ip, 105static void trace_selftest_test_probe1_func(unsigned long ip,
106 unsigned long pip, 106 unsigned long pip)
107 struct ftrace_ops *op,
108 struct pt_regs *pt_regs)
109{ 107{
110 trace_selftest_test_probe1_cnt++; 108 trace_selftest_test_probe1_cnt++;
111} 109}
112 110
113static int trace_selftest_test_probe2_cnt; 111static int trace_selftest_test_probe2_cnt;
114static void trace_selftest_test_probe2_func(unsigned long ip, 112static void trace_selftest_test_probe2_func(unsigned long ip,
115 unsigned long pip, 113 unsigned long pip)
116 struct ftrace_ops *op,
117 struct pt_regs *pt_regs)
118{ 114{
119 trace_selftest_test_probe2_cnt++; 115 trace_selftest_test_probe2_cnt++;
120} 116}
121 117
122static int trace_selftest_test_probe3_cnt; 118static int trace_selftest_test_probe3_cnt;
123static void trace_selftest_test_probe3_func(unsigned long ip, 119static void trace_selftest_test_probe3_func(unsigned long ip,
124 unsigned long pip, 120 unsigned long pip)
125 struct ftrace_ops *op,
126 struct pt_regs *pt_regs)
127{ 121{
128 trace_selftest_test_probe3_cnt++; 122 trace_selftest_test_probe3_cnt++;
129} 123}
130 124
131static int trace_selftest_test_global_cnt; 125static int trace_selftest_test_global_cnt;
132static void trace_selftest_test_global_func(unsigned long ip, 126static void trace_selftest_test_global_func(unsigned long ip,
133 unsigned long pip, 127 unsigned long pip)
134 struct ftrace_ops *op,
135 struct pt_regs *pt_regs)
136{ 128{
137 trace_selftest_test_global_cnt++; 129 trace_selftest_test_global_cnt++;
138} 130}
139 131
140static int trace_selftest_test_dyn_cnt; 132static int trace_selftest_test_dyn_cnt;
141static void trace_selftest_test_dyn_func(unsigned long ip, 133static void trace_selftest_test_dyn_func(unsigned long ip,
142 unsigned long pip, 134 unsigned long pip)
143 struct ftrace_ops *op,
144 struct pt_regs *pt_regs)
145{ 135{
146 trace_selftest_test_dyn_cnt++; 136 trace_selftest_test_dyn_cnt++;
147} 137}
148 138
149static struct ftrace_ops test_probe1 = { 139static struct ftrace_ops test_probe1 = {
150 .func = trace_selftest_test_probe1_func, 140 .func = trace_selftest_test_probe1_func,
151 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
152}; 141};
153 142
154static struct ftrace_ops test_probe2 = { 143static struct ftrace_ops test_probe2 = {
155 .func = trace_selftest_test_probe2_func, 144 .func = trace_selftest_test_probe2_func,
156 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
157}; 145};
158 146
159static struct ftrace_ops test_probe3 = { 147static struct ftrace_ops test_probe3 = {
160 .func = trace_selftest_test_probe3_func, 148 .func = trace_selftest_test_probe3_func,
161 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
162}; 149};
163 150
164static struct ftrace_ops test_global = { 151static struct ftrace_ops test_global = {
165 .func = trace_selftest_test_global_func, 152 .func = trace_selftest_test_global_func,
166 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, 153 .flags = FTRACE_OPS_FL_GLOBAL,
167}; 154};
168 155
169static void print_counts(void) 156static void print_counts(void)
@@ -320,6 +307,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
320 int (*func)(void)) 307 int (*func)(void))
321{ 308{
322 int save_ftrace_enabled = ftrace_enabled; 309 int save_ftrace_enabled = ftrace_enabled;
310 int save_tracer_enabled = tracer_enabled;
323 unsigned long count; 311 unsigned long count;
324 char *func_name; 312 char *func_name;
325 int ret; 313 int ret;
@@ -330,6 +318,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
330 318
331 /* enable tracing, and record the filter function */ 319 /* enable tracing, and record the filter function */
332 ftrace_enabled = 1; 320 ftrace_enabled = 1;
321 tracer_enabled = 1;
333 322
334 /* passed in by parameter to fool gcc from optimizing */ 323 /* passed in by parameter to fool gcc from optimizing */
335 func(); 324 func();
@@ -393,6 +382,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
393 382
394 out: 383 out:
395 ftrace_enabled = save_ftrace_enabled; 384 ftrace_enabled = save_ftrace_enabled;
385 tracer_enabled = save_tracer_enabled;
396 386
397 /* Enable tracing on all functions again */ 387 /* Enable tracing on all functions again */
398 ftrace_set_global_filter(NULL, 0, 1); 388 ftrace_set_global_filter(NULL, 0, 1);
@@ -403,247 +393,10 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
403 393
404 return ret; 394 return ret;
405} 395}
406
407static int trace_selftest_recursion_cnt;
408static void trace_selftest_test_recursion_func(unsigned long ip,
409 unsigned long pip,
410 struct ftrace_ops *op,
411 struct pt_regs *pt_regs)
412{
413 /*
414 * This function is registered without the recursion safe flag.
415 * The ftrace infrastructure should provide the recursion
416 * protection. If not, this will crash the kernel!
417 */
418 trace_selftest_recursion_cnt++;
419 DYN_FTRACE_TEST_NAME();
420}
421
422static void trace_selftest_test_recursion_safe_func(unsigned long ip,
423 unsigned long pip,
424 struct ftrace_ops *op,
425 struct pt_regs *pt_regs)
426{
427 /*
428 * We said we would provide our own recursion protection. By calling
429 * this function again, we should recurse back into this function
430 * and count again. But this only happens if the arch supports
431 * all of the ftrace features and nothing else is using the function
432 * tracing utility.
433 */
434 if (trace_selftest_recursion_cnt++)
435 return;
436 DYN_FTRACE_TEST_NAME();
437}
438
439static struct ftrace_ops test_rec_probe = {
440 .func = trace_selftest_test_recursion_func,
441};
442
443static struct ftrace_ops test_recsafe_probe = {
444 .func = trace_selftest_test_recursion_safe_func,
445 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
446};
447
448static int
449trace_selftest_function_recursion(void)
450{
451 int save_ftrace_enabled = ftrace_enabled;
452 char *func_name;
453 int len;
454 int ret;
455 int cnt;
456
457 /* The previous test PASSED */
458 pr_cont("PASSED\n");
459 pr_info("Testing ftrace recursion: ");
460
461
462 /* enable tracing, and record the filter function */
463 ftrace_enabled = 1;
464
465 /* Handle PPC64 '.' name */
466 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
467 len = strlen(func_name);
468
469 ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1);
470 if (ret) {
471 pr_cont("*Could not set filter* ");
472 goto out;
473 }
474
475 ret = register_ftrace_function(&test_rec_probe);
476 if (ret) {
477 pr_cont("*could not register callback* ");
478 goto out;
479 }
480
481 DYN_FTRACE_TEST_NAME();
482
483 unregister_ftrace_function(&test_rec_probe);
484
485 ret = -1;
486 if (trace_selftest_recursion_cnt != 1) {
487 pr_cont("*callback not called once (%d)* ",
488 trace_selftest_recursion_cnt);
489 goto out;
490 }
491
492 trace_selftest_recursion_cnt = 1;
493
494 pr_cont("PASSED\n");
495 pr_info("Testing ftrace recursion safe: ");
496
497 ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1);
498 if (ret) {
499 pr_cont("*Could not set filter* ");
500 goto out;
501 }
502
503 ret = register_ftrace_function(&test_recsafe_probe);
504 if (ret) {
505 pr_cont("*could not register callback* ");
506 goto out;
507 }
508
509 DYN_FTRACE_TEST_NAME();
510
511 unregister_ftrace_function(&test_recsafe_probe);
512
513 /*
514 * If arch supports all ftrace features, and no other task
515 * was on the list, we should be fine.
516 */
517 if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
518 cnt = 2; /* Should have recursed */
519 else
520 cnt = 1;
521
522 ret = -1;
523 if (trace_selftest_recursion_cnt != cnt) {
524 pr_cont("*callback not called expected %d times (%d)* ",
525 cnt, trace_selftest_recursion_cnt);
526 goto out;
527 }
528
529 ret = 0;
530out:
531 ftrace_enabled = save_ftrace_enabled;
532
533 return ret;
534}
535#else 396#else
536# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) 397# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
537# define trace_selftest_function_recursion() ({ 0; })
538#endif /* CONFIG_DYNAMIC_FTRACE */ 398#endif /* CONFIG_DYNAMIC_FTRACE */
539 399
540static enum {
541 TRACE_SELFTEST_REGS_START,
542 TRACE_SELFTEST_REGS_FOUND,
543 TRACE_SELFTEST_REGS_NOT_FOUND,
544} trace_selftest_regs_stat;
545
546static void trace_selftest_test_regs_func(unsigned long ip,
547 unsigned long pip,
548 struct ftrace_ops *op,
549 struct pt_regs *pt_regs)
550{
551 if (pt_regs)
552 trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND;
553 else
554 trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND;
555}
556
557static struct ftrace_ops test_regs_probe = {
558 .func = trace_selftest_test_regs_func,
559 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS,
560};
561
562static int
563trace_selftest_function_regs(void)
564{
565 int save_ftrace_enabled = ftrace_enabled;
566 char *func_name;
567 int len;
568 int ret;
569 int supported = 0;
570
571#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
572 supported = 1;
573#endif
574
575 /* The previous test PASSED */
576 pr_cont("PASSED\n");
577 pr_info("Testing ftrace regs%s: ",
578 !supported ? "(no arch support)" : "");
579
580 /* enable tracing, and record the filter function */
581 ftrace_enabled = 1;
582
583 /* Handle PPC64 '.' name */
584 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
585 len = strlen(func_name);
586
587 ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1);
588 /*
589 * If DYNAMIC_FTRACE is not set, then we just trace all functions.
590 * This test really doesn't care.
591 */
592 if (ret && ret != -ENODEV) {
593 pr_cont("*Could not set filter* ");
594 goto out;
595 }
596
597 ret = register_ftrace_function(&test_regs_probe);
598 /*
599 * Now if the arch does not support passing regs, then this should
600 * have failed.
601 */
602 if (!supported) {
603 if (!ret) {
604 pr_cont("*registered save-regs without arch support* ");
605 goto out;
606 }
607 test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED;
608 ret = register_ftrace_function(&test_regs_probe);
609 }
610 if (ret) {
611 pr_cont("*could not register callback* ");
612 goto out;
613 }
614
615
616 DYN_FTRACE_TEST_NAME();
617
618 unregister_ftrace_function(&test_regs_probe);
619
620 ret = -1;
621
622 switch (trace_selftest_regs_stat) {
623 case TRACE_SELFTEST_REGS_START:
624 pr_cont("*callback never called* ");
625 goto out;
626
627 case TRACE_SELFTEST_REGS_FOUND:
628 if (supported)
629 break;
630 pr_cont("*callback received regs without arch support* ");
631 goto out;
632
633 case TRACE_SELFTEST_REGS_NOT_FOUND:
634 if (!supported)
635 break;
636 pr_cont("*callback received NULL regs* ");
637 goto out;
638 }
639
640 ret = 0;
641out:
642 ftrace_enabled = save_ftrace_enabled;
643
644 return ret;
645}
646
647/* 400/*
648 * Simple verification test of ftrace function tracer. 401 * Simple verification test of ftrace function tracer.
649 * Enable ftrace, sleep 1/10 second, and then read the trace 402 * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -653,6 +406,7 @@ int
653trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 406trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
654{ 407{
655 int save_ftrace_enabled = ftrace_enabled; 408 int save_ftrace_enabled = ftrace_enabled;
409 int save_tracer_enabled = tracer_enabled;
656 unsigned long count; 410 unsigned long count;
657 int ret; 411 int ret;
658 412
@@ -661,6 +415,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
661 415
662 /* start the tracing */ 416 /* start the tracing */
663 ftrace_enabled = 1; 417 ftrace_enabled = 1;
418 tracer_enabled = 1;
664 419
665 ret = tracer_init(trace, tr); 420 ret = tracer_init(trace, tr);
666 if (ret) { 421 if (ret) {
@@ -687,16 +442,10 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
687 442
688 ret = trace_selftest_startup_dynamic_tracing(trace, tr, 443 ret = trace_selftest_startup_dynamic_tracing(trace, tr,
689 DYN_FTRACE_TEST_NAME); 444 DYN_FTRACE_TEST_NAME);
690 if (ret)
691 goto out;
692
693 ret = trace_selftest_function_recursion();
694 if (ret)
695 goto out;
696 445
697 ret = trace_selftest_function_regs();
698 out: 446 out:
699 ftrace_enabled = save_ftrace_enabled; 447 ftrace_enabled = save_ftrace_enabled;
448 tracer_enabled = save_tracer_enabled;
700 449
701 /* kill ftrace totally if we failed */ 450 /* kill ftrace totally if we failed */
702 if (ret) 451 if (ret)
@@ -1029,8 +778,6 @@ static int trace_wakeup_test_thread(void *data)
1029 set_current_state(TASK_INTERRUPTIBLE); 778 set_current_state(TASK_INTERRUPTIBLE);
1030 schedule(); 779 schedule();
1031 780
1032 complete(x);
1033
1034 /* we are awake, now wait to disappear */ 781 /* we are awake, now wait to disappear */
1035 while (!kthread_should_stop()) { 782 while (!kthread_should_stop()) {
1036 /* 783 /*
@@ -1074,27 +821,29 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1074 /* reset the max latency */ 821 /* reset the max latency */
1075 tracing_max_latency = 0; 822 tracing_max_latency = 0;
1076 823
1077 while (p->on_rq) { 824 /* sleep to let the RT thread sleep too */
1078 /* 825 msleep(100);
1079 * Sleep to make sure the RT thread is asleep too.
1080 * On virtual machines we can't rely on timings,
1081 * but we want to make sure this test still works.
1082 */
1083 msleep(100);
1084 }
1085 826
1086 init_completion(&isrt); 827 /*
828 * Yes, this is slightly racy. It is possible that, for some
829 * strange reason, the RT thread we created did not
830 * call schedule for 100ms after doing the completion,
831 * and we do a wakeup on a task that already is awake.
832 * But that is extremely unlikely, and the worst thing that
833 * happens in such a case, is that we disable tracing.
834 * Honestly, if this race does happen, something is horribly
835 * wrong with the system.
836 */
1087 837
1088 wake_up_process(p); 838 wake_up_process(p);
1089 839
1090 /* Wait for the task to wake up */ 840 /* give a little time to let the thread wake up */
1091 wait_for_completion(&isrt); 841 msleep(100);
1092 842
1093 /* stop the tracing. */ 843 /* stop the tracing. */
1094 tracing_stop(); 844 tracing_stop();
1095 /* check both trace buffers */ 845 /* check both trace buffers */
1096 ret = trace_test_buffer(tr, NULL); 846 ret = trace_test_buffer(tr, NULL);
1097 printk("ret = %d\n", ret);
1098 if (!ret) 847 if (!ret)
1099 ret = trace_test_buffer(&max_tr, &count); 848 ret = trace_test_buffer(&max_tr, &count);
1100 849
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 42ca822fc70..77575b386d9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,9 +13,6 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16
17#include <asm/setup.h>
18
19#include "trace.h" 16#include "trace.h"
20 17
21#define STACK_TRACE_ENTRIES 500 18#define STACK_TRACE_ENTRIES 500
@@ -33,6 +30,7 @@ static unsigned long max_stack_size;
33static arch_spinlock_t max_stack_lock = 30static arch_spinlock_t max_stack_lock =
34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 31 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
35 32
33static int stack_trace_disabled __read_mostly;
36static DEFINE_PER_CPU(int, trace_active); 34static DEFINE_PER_CPU(int, trace_active);
37static DEFINE_MUTEX(stack_sysctl_mutex); 35static DEFINE_MUTEX(stack_sysctl_mutex);
38 36
@@ -110,11 +108,13 @@ static inline void check_stack(void)
110} 108}
111 109
112static void 110static void
113stack_trace_call(unsigned long ip, unsigned long parent_ip, 111stack_trace_call(unsigned long ip, unsigned long parent_ip)
114 struct ftrace_ops *op, struct pt_regs *pt_regs)
115{ 112{
116 int cpu; 113 int cpu;
117 114
115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
116 return;
117
118 preempt_disable_notrace(); 118 preempt_disable_notrace();
119 119
120 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
@@ -133,7 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
134{ 134{
135 .func = stack_trace_call, 135 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 136 .flags = FTRACE_OPS_FL_GLOBAL,
137}; 137};
138 138
139static ssize_t 139static ssize_t
@@ -311,21 +311,6 @@ static const struct file_operations stack_trace_fops = {
311 .release = seq_release, 311 .release = seq_release,
312}; 312};
313 313
314static int
315stack_trace_filter_open(struct inode *inode, struct file *file)
316{
317 return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
318 inode, file);
319}
320
321static const struct file_operations stack_trace_filter_fops = {
322 .open = stack_trace_filter_open,
323 .read = seq_read,
324 .write = ftrace_filter_write,
325 .llseek = ftrace_regex_lseek,
326 .release = ftrace_regex_release,
327};
328
329int 314int
330stack_trace_sysctl(struct ctl_table *table, int write, 315stack_trace_sysctl(struct ctl_table *table, int write,
331 void __user *buffer, size_t *lenp, 316 void __user *buffer, size_t *lenp,
@@ -353,13 +338,8 @@ stack_trace_sysctl(struct ctl_table *table, int write,
353 return ret; 338 return ret;
354} 339}
355 340
356static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
357
358static __init int enable_stacktrace(char *str) 341static __init int enable_stacktrace(char *str)
359{ 342{
360 if (strncmp(str, "_filter=", 8) == 0)
361 strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
362
363 stack_tracer_enabled = 1; 343 stack_tracer_enabled = 1;
364 last_stack_tracer_enabled = 1; 344 last_stack_tracer_enabled = 1;
365 return 1; 345 return 1;
@@ -378,12 +358,6 @@ static __init int stack_trace_init(void)
378 trace_create_file("stack_trace", 0444, d_tracer, 358 trace_create_file("stack_trace", 0444, d_tracer,
379 NULL, &stack_trace_fops); 359 NULL, &stack_trace_fops);
380 360
381 trace_create_file("stack_trace_filter", 0444, d_tracer,
382 NULL, &stack_trace_filter_fops);
383
384 if (stack_trace_filter_buf[0])
385 ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
386
387 if (stack_tracer_enabled) 361 if (stack_tracer_enabled)
388 register_ftrace_function(&trace_ops); 362 register_ftrace_function(&trace_ops);
389 363
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7609dd6714c..ee7b5a0bb9f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,7 +2,6 @@
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
6#include <linux/ftrace.h> 5#include <linux/ftrace.h>
7#include <linux/perf_event.h> 6#include <linux/perf_event.h>
8#include <asm/syscall.h> 7#include <asm/syscall.h>
@@ -17,9 +16,12 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
17static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
18 17
19static int syscall_enter_register(struct ftrace_event_call *event, 18static int syscall_enter_register(struct ftrace_event_call *event,
20 enum trace_reg type, void *data); 19 enum trace_reg type);
21static int syscall_exit_register(struct ftrace_event_call *event, 20static int syscall_exit_register(struct ftrace_event_call *event,
22 enum trace_reg type, void *data); 21 enum trace_reg type);
22
23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call);
23 25
24static struct list_head * 26static struct list_head *
25syscall_get_enter_fields(struct ftrace_event_call *call) 27syscall_get_enter_fields(struct ftrace_event_call *call)
@@ -29,6 +31,30 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
29 return &entry->enter_fields; 31 return &entry->enter_fields;
30} 32}
31 33
34struct trace_event_functions enter_syscall_print_funcs = {
35 .trace = print_syscall_enter,
36};
37
38struct trace_event_functions exit_syscall_print_funcs = {
39 .trace = print_syscall_exit,
40};
41
42struct ftrace_event_class event_class_syscall_enter = {
43 .system = "syscalls",
44 .reg = syscall_enter_register,
45 .define_fields = syscall_enter_define_fields,
46 .get_fields = syscall_get_enter_fields,
47 .raw_init = init_syscall_trace,
48};
49
50struct ftrace_event_class event_class_syscall_exit = {
51 .system = "syscalls",
52 .reg = syscall_exit_register,
53 .define_fields = syscall_exit_define_fields,
54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
55 .raw_init = init_syscall_trace,
56};
57
32extern struct syscall_metadata *__start_syscalls_metadata[]; 58extern struct syscall_metadata *__start_syscalls_metadata[];
33extern struct syscall_metadata *__stop_syscalls_metadata[]; 59extern struct syscall_metadata *__stop_syscalls_metadata[];
34 60
@@ -405,7 +431,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
405 mutex_unlock(&syscall_trace_lock); 431 mutex_unlock(&syscall_trace_lock);
406} 432}
407 433
408static int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
409{ 435{
410 int id; 436 int id;
411 int num; 437 int num;
@@ -430,30 +456,6 @@ static int init_syscall_trace(struct ftrace_event_call *call)
430 return id; 456 return id;
431} 457}
432 458
433struct trace_event_functions enter_syscall_print_funcs = {
434 .trace = print_syscall_enter,
435};
436
437struct trace_event_functions exit_syscall_print_funcs = {
438 .trace = print_syscall_exit,
439};
440
441struct ftrace_event_class event_class_syscall_enter = {
442 .system = "syscalls",
443 .reg = syscall_enter_register,
444 .define_fields = syscall_enter_define_fields,
445 .get_fields = syscall_get_enter_fields,
446 .raw_init = init_syscall_trace,
447};
448
449struct ftrace_event_class event_class_syscall_exit = {
450 .system = "syscalls",
451 .reg = syscall_exit_register,
452 .define_fields = syscall_exit_define_fields,
453 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
454 .raw_init = init_syscall_trace,
455};
456
457unsigned long __init __weak arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
458{ 460{
459 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
@@ -465,8 +467,8 @@ int __init init_ftrace_syscalls(void)
465 unsigned long addr; 467 unsigned long addr;
466 int i; 468 int i;
467 469
468 syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), 470 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
469 GFP_KERNEL); 471 NR_syscalls, GFP_KERNEL);
470 if (!syscalls_metadata) { 472 if (!syscalls_metadata) {
471 WARN_ON(1); 473 WARN_ON(1);
472 return -ENOMEM; 474 return -ENOMEM;
@@ -484,7 +486,7 @@ int __init init_ftrace_syscalls(void)
484 486
485 return 0; 487 return 0;
486} 488}
487early_initcall(init_ftrace_syscalls); 489core_initcall(init_ftrace_syscalls);
488 490
489#ifdef CONFIG_PERF_EVENTS 491#ifdef CONFIG_PERF_EVENTS
490 492
@@ -503,8 +505,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
503 int size; 505 int size;
504 506
505 syscall_nr = syscall_get_nr(current, regs); 507 syscall_nr = syscall_get_nr(current, regs);
506 if (syscall_nr < 0)
507 return;
508 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 508 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
509 return; 509 return;
510 510
@@ -531,10 +531,10 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
531 (unsigned long *)&rec->args); 531 (unsigned long *)&rec->args);
532 532
533 head = this_cpu_ptr(sys_data->enter_event->perf_events); 533 head = this_cpu_ptr(sys_data->enter_event->perf_events);
534 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 534 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
535} 535}
536 536
537static int perf_sysenter_enable(struct ftrace_event_call *call) 537int perf_sysenter_enable(struct ftrace_event_call *call)
538{ 538{
539 int ret = 0; 539 int ret = 0;
540 int num; 540 int num;
@@ -555,7 +555,7 @@ static int perf_sysenter_enable(struct ftrace_event_call *call)
555 return ret; 555 return ret;
556} 556}
557 557
558static void perf_sysenter_disable(struct ftrace_event_call *call) 558void perf_sysenter_disable(struct ftrace_event_call *call)
559{ 559{
560 int num; 560 int num;
561 561
@@ -579,8 +579,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
579 int size; 579 int size;
580 580
581 syscall_nr = syscall_get_nr(current, regs); 581 syscall_nr = syscall_get_nr(current, regs);
582 if (syscall_nr < 0)
583 return;
584 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 582 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
585 return; 583 return;
586 584
@@ -609,10 +607,10 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
609 rec->ret = syscall_get_return_value(current, regs); 607 rec->ret = syscall_get_return_value(current, regs);
610 608
611 head = this_cpu_ptr(sys_data->exit_event->perf_events); 609 head = this_cpu_ptr(sys_data->exit_event->perf_events);
612 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 610 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
613} 611}
614 612
615static int perf_sysexit_enable(struct ftrace_event_call *call) 613int perf_sysexit_enable(struct ftrace_event_call *call)
616{ 614{
617 int ret = 0; 615 int ret = 0;
618 int num; 616 int num;
@@ -633,7 +631,7 @@ static int perf_sysexit_enable(struct ftrace_event_call *call)
633 return ret; 631 return ret;
634} 632}
635 633
636static void perf_sysexit_disable(struct ftrace_event_call *call) 634void perf_sysexit_disable(struct ftrace_event_call *call)
637{ 635{
638 int num; 636 int num;
639 637
@@ -650,7 +648,7 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
650#endif /* CONFIG_PERF_EVENTS */ 648#endif /* CONFIG_PERF_EVENTS */
651 649
652static int syscall_enter_register(struct ftrace_event_call *event, 650static int syscall_enter_register(struct ftrace_event_call *event,
653 enum trace_reg type, void *data) 651 enum trace_reg type)
654{ 652{
655 switch (type) { 653 switch (type) {
656 case TRACE_REG_REGISTER: 654 case TRACE_REG_REGISTER:
@@ -665,18 +663,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,
665 case TRACE_REG_PERF_UNREGISTER: 663 case TRACE_REG_PERF_UNREGISTER:
666 perf_sysenter_disable(event); 664 perf_sysenter_disable(event);
667 return 0; 665 return 0;
668 case TRACE_REG_PERF_OPEN:
669 case TRACE_REG_PERF_CLOSE:
670 case TRACE_REG_PERF_ADD:
671 case TRACE_REG_PERF_DEL:
672 return 0;
673#endif 666#endif
674 } 667 }
675 return 0; 668 return 0;
676} 669}
677 670
678static int syscall_exit_register(struct ftrace_event_call *event, 671static int syscall_exit_register(struct ftrace_event_call *event,
679 enum trace_reg type, void *data) 672 enum trace_reg type)
680{ 673{
681 switch (type) { 674 switch (type) {
682 case TRACE_REG_REGISTER: 675 case TRACE_REG_REGISTER:
@@ -691,11 +684,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
691 case TRACE_REG_PERF_UNREGISTER: 684 case TRACE_REG_PERF_UNREGISTER:
692 perf_sysexit_disable(event); 685 perf_sysexit_disable(event);
693 return 0; 686 return 0;
694 case TRACE_REG_PERF_OPEN:
695 case TRACE_REG_PERF_CLOSE:
696 case TRACE_REG_PERF_ADD:
697 case TRACE_REG_PERF_DEL:
698 return 0;
699#endif 687#endif
700 } 688 }
701 return 0; 689 return 0;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
deleted file mode 100644
index c86e6d4f67f..00000000000
--- a/kernel/trace/trace_uprobe.c
+++ /dev/null
@@ -1,788 +0,0 @@
1/*
2 * uprobes-based tracing events
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * Copyright (C) IBM Corporation, 2010-2012
18 * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
19 */
20
21#include <linux/module.h>
22#include <linux/uaccess.h>
23#include <linux/uprobes.h>
24#include <linux/namei.h>
25#include <linux/string.h>
26
27#include "trace_probe.h"
28
29#define UPROBE_EVENT_SYSTEM "uprobes"
30
31/*
32 * uprobe event core functions
33 */
34struct trace_uprobe;
35struct uprobe_trace_consumer {
36 struct uprobe_consumer cons;
37 struct trace_uprobe *tu;
38};
39
40struct trace_uprobe {
41 struct list_head list;
42 struct ftrace_event_class class;
43 struct ftrace_event_call call;
44 struct uprobe_trace_consumer *consumer;
45 struct inode *inode;
46 char *filename;
47 unsigned long offset;
48 unsigned long nhit;
49 unsigned int flags; /* For TP_FLAG_* */
50 ssize_t size; /* trace entry size */
51 unsigned int nr_args;
52 struct probe_arg args[];
53};
54
55#define SIZEOF_TRACE_UPROBE(n) \
56 (offsetof(struct trace_uprobe, args) + \
57 (sizeof(struct probe_arg) * (n)))
58
59static int register_uprobe_event(struct trace_uprobe *tu);
60static void unregister_uprobe_event(struct trace_uprobe *tu);
61
62static DEFINE_MUTEX(uprobe_lock);
63static LIST_HEAD(uprobe_list);
64
65static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
66
67/*
68 * Allocate new trace_uprobe and initialize it (including uprobes).
69 */
70static struct trace_uprobe *
71alloc_trace_uprobe(const char *group, const char *event, int nargs)
72{
73 struct trace_uprobe *tu;
74
75 if (!event || !is_good_name(event))
76 return ERR_PTR(-EINVAL);
77
78 if (!group || !is_good_name(group))
79 return ERR_PTR(-EINVAL);
80
81 tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
82 if (!tu)
83 return ERR_PTR(-ENOMEM);
84
85 tu->call.class = &tu->class;
86 tu->call.name = kstrdup(event, GFP_KERNEL);
87 if (!tu->call.name)
88 goto error;
89
90 tu->class.system = kstrdup(group, GFP_KERNEL);
91 if (!tu->class.system)
92 goto error;
93
94 INIT_LIST_HEAD(&tu->list);
95 return tu;
96
97error:
98 kfree(tu->call.name);
99 kfree(tu);
100
101 return ERR_PTR(-ENOMEM);
102}
103
104static void free_trace_uprobe(struct trace_uprobe *tu)
105{
106 int i;
107
108 for (i = 0; i < tu->nr_args; i++)
109 traceprobe_free_probe_arg(&tu->args[i]);
110
111 iput(tu->inode);
112 kfree(tu->call.class->system);
113 kfree(tu->call.name);
114 kfree(tu->filename);
115 kfree(tu);
116}
117
118static struct trace_uprobe *find_probe_event(const char *event, const char *group)
119{
120 struct trace_uprobe *tu;
121
122 list_for_each_entry(tu, &uprobe_list, list)
123 if (strcmp(tu->call.name, event) == 0 &&
124 strcmp(tu->call.class->system, group) == 0)
125 return tu;
126
127 return NULL;
128}
129
130/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
131static void unregister_trace_uprobe(struct trace_uprobe *tu)
132{
133 list_del(&tu->list);
134 unregister_uprobe_event(tu);
135 free_trace_uprobe(tu);
136}
137
138/* Register a trace_uprobe and probe_event */
139static int register_trace_uprobe(struct trace_uprobe *tu)
140{
141 struct trace_uprobe *old_tp;
142 int ret;
143
144 mutex_lock(&uprobe_lock);
145
146 /* register as an event */
147 old_tp = find_probe_event(tu->call.name, tu->call.class->system);
148 if (old_tp)
149 /* delete old event */
150 unregister_trace_uprobe(old_tp);
151
152 ret = register_uprobe_event(tu);
153 if (ret) {
154 pr_warning("Failed to register probe event(%d)\n", ret);
155 goto end;
156 }
157
158 list_add_tail(&tu->list, &uprobe_list);
159
160end:
161 mutex_unlock(&uprobe_lock);
162
163 return ret;
164}
165
166/*
167 * Argument syntax:
168 * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]
169 *
170 * - Remove uprobe: -:[GRP/]EVENT
171 */
172static int create_trace_uprobe(int argc, char **argv)
173{
174 struct trace_uprobe *tu;
175 struct inode *inode;
176 char *arg, *event, *group, *filename;
177 char buf[MAX_EVENT_NAME_LEN];
178 struct path path;
179 unsigned long offset;
180 bool is_delete;
181 int i, ret;
182
183 inode = NULL;
184 ret = 0;
185 is_delete = false;
186 event = NULL;
187 group = NULL;
188
189 /* argc must be >= 1 */
190 if (argv[0][0] == '-')
191 is_delete = true;
192 else if (argv[0][0] != 'p') {
193 pr_info("Probe definition must be started with 'p' or '-'.\n");
194 return -EINVAL;
195 }
196
197 if (argv[0][1] == ':') {
198 event = &argv[0][2];
199 arg = strchr(event, '/');
200
201 if (arg) {
202 group = event;
203 event = arg + 1;
204 event[-1] = '\0';
205
206 if (strlen(group) == 0) {
207 pr_info("Group name is not specified\n");
208 return -EINVAL;
209 }
210 }
211 if (strlen(event) == 0) {
212 pr_info("Event name is not specified\n");
213 return -EINVAL;
214 }
215 }
216 if (!group)
217 group = UPROBE_EVENT_SYSTEM;
218
219 if (is_delete) {
220 if (!event) {
221 pr_info("Delete command needs an event name.\n");
222 return -EINVAL;
223 }
224 mutex_lock(&uprobe_lock);
225 tu = find_probe_event(event, group);
226
227 if (!tu) {
228 mutex_unlock(&uprobe_lock);
229 pr_info("Event %s/%s doesn't exist.\n", group, event);
230 return -ENOENT;
231 }
232 /* delete an event */
233 unregister_trace_uprobe(tu);
234 mutex_unlock(&uprobe_lock);
235 return 0;
236 }
237
238 if (argc < 2) {
239 pr_info("Probe point is not specified.\n");
240 return -EINVAL;
241 }
242 if (isdigit(argv[1][0])) {
243 pr_info("probe point must be have a filename.\n");
244 return -EINVAL;
245 }
246 arg = strchr(argv[1], ':');
247 if (!arg)
248 goto fail_address_parse;
249
250 *arg++ = '\0';
251 filename = argv[1];
252 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
253 if (ret)
254 goto fail_address_parse;
255
256 ret = kstrtoul(arg, 0, &offset);
257 if (ret)
258 goto fail_address_parse;
259
260 inode = igrab(path.dentry->d_inode);
261
262 argc -= 2;
263 argv += 2;
264
265 /* setup a probe */
266 if (!event) {
267 char *tail;
268 char *ptr;
269
270 tail = kstrdup(kbasename(filename), GFP_KERNEL);
271 if (!tail) {
272 ret = -ENOMEM;
273 goto fail_address_parse;
274 }
275
276 ptr = strpbrk(tail, ".-_");
277 if (ptr)
278 *ptr = '\0';
279
280 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
281 event = buf;
282 kfree(tail);
283 }
284
285 tu = alloc_trace_uprobe(group, event, argc);
286 if (IS_ERR(tu)) {
287 pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
288 ret = PTR_ERR(tu);
289 goto fail_address_parse;
290 }
291 tu->offset = offset;
292 tu->inode = inode;
293 tu->filename = kstrdup(filename, GFP_KERNEL);
294
295 if (!tu->filename) {
296 pr_info("Failed to allocate filename.\n");
297 ret = -ENOMEM;
298 goto error;
299 }
300
301 /* parse arguments */
302 ret = 0;
303 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
304 /* Increment count for freeing args in error case */
305 tu->nr_args++;
306
307 /* Parse argument name */
308 arg = strchr(argv[i], '=');
309 if (arg) {
310 *arg++ = '\0';
311 tu->args[i].name = kstrdup(argv[i], GFP_KERNEL);
312 } else {
313 arg = argv[i];
314 /* If argument name is omitted, set "argN" */
315 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
316 tu->args[i].name = kstrdup(buf, GFP_KERNEL);
317 }
318
319 if (!tu->args[i].name) {
320 pr_info("Failed to allocate argument[%d] name.\n", i);
321 ret = -ENOMEM;
322 goto error;
323 }
324
325 if (!is_good_name(tu->args[i].name)) {
326 pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name);
327 ret = -EINVAL;
328 goto error;
329 }
330
331 if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) {
332 pr_info("Argument[%d] name '%s' conflicts with "
333 "another field.\n", i, argv[i]);
334 ret = -EINVAL;
335 goto error;
336 }
337
338 /* Parse fetch argument */
339 ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false);
340 if (ret) {
341 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
342 goto error;
343 }
344 }
345
346 ret = register_trace_uprobe(tu);
347 if (ret)
348 goto error;
349 return 0;
350
351error:
352 free_trace_uprobe(tu);
353 return ret;
354
355fail_address_parse:
356 if (inode)
357 iput(inode);
358
359 pr_info("Failed to parse address.\n");
360
361 return ret;
362}
363
364static void cleanup_all_probes(void)
365{
366 struct trace_uprobe *tu;
367
368 mutex_lock(&uprobe_lock);
369 while (!list_empty(&uprobe_list)) {
370 tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
371 unregister_trace_uprobe(tu);
372 }
373 mutex_unlock(&uprobe_lock);
374}
375
376/* Probes listing interfaces */
377static void *probes_seq_start(struct seq_file *m, loff_t *pos)
378{
379 mutex_lock(&uprobe_lock);
380 return seq_list_start(&uprobe_list, *pos);
381}
382
383static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
384{
385 return seq_list_next(v, &uprobe_list, pos);
386}
387
388static void probes_seq_stop(struct seq_file *m, void *v)
389{
390 mutex_unlock(&uprobe_lock);
391}
392
393static int probes_seq_show(struct seq_file *m, void *v)
394{
395 struct trace_uprobe *tu = v;
396 int i;
397
398 seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name);
399 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
400
401 for (i = 0; i < tu->nr_args; i++)
402 seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm);
403
404 seq_printf(m, "\n");
405 return 0;
406}
407
408static const struct seq_operations probes_seq_op = {
409 .start = probes_seq_start,
410 .next = probes_seq_next,
411 .stop = probes_seq_stop,
412 .show = probes_seq_show
413};
414
415static int probes_open(struct inode *inode, struct file *file)
416{
417 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
418 cleanup_all_probes();
419
420 return seq_open(file, &probes_seq_op);
421}
422
423static ssize_t probes_write(struct file *file, const char __user *buffer,
424 size_t count, loff_t *ppos)
425{
426 return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
427}
428
429static const struct file_operations uprobe_events_ops = {
430 .owner = THIS_MODULE,
431 .open = probes_open,
432 .read = seq_read,
433 .llseek = seq_lseek,
434 .release = seq_release,
435 .write = probes_write,
436};
437
438/* Probes profiling interfaces */
439static int probes_profile_seq_show(struct seq_file *m, void *v)
440{
441 struct trace_uprobe *tu = v;
442
443 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit);
444 return 0;
445}
446
447static const struct seq_operations profile_seq_op = {
448 .start = probes_seq_start,
449 .next = probes_seq_next,
450 .stop = probes_seq_stop,
451 .show = probes_profile_seq_show
452};
453
454static int profile_open(struct inode *inode, struct file *file)
455{
456 return seq_open(file, &profile_seq_op);
457}
458
459static const struct file_operations uprobe_profile_ops = {
460 .owner = THIS_MODULE,
461 .open = profile_open,
462 .read = seq_read,
463 .llseek = seq_lseek,
464 .release = seq_release,
465};
466
467/* uprobe handler */
468static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
469{
470 struct uprobe_trace_entry_head *entry;
471 struct ring_buffer_event *event;
472 struct ring_buffer *buffer;
473 u8 *data;
474 int size, i, pc;
475 unsigned long irq_flags;
476 struct ftrace_event_call *call = &tu->call;
477
478 tu->nhit++;
479
480 local_save_flags(irq_flags);
481 pc = preempt_count();
482
483 size = sizeof(*entry) + tu->size;
484
485 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
486 size, irq_flags, pc);
487 if (!event)
488 return;
489
490 entry = ring_buffer_event_data(event);
491 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
492 data = (u8 *)&entry[1];
493 for (i = 0; i < tu->nr_args; i++)
494 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
495
496 if (!filter_current_check_discard(buffer, call, entry, event))
497 trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
498}
499
500/* Event entry printers */
501static enum print_line_t
502print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
503{
504 struct uprobe_trace_entry_head *field;
505 struct trace_seq *s = &iter->seq;
506 struct trace_uprobe *tu;
507 u8 *data;
508 int i;
509
510 field = (struct uprobe_trace_entry_head *)iter->ent;
511 tu = container_of(event, struct trace_uprobe, call.event);
512
513 if (!trace_seq_printf(s, "%s: (", tu->call.name))
514 goto partial;
515
516 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
517 goto partial;
518
519 if (!trace_seq_puts(s, ")"))
520 goto partial;
521
522 data = (u8 *)&field[1];
523 for (i = 0; i < tu->nr_args; i++) {
524 if (!tu->args[i].type->print(s, tu->args[i].name,
525 data + tu->args[i].offset, field))
526 goto partial;
527 }
528
529 if (trace_seq_puts(s, "\n"))
530 return TRACE_TYPE_HANDLED;
531
532partial:
533 return TRACE_TYPE_PARTIAL_LINE;
534}
535
536static int probe_event_enable(struct trace_uprobe *tu, int flag)
537{
538 struct uprobe_trace_consumer *utc;
539 int ret = 0;
540
541 if (!tu->inode || tu->consumer)
542 return -EINTR;
543
544 utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
545 if (!utc)
546 return -EINTR;
547
548 utc->cons.handler = uprobe_dispatcher;
549 utc->cons.filter = NULL;
550 ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
551 if (ret) {
552 kfree(utc);
553 return ret;
554 }
555
556 tu->flags |= flag;
557 utc->tu = tu;
558 tu->consumer = utc;
559
560 return 0;
561}
562
563static void probe_event_disable(struct trace_uprobe *tu, int flag)
564{
565 if (!tu->inode || !tu->consumer)
566 return;
567
568 uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
569 tu->flags &= ~flag;
570 kfree(tu->consumer);
571 tu->consumer = NULL;
572}
573
574static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
575{
576 int ret, i;
577 struct uprobe_trace_entry_head field;
578 struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data;
579
580 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
581 /* Set argument names as fields */
582 for (i = 0; i < tu->nr_args; i++) {
583 ret = trace_define_field(event_call, tu->args[i].type->fmttype,
584 tu->args[i].name,
585 sizeof(field) + tu->args[i].offset,
586 tu->args[i].type->size,
587 tu->args[i].type->is_signed,
588 FILTER_OTHER);
589
590 if (ret)
591 return ret;
592 }
593 return 0;
594}
595
596#define LEN_OR_ZERO (len ? len - pos : 0)
597static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
598{
599 const char *fmt, *arg;
600 int i;
601 int pos = 0;
602
603 fmt = "(%lx)";
604 arg = "REC->" FIELD_STRING_IP;
605
606 /* When len=0, we just calculate the needed length */
607
608 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
609
610 for (i = 0; i < tu->nr_args; i++) {
611 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
612 tu->args[i].name, tu->args[i].type->fmt);
613 }
614
615 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
616
617 for (i = 0; i < tu->nr_args; i++) {
618 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
619 tu->args[i].name);
620 }
621
622 return pos; /* return the length of print_fmt */
623}
624#undef LEN_OR_ZERO
625
626static int set_print_fmt(struct trace_uprobe *tu)
627{
628 char *print_fmt;
629 int len;
630
631 /* First: called with 0 length to calculate the needed length */
632 len = __set_print_fmt(tu, NULL, 0);
633 print_fmt = kmalloc(len + 1, GFP_KERNEL);
634 if (!print_fmt)
635 return -ENOMEM;
636
637 /* Second: actually write the @print_fmt */
638 __set_print_fmt(tu, print_fmt, len + 1);
639 tu->call.print_fmt = print_fmt;
640
641 return 0;
642}
643
644#ifdef CONFIG_PERF_EVENTS
645/* uprobe profile handler */
646static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
647{
648 struct ftrace_event_call *call = &tu->call;
649 struct uprobe_trace_entry_head *entry;
650 struct hlist_head *head;
651 u8 *data;
652 int size, __size, i;
653 int rctx;
654
655 __size = sizeof(*entry) + tu->size;
656 size = ALIGN(__size + sizeof(u32), sizeof(u64));
657 size -= sizeof(u32);
658 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
659 return;
660
661 preempt_disable();
662
663 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
664 if (!entry)
665 goto out;
666
667 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
668 data = (u8 *)&entry[1];
669 for (i = 0; i < tu->nr_args; i++)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
671
672 head = this_cpu_ptr(call->perf_events);
673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
674
675 out:
676 preempt_enable();
677}
678#endif /* CONFIG_PERF_EVENTS */
679
680static
681int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
682{
683 struct trace_uprobe *tu = (struct trace_uprobe *)event->data;
684
685 switch (type) {
686 case TRACE_REG_REGISTER:
687 return probe_event_enable(tu, TP_FLAG_TRACE);
688
689 case TRACE_REG_UNREGISTER:
690 probe_event_disable(tu, TP_FLAG_TRACE);
691 return 0;
692
693#ifdef CONFIG_PERF_EVENTS
694 case TRACE_REG_PERF_REGISTER:
695 return probe_event_enable(tu, TP_FLAG_PROFILE);
696
697 case TRACE_REG_PERF_UNREGISTER:
698 probe_event_disable(tu, TP_FLAG_PROFILE);
699 return 0;
700#endif
701 default:
702 return 0;
703 }
704 return 0;
705}
706
707static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
708{
709 struct uprobe_trace_consumer *utc;
710 struct trace_uprobe *tu;
711
712 utc = container_of(con, struct uprobe_trace_consumer, cons);
713 tu = utc->tu;
714 if (!tu || tu->consumer != utc)
715 return 0;
716
717 if (tu->flags & TP_FLAG_TRACE)
718 uprobe_trace_func(tu, regs);
719
720#ifdef CONFIG_PERF_EVENTS
721 if (tu->flags & TP_FLAG_PROFILE)
722 uprobe_perf_func(tu, regs);
723#endif
724 return 0;
725}
726
727static struct trace_event_functions uprobe_funcs = {
728 .trace = print_uprobe_event
729};
730
731static int register_uprobe_event(struct trace_uprobe *tu)
732{
733 struct ftrace_event_call *call = &tu->call;
734 int ret;
735
736 /* Initialize ftrace_event_call */
737 INIT_LIST_HEAD(&call->class->fields);
738 call->event.funcs = &uprobe_funcs;
739 call->class->define_fields = uprobe_event_define_fields;
740
741 if (set_print_fmt(tu) < 0)
742 return -ENOMEM;
743
744 ret = register_ftrace_event(&call->event);
745 if (!ret) {
746 kfree(call->print_fmt);
747 return -ENODEV;
748 }
749 call->flags = 0;
750 call->class->reg = trace_uprobe_register;
751 call->data = tu;
752 ret = trace_add_event_call(call);
753
754 if (ret) {
755 pr_info("Failed to register uprobe event: %s\n", call->name);
756 kfree(call->print_fmt);
757 unregister_ftrace_event(&call->event);
758 }
759
760 return ret;
761}
762
763static void unregister_uprobe_event(struct trace_uprobe *tu)
764{
765 /* tu->event is unregistered in trace_remove_event_call() */
766 trace_remove_event_call(&tu->call);
767 kfree(tu->call.print_fmt);
768 tu->call.print_fmt = NULL;
769}
770
771/* Make a trace interface for controlling probe points */
772static __init int init_uprobe_trace(void)
773{
774 struct dentry *d_tracer;
775
776 d_tracer = tracing_init_dentry();
777 if (!d_tracer)
778 return 0;
779
780 trace_create_file("uprobe_events", 0644, d_tracer,
781 NULL, &uprobe_events_ops);
782 /* Profile interface */
783 trace_create_file("uprobe_profile", 0444, d_tracer,
784 NULL, &uprobe_profile_ops);
785 return 0;
786}
787
788fs_initcall(init_uprobe_trace);
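The deleted file above backed the debugfs "uprobe_events" control file, whose accepted syntax is documented near its top ("p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]" to add, "-:[GRP/]EVENT" to remove); in practice create_trace_uprobe() parses the text after the colon as a numeric offset. As a rough user-space sketch of driving that interface, not taken from this patch: the binary path, offset, group and event names below are invented, and the debugfs mount point may differ on a given system.

/* Hypothetical sketch of writing to the control file removed above. */
#include <stdio.h>

int main(void)
{
        const char *ctl = "/sys/kernel/debug/tracing/uprobe_events";
        FILE *f;

        /* Append rather than truncate: probes_open() above clears all
         * existing probes when the file is opened with O_TRUNC. */
        f = fopen(ctl, "a");
        if (!f) {
                perror("uprobe_events");
                return 1;
        }
        fprintf(f, "p:myprobes/readline /bin/bash:0x8f2c0\n"); /* add a probe */
        fclose(f);

        f = fopen(ctl, "a");
        if (!f) {
                perror("uprobe_events");
                return 1;
        }
        fprintf(f, "-:myprobes/readline\n"); /* remove it again */
        fclose(f);
        return 0;
}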
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d96ba22dabf..b219f1449c5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,7 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/static_key.h> 28#include <linux/jump_label.h>
29 29
30extern struct tracepoint * const __start___tracepoints_ptrs[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
31extern struct tracepoint * const __stop___tracepoints_ptrs[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
@@ -34,16 +34,11 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
34static const int tracepoint_debug; 34static const int tracepoint_debug;
35 35
36/* 36/*
37 * Tracepoints mutex protects the builtin and module tracepoints and the hash 37 * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the
38 * table, as well as the local module list. 38 * builtin and module tracepoints and the hash table.
39 */ 39 */
40static DEFINE_MUTEX(tracepoints_mutex); 40static DEFINE_MUTEX(tracepoints_mutex);
41 41
42#ifdef CONFIG_MODULES
43/* Local list of struct module */
44static LIST_HEAD(tracepoint_module_list);
45#endif /* CONFIG_MODULES */
46
47/* 42/*
48 * Tracepoint hash table, containing the active tracepoints. 43 * Tracepoint hash table, containing the active tracepoints.
49 * Protected by tracepoints_mutex. 44 * Protected by tracepoints_mutex.
@@ -256,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
256{ 251{
257 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 252 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
258 253
259 if (elem->regfunc && !static_key_enabled(&elem->key) && active) 254 if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
260 elem->regfunc(); 255 elem->regfunc();
261 else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) 256 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
262 elem->unregfunc(); 257 elem->unregfunc();
263 258
264 /* 259 /*
@@ -269,10 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
269 * is used. 264 * is used.
270 */ 265 */
271 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
272 if (active && !static_key_enabled(&elem->key)) 267 if (active && !jump_label_enabled(&elem->key))
273 static_key_slow_inc(&elem->key); 268 jump_label_inc(&elem->key);
274 else if (!active && static_key_enabled(&elem->key)) 269 else if (!active && jump_label_enabled(&elem->key))
275 static_key_slow_dec(&elem->key); 270 jump_label_dec(&elem->key);
276} 271}
277 272
278/* 273/*
@@ -283,11 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
283 */ 278 */
284static void disable_tracepoint(struct tracepoint *elem) 279static void disable_tracepoint(struct tracepoint *elem)
285{ 280{
286 if (elem->unregfunc && static_key_enabled(&elem->key)) 281 if (elem->unregfunc && jump_label_enabled(&elem->key))
287 elem->unregfunc(); 282 elem->unregfunc();
288 283
289 if (static_key_enabled(&elem->key)) 284 if (jump_label_enabled(&elem->key))
290 static_key_slow_dec(&elem->key); 285 jump_label_dec(&elem->key);
291 rcu_assign_pointer(elem->funcs, NULL); 286 rcu_assign_pointer(elem->funcs, NULL);
292} 287}
293 288
@@ -297,10 +292,9 @@ static void disable_tracepoint(struct tracepoint *elem)
297 * @end: end of the range 292 * @end: end of the range
298 * 293 *
299 * Updates the probe callback corresponding to a range of tracepoints. 294 * Updates the probe callback corresponding to a range of tracepoints.
300 * Called with tracepoints_mutex held.
301 */ 295 */
302static void tracepoint_update_probe_range(struct tracepoint * const *begin, 296void tracepoint_update_probe_range(struct tracepoint * const *begin,
303 struct tracepoint * const *end) 297 struct tracepoint * const *end)
304{ 298{
305 struct tracepoint * const *iter; 299 struct tracepoint * const *iter;
306 struct tracepoint_entry *mark_entry; 300 struct tracepoint_entry *mark_entry;
@@ -308,6 +302,7 @@ static void tracepoint_update_probe_range(struct tracepoint * const *begin,
308 if (!begin) 302 if (!begin)
309 return; 303 return;
310 304
305 mutex_lock(&tracepoints_mutex);
311 for (iter = begin; iter < end; iter++) { 306 for (iter = begin; iter < end; iter++) {
312 mark_entry = get_tracepoint((*iter)->name); 307 mark_entry = get_tracepoint((*iter)->name);
313 if (mark_entry) { 308 if (mark_entry) {
@@ -317,27 +312,11 @@ static void tracepoint_update_probe_range(struct tracepoint * const *begin,
317 disable_tracepoint(*iter); 312 disable_tracepoint(*iter);
318 } 313 }
319 } 314 }
315 mutex_unlock(&tracepoints_mutex);
320} 316}
321 317
322#ifdef CONFIG_MODULES
323void module_update_tracepoints(void)
324{
325 struct tp_module *tp_mod;
326
327 list_for_each_entry(tp_mod, &tracepoint_module_list, list)
328 tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
329 tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
330}
331#else /* CONFIG_MODULES */
332void module_update_tracepoints(void)
333{
334}
335#endif /* CONFIG_MODULES */
336
337
338/* 318/*
339 * Update probes, removing the faulty probes. 319 * Update probes, removing the faulty probes.
340 * Called with tracepoints_mutex held.
341 */ 320 */
342static void tracepoint_update_probes(void) 321static void tracepoint_update_probes(void)
343{ 322{
@@ -380,12 +359,11 @@ int tracepoint_probe_register(const char *name, void *probe, void *data)
380 359
381 mutex_lock(&tracepoints_mutex); 360 mutex_lock(&tracepoints_mutex);
382 old = tracepoint_add_probe(name, probe, data); 361 old = tracepoint_add_probe(name, probe, data);
383 if (IS_ERR(old)) { 362 mutex_unlock(&tracepoints_mutex);
384 mutex_unlock(&tracepoints_mutex); 363 if (IS_ERR(old))
385 return PTR_ERR(old); 364 return PTR_ERR(old);
386 } 365
387 tracepoint_update_probes(); /* may update entry */ 366 tracepoint_update_probes(); /* may update entry */
388 mutex_unlock(&tracepoints_mutex);
389 release_probes(old); 367 release_probes(old);
390 return 0; 368 return 0;
391} 369}
@@ -424,12 +402,11 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data)
424 402
425 mutex_lock(&tracepoints_mutex); 403 mutex_lock(&tracepoints_mutex);
426 old = tracepoint_remove_probe(name, probe, data); 404 old = tracepoint_remove_probe(name, probe, data);
427 if (IS_ERR(old)) { 405 mutex_unlock(&tracepoints_mutex);
428 mutex_unlock(&tracepoints_mutex); 406 if (IS_ERR(old))
429 return PTR_ERR(old); 407 return PTR_ERR(old);
430 } 408
431 tracepoint_update_probes(); /* may update entry */ 409 tracepoint_update_probes(); /* may update entry */
432 mutex_unlock(&tracepoints_mutex);
433 release_probes(old); 410 release_probes(old);
434 return 0; 411 return 0;
435} 412}
@@ -512,8 +489,9 @@ void tracepoint_probe_update_all(void)
512 if (!list_empty(&old_probes)) 489 if (!list_empty(&old_probes))
513 list_replace_init(&old_probes, &release_probes); 490 list_replace_init(&old_probes, &release_probes);
514 need_update = 0; 491 need_update = 0;
515 tracepoint_update_probes();
516 mutex_unlock(&tracepoints_mutex); 492 mutex_unlock(&tracepoints_mutex);
493
494 tracepoint_update_probes();
517 list_for_each_entry_safe(pos, next, &release_probes, u.list) { 495 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
518 list_del(&pos->u.list); 496 list_del(&pos->u.list);
519 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); 497 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
@@ -531,7 +509,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
531 * Will return the first tracepoint in the range if the input tracepoint is 509 * Will return the first tracepoint in the range if the input tracepoint is
532 * NULL. 510 * NULL.
533 */ 511 */
534static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, 512int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
535 struct tracepoint * const *begin, struct tracepoint * const *end) 513 struct tracepoint * const *begin, struct tracepoint * const *end)
536{ 514{
537 if (!*tracepoint && begin != end) { 515 if (!*tracepoint && begin != end) {
@@ -542,12 +520,11 @@ static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
542 return 1; 520 return 1;
543 return 0; 521 return 0;
544} 522}
523EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
545 524
546#ifdef CONFIG_MODULES
547static void tracepoint_get_iter(struct tracepoint_iter *iter) 525static void tracepoint_get_iter(struct tracepoint_iter *iter)
548{ 526{
549 int found = 0; 527 int found = 0;
550 struct tp_module *iter_mod;
551 528
552 /* Core kernel tracepoints */ 529 /* Core kernel tracepoints */
553 if (!iter->module) { 530 if (!iter->module) {
@@ -557,43 +534,12 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
557 if (found) 534 if (found)
558 goto end; 535 goto end;
559 } 536 }
560 /* Tracepoints in modules */ 537 /* tracepoints in modules. */
561 mutex_lock(&tracepoints_mutex); 538 found = module_get_iter_tracepoints(iter);
562 list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
563 /*
564 * Sorted module list
565 */
566 if (iter_mod < iter->module)
567 continue;
568 else if (iter_mod > iter->module)
569 iter->tracepoint = NULL;
570 found = tracepoint_get_iter_range(&iter->tracepoint,
571 iter_mod->tracepoints_ptrs,
572 iter_mod->tracepoints_ptrs
573 + iter_mod->num_tracepoints);
574 if (found) {
575 iter->module = iter_mod;
576 break;
577 }
578 }
579 mutex_unlock(&tracepoints_mutex);
580end: 539end:
581 if (!found) 540 if (!found)
582 tracepoint_iter_reset(iter); 541 tracepoint_iter_reset(iter);
583} 542}
584#else /* CONFIG_MODULES */
585static void tracepoint_get_iter(struct tracepoint_iter *iter)
586{
587 int found = 0;
588
589 /* Core kernel tracepoints */
590 found = tracepoint_get_iter_range(&iter->tracepoint,
591 __start___tracepoints_ptrs,
592 __stop___tracepoints_ptrs);
593 if (!found)
594 tracepoint_iter_reset(iter);
595}
596#endif /* CONFIG_MODULES */
597 543
598void tracepoint_iter_start(struct tracepoint_iter *iter) 544void tracepoint_iter_start(struct tracepoint_iter *iter)
599{ 545{
@@ -620,99 +566,26 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
620 566
621void tracepoint_iter_reset(struct tracepoint_iter *iter) 567void tracepoint_iter_reset(struct tracepoint_iter *iter)
622{ 568{
623#ifdef CONFIG_MODULES
624 iter->module = NULL; 569 iter->module = NULL;
625#endif /* CONFIG_MODULES */
626 iter->tracepoint = NULL; 570 iter->tracepoint = NULL;
627} 571}
628EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 572EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
629 573
630#ifdef CONFIG_MODULES 574#ifdef CONFIG_MODULES
631static int tracepoint_module_coming(struct module *mod)
632{
633 struct tp_module *tp_mod, *iter;
634 int ret = 0;
635
636 /*
637 * We skip modules that taint the kernel, especially those with different
638 * module headers (for forced load), to make sure we don't cause a crash.
639 * Staging and out-of-tree GPL modules are fine.
640 */
641 if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)))
642 return 0;
643 mutex_lock(&tracepoints_mutex);
644 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
645 if (!tp_mod) {
646 ret = -ENOMEM;
647 goto end;
648 }
649 tp_mod->num_tracepoints = mod->num_tracepoints;
650 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
651
652 /*
653 * tracepoint_module_list is kept sorted by struct module pointer
654 * address for iteration on tracepoints from a seq_file that can release
655 * the mutex between calls.
656 */
657 list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
658 BUG_ON(iter == tp_mod); /* Should never be in the list twice */
659 if (iter < tp_mod) {
660 /* We belong to the location right after iter. */
661 list_add(&tp_mod->list, &iter->list);
662 goto module_added;
663 }
664 }
665 /* We belong to the beginning of the list */
666 list_add(&tp_mod->list, &tracepoint_module_list);
667module_added:
668 tracepoint_update_probe_range(mod->tracepoints_ptrs,
669 mod->tracepoints_ptrs + mod->num_tracepoints);
670end:
671 mutex_unlock(&tracepoints_mutex);
672 return ret;
673}
674
675static int tracepoint_module_going(struct module *mod)
676{
677 struct tp_module *pos;
678
679 mutex_lock(&tracepoints_mutex);
680 tracepoint_update_probe_range(mod->tracepoints_ptrs,
681 mod->tracepoints_ptrs + mod->num_tracepoints);
682 list_for_each_entry(pos, &tracepoint_module_list, list) {
683 if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
684 list_del(&pos->list);
685 kfree(pos);
686 break;
687 }
688 }
689 /*
690 * In the case of modules that were tainted at "coming", we'll simply
691 * walk through the list without finding it. We cannot use the "tainted"
692 * flag on "going", in case a module taints the kernel only after being
693 * loaded.
694 */
695 mutex_unlock(&tracepoints_mutex);
696 return 0;
697}
698 575
699int tracepoint_module_notify(struct notifier_block *self, 576int tracepoint_module_notify(struct notifier_block *self,
700 unsigned long val, void *data) 577 unsigned long val, void *data)
701{ 578{
702 struct module *mod = data; 579 struct module *mod = data;
703 int ret = 0;
704 580
705 switch (val) { 581 switch (val) {
706 case MODULE_STATE_COMING: 582 case MODULE_STATE_COMING:
707 ret = tracepoint_module_coming(mod);
708 break;
709 case MODULE_STATE_LIVE:
710 break;
711 case MODULE_STATE_GOING: 583 case MODULE_STATE_GOING:
712 ret = tracepoint_module_going(mod); 584 tracepoint_update_probe_range(mod->tracepoints_ptrs,
585 mod->tracepoints_ptrs + mod->num_tracepoints);
713 break; 586 break;
714 } 587 }
715 return ret; 588 return 0;
716} 589}
717 590
718struct notifier_block tracepoint_module_nb = { 591struct notifier_block tracepoint_module_nb = {
@@ -725,6 +598,7 @@ static int init_tracepoints(void)
725 return register_module_notifier(&tracepoint_module_nb); 598 return register_module_notifier(&tracepoint_module_nb);
726} 599}
727__initcall(init_tracepoints); 600__initcall(init_tracepoints);
601
728#endif /* CONFIG_MODULES */ 602#endif /* CONFIG_MODULES */
729 603
730#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS 604#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
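The tracepoint.c hunks above are largely a move from the static_key API (left column) back to the older jump_label API (right column), plus removal of the per-module tracepoint list. A minimal sketch of the enable/disable pattern written against the newer API on the left-hand side; the key and function names are invented and only mirror the calls visible in set_tracepoint() and disable_tracepoint().

/* Sketch only: an enable/disable pair using the same static_key calls
 * as the left-hand column above. The old API names being restored by
 * this patch are noted in the comments. */
#include <linux/static_key.h>

static struct static_key example_key = STATIC_KEY_INIT_FALSE;

static void example_enable(void)
{
        if (!static_key_enabled(&example_key))     /* old API: jump_label_enabled() */
                static_key_slow_inc(&example_key); /* old API: jump_label_inc() */
}

static void example_disable(void)
{
        if (static_key_enabled(&example_key))
                static_key_slow_dec(&example_key); /* old API: jump_label_dec() */
}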
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 625df0b4469..5bbfac85866 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -26,9 +26,7 @@
26/* 26/*
27 * fill in basic accounting fields 27 * fill in basic accounting fields
28 */ 28 */
29void bacct_add_tsk(struct user_namespace *user_ns, 29void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
30 struct pid_namespace *pid_ns,
31 struct taskstats *stats, struct task_struct *tsk)
32{ 30{
33 const struct cred *tcred; 31 const struct cred *tcred;
34 struct timespec uptime, ts; 32 struct timespec uptime, ts;
@@ -57,13 +55,13 @@ void bacct_add_tsk(struct user_namespace *user_ns,
57 stats->ac_flag |= AXSIG; 55 stats->ac_flag |= AXSIG;
58 stats->ac_nice = task_nice(tsk); 56 stats->ac_nice = task_nice(tsk);
59 stats->ac_sched = tsk->policy; 57 stats->ac_sched = tsk->policy;
60 stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); 58 stats->ac_pid = tsk->pid;
61 rcu_read_lock(); 59 rcu_read_lock();
62 tcred = __task_cred(tsk); 60 tcred = __task_cred(tsk);
63 stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); 61 stats->ac_uid = tcred->uid;
64 stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); 62 stats->ac_gid = tcred->gid;
65 stats->ac_ppid = pid_alive(tsk) ? 63 stats->ac_ppid = pid_alive(tsk) ?
66 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; 64 rcu_dereference(tsk->real_parent)->tgid : 0;
67 rcu_read_unlock(); 65 rcu_read_unlock();
68 stats->ac_utime = cputime_to_usecs(tsk->utime); 66 stats->ac_utime = cputime_to_usecs(tsk->utime);
69 stats->ac_stime = cputime_to_usecs(tsk->stime); 67 stats->ac_stime = cputime_to_usecs(tsk->stime);
@@ -129,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
129 127
130 local_irq_save(flags); 128 local_irq_save(flags);
131 time = tsk->stime + tsk->utime; 129 time = tsk->stime + tsk->utime;
132 dtime = time - tsk->acct_timexpd; 130 dtime = cputime_sub(time, tsk->acct_timexpd);
133 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
134 delta = value.tv_sec; 132 delta = value.tv_sec;
135 delta = delta * USEC_PER_SEC + value.tv_usec; 133 delta = delta * USEC_PER_SEC + value.tv_usec;
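The acct_update_integrals() hunk swaps a plain subtraction for cputime_sub(). On architectures using the generic jiffies-based cputime_t of this era the helper is believed to expand to ordinary integer subtraction, so both forms compute the same delta; a stand-alone illustration, with the typedef and values invented for the example:

#include <stdio.h>

typedef unsigned long cputime_t;                /* generic (jiffies-based) flavour */
#define cputime_sub(__a, __b)   ((__a) - (__b)) /* assumed asm-generic definition of this vintage */

int main(void)
{
        cputime_t time = 1500, acct_timexpd = 300;

        printf("plain subtraction: %lu\n", time - acct_timexpd);
        printf("cputime_sub():     %lu\n", cputime_sub(time, acct_timexpd));
        return 0;
}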
diff --git a/kernel/uid16.c b/kernel/uid16.c
index d7948eb1022..51c6e89e861 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -81,19 +81,14 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
81 return ret; 81 return ret;
82} 82}
83 83
84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) 84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid)
85{ 85{
86 const struct cred *cred = current_cred(); 86 const struct cred *cred = current_cred();
87 int retval; 87 int retval;
88 old_uid_t ruid, euid, suid;
89 88
90 ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid)); 89 if (!(retval = put_user(high2lowuid(cred->uid), ruid)) &&
91 euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid)); 90 !(retval = put_user(high2lowuid(cred->euid), euid)))
92 suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid)); 91 retval = put_user(high2lowuid(cred->suid), suid);
93
94 if (!(retval = put_user(ruid, ruidp)) &&
95 !(retval = put_user(euid, euidp)))
96 retval = put_user(suid, suidp);
97 92
98 return retval; 93 return retval;
99} 94}
@@ -108,19 +103,14 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
108} 103}
109 104
110 105
111SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) 106SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid)
112{ 107{
113 const struct cred *cred = current_cred(); 108 const struct cred *cred = current_cred();
114 int retval; 109 int retval;
115 old_gid_t rgid, egid, sgid;
116
117 rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid));
118 egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid));
119 sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid));
120 110
121 if (!(retval = put_user(rgid, rgidp)) && 111 if (!(retval = put_user(high2lowgid(cred->gid), rgid)) &&
122 !(retval = put_user(egid, egidp))) 112 !(retval = put_user(high2lowgid(cred->egid), egid)))
123 retval = put_user(sgid, sgidp); 113 retval = put_user(high2lowgid(cred->sgid), sgid);
124 114
125 return retval; 115 return retval;
126} 116}
@@ -144,14 +134,11 @@ SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
144static int groups16_to_user(old_gid_t __user *grouplist, 134static int groups16_to_user(old_gid_t __user *grouplist,
145 struct group_info *group_info) 135 struct group_info *group_info)
146{ 136{
147 struct user_namespace *user_ns = current_user_ns();
148 int i; 137 int i;
149 old_gid_t group; 138 old_gid_t group;
150 kgid_t kgid;
151 139
152 for (i = 0; i < group_info->ngroups; i++) { 140 for (i = 0; i < group_info->ngroups; i++) {
153 kgid = GROUP_AT(group_info, i); 141 group = high2lowgid(GROUP_AT(group_info, i));
154 group = high2lowgid(from_kgid_munged(user_ns, kgid));
155 if (put_user(group, grouplist+i)) 142 if (put_user(group, grouplist+i))
156 return -EFAULT; 143 return -EFAULT;
157 } 144 }
@@ -162,20 +149,13 @@ static int groups16_to_user(old_gid_t __user *grouplist,
162static int groups16_from_user(struct group_info *group_info, 149static int groups16_from_user(struct group_info *group_info,
163 old_gid_t __user *grouplist) 150 old_gid_t __user *grouplist)
164{ 151{
165 struct user_namespace *user_ns = current_user_ns();
166 int i; 152 int i;
167 old_gid_t group; 153 old_gid_t group;
168 kgid_t kgid;
169 154
170 for (i = 0; i < group_info->ngroups; i++) { 155 for (i = 0; i < group_info->ngroups; i++) {
171 if (get_user(group, grouplist+i)) 156 if (get_user(group, grouplist+i))
172 return -EFAULT; 157 return -EFAULT;
173 158 GROUP_AT(group_info, i) = low2highgid(group);
174 kgid = make_kgid(user_ns, low2highgid(group));
175 if (!gid_valid(kgid))
176 return -EINVAL;
177
178 GROUP_AT(group_info, i) = kgid;
179 } 159 }
180 160
181 return 0; 161 return 0;
@@ -231,20 +211,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
231 211
232SYSCALL_DEFINE0(getuid16) 212SYSCALL_DEFINE0(getuid16)
233{ 213{
234 return high2lowuid(from_kuid_munged(current_user_ns(), current_uid())); 214 return high2lowuid(current_uid());
235} 215}
236 216
237SYSCALL_DEFINE0(geteuid16) 217SYSCALL_DEFINE0(geteuid16)
238{ 218{
239 return high2lowuid(from_kuid_munged(current_user_ns(), current_euid())); 219 return high2lowuid(current_euid());
240} 220}
241 221
242SYSCALL_DEFINE0(getgid16) 222SYSCALL_DEFINE0(getgid16)
243{ 223{
244 return high2lowgid(from_kgid_munged(current_user_ns(), current_gid())); 224 return high2lowgid(current_gid());
245} 225}
246 226
247SYSCALL_DEFINE0(getegid16) 227SYSCALL_DEFINE0(getegid16)
248{ 228{
249 return high2lowgid(from_kgid_munged(current_user_ns(), current_egid())); 229 return high2lowgid(current_egid());
250} 230}
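The uid16.c hunks drop the namespace-aware kuid_t/kgid_t conversions (left column) in favour of raw uid_t/gid_t arithmetic (right column). For reference, the conversion pattern on the newer side of the diff looks roughly like the sketch below; the example_* helpers are invented, while from_kgid_munged(), make_kgid(), gid_valid(), high2lowgid() and low2highgid() are the same calls visible in the hunks.

/* Sketch: legacy 16-bit gid <-> kernel-internal kgid_t, as on the
 * left-hand side of the hunks above. Not taken from this patch. */
#include <linux/errno.h>
#include <linux/highuid.h>
#include <linux/uidgid.h>

static old_gid_t example_to_old_gid(struct user_namespace *user_ns, kgid_t kgid)
{
        /* from_kgid_munged() never fails; unmapped ids become overflowgid. */
        return high2lowgid(from_kgid_munged(user_ns, kgid));
}

static int example_from_old_gid(struct user_namespace *user_ns,
                                old_gid_t group, kgid_t *out)
{
        kgid_t kgid = make_kgid(user_ns, low2highgid(group));

        if (!gid_valid(kgid))
                return -EINVAL;
        *out = kgid;
        return 0;
}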
diff --git a/kernel/up.c b/kernel/up.c
index c54c75e9faf..1ff27a28bb7 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -4,7 +4,7 @@
4 4
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h> 6#include <linux/kernel.h>
7#include <linux/export.h> 7#include <linux/module.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9 9
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 10int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 1744bb80f1f..92cb706c7fc 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -2,7 +2,7 @@
2#include <linux/user-return-notifier.h> 2#include <linux/user-return-notifier.h>
3#include <linux/percpu.h> 3#include <linux/percpu.h>
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/export.h> 5#include <linux/module.h>
6 6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); 7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8 8
diff --git a/kernel/user.c b/kernel/user.c
index 33acb5e53a5..9e03e9c1df8 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,45 +14,18 @@
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/key.h> 15#include <linux/key.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/export.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include <linux/proc_fs.h>
20 19
21/* 20/*
22 * userns count is 1 for root user, 1 for init_uts_ns, 21 * userns count is 1 for root user, 1 for init_uts_ns,
23 * and 1 for... ? 22 * and 1 for... ?
24 */ 23 */
25struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
26 .uid_map = {
27 .nr_extents = 1,
28 .extent[0] = {
29 .first = 0,
30 .lower_first = 0,
31 .count = 4294967295U,
32 },
33 },
34 .gid_map = {
35 .nr_extents = 1,
36 .extent[0] = {
37 .first = 0,
38 .lower_first = 0,
39 .count = 4294967295U,
40 },
41 },
42 .projid_map = {
43 .nr_extents = 1,
44 .extent[0] = {
45 .first = 0,
46 .lower_first = 0,
47 .count = 4294967295U,
48 },
49 },
50 .kref = { 25 .kref = {
51 .refcount = ATOMIC_INIT(3), 26 .refcount = ATOMIC_INIT(3),
52 }, 27 },
53 .owner = GLOBAL_ROOT_UID, 28 .creator = &root_user,
54 .group = GLOBAL_ROOT_GID,
55 .proc_inum = PROC_USER_INIT_INO,
56}; 29};
57EXPORT_SYMBOL_GPL(init_user_ns); 30EXPORT_SYMBOL_GPL(init_user_ns);
58 31
@@ -61,14 +34,11 @@ EXPORT_SYMBOL_GPL(init_user_ns);
61 * when changing user ID's (ie setuid() and friends). 34 * when changing user ID's (ie setuid() and friends).
62 */ 35 */
63 36
64#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7)
65#define UIDHASH_SZ (1 << UIDHASH_BITS)
66#define UIDHASH_MASK (UIDHASH_SZ - 1) 37#define UIDHASH_MASK (UIDHASH_SZ - 1)
67#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 38#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
68#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) 39#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid)))
69 40
70static struct kmem_cache *uid_cachep; 41static struct kmem_cache *uid_cachep;
71struct hlist_head uidhash_table[UIDHASH_SZ];
72 42
73/* 43/*
74 * The uidhash_lock is mostly taken from process context, but it is 44 * The uidhash_lock is mostly taken from process context, but it is
@@ -81,14 +51,14 @@ struct hlist_head uidhash_table[UIDHASH_SZ];
81 */ 51 */
82static DEFINE_SPINLOCK(uidhash_lock); 52static DEFINE_SPINLOCK(uidhash_lock);
83 53
84/* root_user.__count is 1, for init task cred */ 54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
85struct user_struct root_user = { 55struct user_struct root_user = {
86 .__count = ATOMIC_INIT(1), 56 .__count = ATOMIC_INIT(2),
87 .processes = ATOMIC_INIT(1), 57 .processes = ATOMIC_INIT(1),
88 .files = ATOMIC_INIT(0), 58 .files = ATOMIC_INIT(0),
89 .sigpending = ATOMIC_INIT(0), 59 .sigpending = ATOMIC_INIT(0),
90 .locked_shm = 0, 60 .locked_shm = 0,
91 .uid = GLOBAL_ROOT_UID, 61 .user_ns = &init_user_ns,
92}; 62};
93 63
94/* 64/*
@@ -102,15 +72,16 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
102static void uid_hash_remove(struct user_struct *up) 72static void uid_hash_remove(struct user_struct *up)
103{ 73{
104 hlist_del_init(&up->uidhash_node); 74 hlist_del_init(&up->uidhash_node);
75 put_user_ns(up->user_ns);
105} 76}
106 77
107static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) 78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{ 79{
109 struct user_struct *user; 80 struct user_struct *user;
110 struct hlist_node *h; 81 struct hlist_node *h;
111 82
112 hlist_for_each_entry(user, h, hashent, uidhash_node) { 83 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (uid_eq(user->uid, uid)) { 84 if (user->uid == uid) {
114 atomic_inc(&user->__count); 85 atomic_inc(&user->__count);
115 return user; 86 return user;
116 } 87 }
@@ -139,13 +110,14 @@ static void free_user(struct user_struct *up, unsigned long flags)
139 * 110 *
140 * If the user_struct could not be found, return NULL. 111 * If the user_struct could not be found, return NULL.
141 */ 112 */
142struct user_struct *find_user(kuid_t uid) 113struct user_struct *find_user(uid_t uid)
143{ 114{
144 struct user_struct *ret; 115 struct user_struct *ret;
145 unsigned long flags; 116 unsigned long flags;
117 struct user_namespace *ns = current_user_ns();
146 118
147 spin_lock_irqsave(&uidhash_lock, flags); 119 spin_lock_irqsave(&uidhash_lock, flags);
148 ret = uid_hash_find(uid, uidhashentry(uid)); 120 ret = uid_hash_find(uid, uidhashentry(ns, uid));
149 spin_unlock_irqrestore(&uidhash_lock, flags); 121 spin_unlock_irqrestore(&uidhash_lock, flags);
150 return ret; 122 return ret;
151} 123}
@@ -164,9 +136,9 @@ void free_uid(struct user_struct *up)
164 local_irq_restore(flags); 136 local_irq_restore(flags);
165} 137}
166 138
167struct user_struct *alloc_uid(kuid_t uid) 139struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
168{ 140{
169 struct hlist_head *hashent = uidhashentry(uid); 141 struct hlist_head *hashent = uidhashentry(ns, uid);
170 struct user_struct *up, *new; 142 struct user_struct *up, *new;
171 143
172 spin_lock_irq(&uidhash_lock); 144 spin_lock_irq(&uidhash_lock);
@@ -181,6 +153,8 @@ struct user_struct *alloc_uid(kuid_t uid)
181 new->uid = uid; 153 new->uid = uid;
182 atomic_set(&new->__count, 1); 154 atomic_set(&new->__count, 1);
183 155
156 new->user_ns = get_user_ns(ns);
157
184 /* 158 /*
185 * Before adding this, check whether we raced 159 * Before adding this, check whether we raced
186 * on adding the same user already.. 160 * on adding the same user already..
@@ -188,6 +162,7 @@ struct user_struct *alloc_uid(kuid_t uid)
188 spin_lock_irq(&uidhash_lock); 162 spin_lock_irq(&uidhash_lock);
189 up = uid_hash_find(uid, hashent); 163 up = uid_hash_find(uid, hashent);
190 if (up) { 164 if (up) {
165 put_user_ns(ns);
191 key_put(new->uid_keyring); 166 key_put(new->uid_keyring);
192 key_put(new->session_keyring); 167 key_put(new->session_keyring);
193 kmem_cache_free(uid_cachep, new); 168 kmem_cache_free(uid_cachep, new);
@@ -212,11 +187,11 @@ static int __init uid_cache_init(void)
212 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 187 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
213 188
214 for(n = 0; n < UIDHASH_SZ; ++n) 189 for(n = 0; n < UIDHASH_SZ; ++n)
215 INIT_HLIST_HEAD(uidhash_table + n); 190 INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
216 191
217 /* Insert the root user immediately (init already runs as root) */ 192 /* Insert the root user immediately (init already runs as root) */
218 spin_lock_irq(&uidhash_lock); 193 spin_lock_irq(&uidhash_lock);
219 uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); 194 uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
220 spin_unlock_irq(&uidhash_lock); 195 spin_unlock_irq(&uidhash_lock);
221 196
222 return 0; 197 return 0;
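The user.c hunks move the uid hash back from a single global table keyed by kuid_t to the per-user-namespace table addressed by uidhashentry(ns, uid). The bucket arithmetic is identical on both sides of the diff; a stand-alone, user-space check of it could look like the following, assuming UIDHASH_BITS = 7 (the !CONFIG_BASE_SMALL value shown in the removed lines).

#include <stdio.h>

/* The three derived macros are copied from the hunk above;
 * UIDHASH_BITS = 7 is an assumption. Demonstration only. */
#define UIDHASH_BITS    7
#define UIDHASH_SZ      (1 << UIDHASH_BITS)
#define UIDHASH_MASK    (UIDHASH_SZ - 1)
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)

int main(void)
{
        unsigned int uid;

        for (uid = 0; uid < 3; uid++)
                printf("uid %u -> bucket %u\n", uid, __uidhashfn(uid));
        printf("uid 1000  -> bucket %u\n", __uidhashfn(1000u));
        printf("uid 65534 -> bucket %u\n", __uidhashfn(65534u));
        return 0;
}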
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2b042c42fbc..9da289c34f2 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -5,46 +5,15 @@
5 * License. 5 * License.
6 */ 6 */
7 7
8#include <linux/export.h> 8#include <linux/module.h>
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/proc_fs.h>
13#include <linux/highuid.h> 12#include <linux/highuid.h>
14#include <linux/cred.h> 13#include <linux/cred.h>
15#include <linux/securebits.h>
16#include <linux/keyctl.h>
17#include <linux/key-type.h>
18#include <keys/user-type.h>
19#include <linux/seq_file.h>
20#include <linux/fs.h>
21#include <linux/uaccess.h>
22#include <linux/ctype.h>
23#include <linux/projid.h>
24 14
25static struct kmem_cache *user_ns_cachep __read_mostly; 15static struct kmem_cache *user_ns_cachep __read_mostly;
26 16
27static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
28 struct uid_gid_map *map);
29
30static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
31{
32 /* Start with the same capabilities as init but useless for doing
33 * anything as the capabilities are bound to the new user namespace.
34 */
35 cred->securebits = SECUREBITS_DEFAULT;
36 cred->cap_inheritable = CAP_EMPTY_SET;
37 cred->cap_permitted = CAP_FULL_SET;
38 cred->cap_effective = CAP_FULL_SET;
39 cred->cap_bset = CAP_FULL_SET;
40#ifdef CONFIG_KEYS
41 key_put(cred->request_key_auth);
42 cred->request_key_auth = NULL;
43#endif
44 /* tgcred will be cleared in our caller because CLONE_THREAD won't be set */
45 cred->user_ns = user_ns;
46}
47
48/* 17/*
49 * Create a new user namespace, deriving the creator from the user in the 18 * Create a new user namespace, deriving the creator from the user in the
50 * passed credentials, and replacing that user with the new root user for the 19 * passed credentials, and replacing that user with the new root user for the
@@ -55,782 +24,111 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
55 */ 24 */
56int create_user_ns(struct cred *new) 25int create_user_ns(struct cred *new)
57{ 26{
58 struct user_namespace *ns, *parent_ns = new->user_ns; 27 struct user_namespace *ns;
59 kuid_t owner = new->euid; 28 struct user_struct *root_user;
60 kgid_t group = new->egid; 29 int n;
61 int ret;
62 30
63 /* The creator needs a mapping in the parent user namespace 31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
64 * or else we won't be able to reasonably tell userspace who
65 * created a user_namespace.
66 */
67 if (!kuid_has_mapping(parent_ns, owner) ||
68 !kgid_has_mapping(parent_ns, group))
69 return -EPERM;
70
71 ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
72 if (!ns) 32 if (!ns)
73 return -ENOMEM; 33 return -ENOMEM;
74 34
75 ret = proc_alloc_inum(&ns->proc_inum);
76 if (ret) {
77 kmem_cache_free(user_ns_cachep, ns);
78 return ret;
79 }
80
81 kref_init(&ns->kref); 35 kref_init(&ns->kref);
82 /* Leave the new->user_ns reference with the new user namespace. */
83 ns->parent = parent_ns;
84 ns->owner = owner;
85 ns->group = group;
86 36
87 set_cred_user_ns(new, ns); 37 for (n = 0; n < UIDHASH_SZ; ++n)
38 INIT_HLIST_HEAD(ns->uidhash_table + n);
88 39
89 return 0; 40 /* Alloc new root user. */
90} 41 root_user = alloc_uid(ns, 0);
91 42 if (!root_user) {
92int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) 43 kmem_cache_free(user_ns_cachep, ns);
93{
94 struct cred *cred;
95
96 if (!(unshare_flags & CLONE_NEWUSER))
97 return 0;
98
99 cred = prepare_creds();
100 if (!cred)
101 return -ENOMEM; 44 return -ENOMEM;
102
103 *new_cred = cred;
104 return create_user_ns(cred);
105}
106
107void free_user_ns(struct kref *kref)
108{
109 struct user_namespace *parent, *ns =
110 container_of(kref, struct user_namespace, kref);
111
112 parent = ns->parent;
113 proc_free_inum(ns->proc_inum);
114 kmem_cache_free(user_ns_cachep, ns);
115 put_user_ns(parent);
116}
117EXPORT_SYMBOL(free_user_ns);
118
119static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
120{
121 unsigned idx, extents;
122 u32 first, last, id2;
123
124 id2 = id + count - 1;
125
126 /* Find the matching extent */
127 extents = map->nr_extents;
128 smp_read_barrier_depends();
129 for (idx = 0; idx < extents; idx++) {
130 first = map->extent[idx].first;
131 last = first + map->extent[idx].count - 1;
132 if (id >= first && id <= last &&
133 (id2 >= first && id2 <= last))
134 break;
135 } 45 }
136 /* Map the id or note failure */
137 if (idx < extents)
138 id = (id - first) + map->extent[idx].lower_first;
139 else
140 id = (u32) -1;
141
142 return id;
143}
144
145static u32 map_id_down(struct uid_gid_map *map, u32 id)
146{
147 unsigned idx, extents;
148 u32 first, last;
149
150 /* Find the matching extent */
151 extents = map->nr_extents;
152 smp_read_barrier_depends();
153 for (idx = 0; idx < extents; idx++) {
154 first = map->extent[idx].first;
155 last = first + map->extent[idx].count - 1;
156 if (id >= first && id <= last)
157 break;
158 }
159 /* Map the id or note failure */
160 if (idx < extents)
161 id = (id - first) + map->extent[idx].lower_first;
162 else
163 id = (u32) -1;
164
165 return id;
166}
167
168static u32 map_id_up(struct uid_gid_map *map, u32 id)
169{
170 unsigned idx, extents;
171 u32 first, last;
172
173 /* Find the matching extent */
174 extents = map->nr_extents;
175 smp_read_barrier_depends();
176 for (idx = 0; idx < extents; idx++) {
177 first = map->extent[idx].lower_first;
178 last = first + map->extent[idx].count - 1;
179 if (id >= first && id <= last)
180 break;
181 }
182 /* Map the id or note failure */
183 if (idx < extents)
184 id = (id - first) + map->extent[idx].first;
185 else
186 id = (u32) -1;
187
188 return id;
189}
190
191/**
192 * make_kuid - Map a user-namespace uid pair into a kuid.
193 * @ns: User namespace that the uid is in
194 * @uid: User identifier
195 *
196 * Maps a user-namespace uid pair into a kernel internal kuid,
197 * and returns that kuid.
198 *
199 * When there is no mapping defined for the user-namespace uid
200 * pair INVALID_UID is returned. Callers are expected to test
201 * for and handle INVALID_UID being returned. INVALID_UID
202 * may be tested for using uid_valid().
203 */
204kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
205{
206 /* Map the uid to a global kernel uid */
207 return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
208}
209EXPORT_SYMBOL(make_kuid);
210
211/**
212 * from_kuid - Create a uid from a kuid user-namespace pair.
213 * @targ: The user namespace we want a uid in.
214 * @kuid: The kernel internal uid to start with.
215 *
216 * Map @kuid into the user-namespace specified by @targ and
217 * return the resulting uid.
218 *
219 * There is always a mapping into the initial user_namespace.
220 *
221 * If @kuid has no mapping in @targ (uid_t)-1 is returned.
222 */
223uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
224{
225 /* Map the uid from a global kernel uid */
226 return map_id_up(&targ->uid_map, __kuid_val(kuid));
227}
228EXPORT_SYMBOL(from_kuid);
229
230/**
231 * from_kuid_munged - Create a uid from a kuid user-namespace pair.
232 * @targ: The user namespace we want a uid in.
233 * @kuid: The kernel internal uid to start with.
234 *
235 * Map @kuid into the user-namespace specified by @targ and
236 * return the resulting uid.
237 *
238 * There is always a mapping into the initial user_namespace.
239 *
240 * Unlike from_kuid from_kuid_munged never fails and always
241 * returns a valid uid. This makes from_kuid_munged appropriate
242 * for use in syscalls like stat and getuid where failing the
243 * system call and failing to provide a valid uid are not
244 * options.
245 *
246 * If @kuid has no mapping in @targ overflowuid is returned.
247 */
248uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
249{
250 uid_t uid;
251 uid = from_kuid(targ, kuid);
252
253 if (uid == (uid_t) -1)
254 uid = overflowuid;
255 return uid;
256}
257EXPORT_SYMBOL(from_kuid_munged);
258
259/**
260 * make_kgid - Map a user-namespace gid pair into a kgid.
261 * @ns: User namespace that the gid is in
262 * @gid: Group identifier
263 *
264 * Maps a user-namespace gid pair into a kernel internal kgid,
265 * and returns that kgid.
266 *
267 * When there is no mapping defined for the user-namespace gid
268 * pair INVALID_GID is returned. Callers are expected to test
269 * for and handle INVALID_GID being returned. INVALID_GID may be
270 * tested for using gid_valid().
271 */
272kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
273{
274 /* Map the gid to a global kernel gid */
275 return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
276}
277EXPORT_SYMBOL(make_kgid);
278
279/**
280 * from_kgid - Create a gid from a kgid user-namespace pair.
281 * @targ: The user namespace we want a gid in.
282 * @kgid: The kernel internal gid to start with.
283 *
284 * Map @kgid into the user-namespace specified by @targ and
285 * return the resulting gid.
286 *
287 * There is always a mapping into the initial user_namespace.
288 *
289 * If @kgid has no mapping in @targ (gid_t)-1 is returned.
290 */
291gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
292{
293 /* Map the gid from a global kernel gid */
294 return map_id_up(&targ->gid_map, __kgid_val(kgid));
295}
296EXPORT_SYMBOL(from_kgid);
297
298/**
299 * from_kgid_munged - Create a gid from a kgid user-namespace pair.
300 * @targ: The user namespace we want a gid in.
301 * @kgid: The kernel internal gid to start with.
302 *
303 * Map @kgid into the user-namespace specified by @targ and
304 * return the resulting gid.
305 *
306 * There is always a mapping into the initial user_namespace.
307 *
308 * Unlike from_kgid, from_kgid_munged never fails and always
309 * returns a valid gid. This makes from_kgid_munged appropriate
310 * for use in syscalls like stat and getgid where failing the
311 * system call and failing to provide a valid gid are not options.
312 *
313 * If @kgid has no mapping in @targ overflowgid is returned.
314 */
315gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
316{
317 gid_t gid;
318 gid = from_kgid(targ, kgid);
319
320 if (gid == (gid_t) -1)
321 gid = overflowgid;
322 return gid;
323}
324EXPORT_SYMBOL(from_kgid_munged);
325
326/**
327 * make_kprojid - Map a user-namespace projid pair into a kprojid.
328 * @ns: User namespace that the projid is in
329 * @projid: Project identifier
330 *
331 * Maps a user-namespace projid pair into a kernel internal kprojid,
332 * and returns that kprojid.
333 *
334 * When there is no mapping defined for the user-namespace projid
335 * pair INVALID_PROJID is returned. Callers are expected to test
336 * for and handle INVALID_PROJID being returned. INVALID_PROJID
337 * may be tested for using projid_valid().
338 */
339kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
340{
341 /* Map the projid to a global kernel projid */
342 return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
343}
344EXPORT_SYMBOL(make_kprojid);
345
346/**
347 * from_kprojid - Create a projid from a kprojid user-namespace pair.
348 * @targ: The user namespace we want a projid in.
349 * @kprojid: The kernel internal project identifier to start with.
350 *
351 * Map @kprojid into the user-namespace specified by @targ and
352 * return the resulting projid.
353 *
354 * There is always a mapping into the initial user_namespace.
355 *
356 * If @kprojid has no mapping in @targ (projid_t)-1 is returned.
357 */
358projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
359{
360 /* Map the projid from a global kernel projid */
361 return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
362}
363EXPORT_SYMBOL(from_kprojid);
364
365/**
366 * from_kprojid_munged - Create a projid from a kprojid user-namespace pair.
367 * @targ: The user namespace we want a projid in.
368 * @kprojid: The kernel internal projid to start with.
369 *
370 * Map @kprojid into the user-namespace specified by @targ and
371 * return the resulting projid.
372 *
373 * There is always a mapping into the initial user_namespace.
374 *
375 * Unlike from_kprojid, from_kprojid_munged never fails and always
376 * returns a valid projid. This makes from_kprojid_munged
377 * appropriate for use in syscalls like stat, where
378 * failing the system call and failing to provide a valid projid are
379 * not options.
380 *
381 * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
382 */
383projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
384{
385 projid_t projid;
386 projid = from_kprojid(targ, kprojid);
387
388 if (projid == (projid_t) -1)
389 projid = OVERFLOW_PROJID;
390 return projid;
391}
392EXPORT_SYMBOL(from_kprojid_munged);
393
394
395static int uid_m_show(struct seq_file *seq, void *v)
396{
397 struct user_namespace *ns = seq->private;
398 struct uid_gid_extent *extent = v;
399 struct user_namespace *lower_ns;
400 uid_t lower;
401
402 lower_ns = seq_user_ns(seq);
403 if ((lower_ns == ns) && lower_ns->parent)
404 lower_ns = lower_ns->parent;
405
406 lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));
407
408 seq_printf(seq, "%10u %10u %10u\n",
409 extent->first,
410 lower,
411 extent->count);
412
413 return 0;
414}
415
416static int gid_m_show(struct seq_file *seq, void *v)
417{
418 struct user_namespace *ns = seq->private;
419 struct uid_gid_extent *extent = v;
420 struct user_namespace *lower_ns;
421 gid_t lower;
422
423 lower_ns = seq_user_ns(seq);
424 if ((lower_ns == ns) && lower_ns->parent)
425 lower_ns = lower_ns->parent;
426
427 lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));
428
429 seq_printf(seq, "%10u %10u %10u\n",
430 extent->first,
431 lower,
432 extent->count);
433
434 return 0;
435}
436 46
437static int projid_m_show(struct seq_file *seq, void *v) 47 /* set the new root user in the credentials under preparation */
438{ 48 ns->creator = new->user;
439 struct user_namespace *ns = seq->private; 49 new->user = root_user;
440 struct uid_gid_extent *extent = v; 50 new->uid = new->euid = new->suid = new->fsuid = 0;
441 struct user_namespace *lower_ns; 51 new->gid = new->egid = new->sgid = new->fsgid = 0;
442 projid_t lower; 52 put_group_info(new->group_info);
443 53 new->group_info = get_group_info(&init_groups);
444 lower_ns = seq_user_ns(seq); 54#ifdef CONFIG_KEYS
445 if ((lower_ns == ns) && lower_ns->parent) 55 key_put(new->request_key_auth);
446 lower_ns = lower_ns->parent; 56 new->request_key_auth = NULL;
447 57#endif
448 lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); 58 /* tgcred will be cleared in our caller because CLONE_THREAD won't be set */
449 59
450 seq_printf(seq, "%10u %10u %10u\n", 60 /* root_user holds a reference to ns, our reference can be dropped */
451 extent->first, 61 put_user_ns(ns);
452 lower,
453 extent->count);
454 62
455 return 0; 63 return 0;
456} 64}
457 65
458static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) 66/*
459{ 67 * Deferred destructor for a user namespace. This is required because
460 struct uid_gid_extent *extent = NULL; 68 * free_user_ns() may be called with uidhash_lock held, but we need to call
461 loff_t pos = *ppos; 69 * back to free_uid() which will want to take the lock again.
462 70 */
463 if (pos < map->nr_extents) 71static void free_user_ns_work(struct work_struct *work)
464 extent = &map->extent[pos];
465
466 return extent;
467}
468
469static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
470{
471 struct user_namespace *ns = seq->private;
472
473 return m_start(seq, ppos, &ns->uid_map);
474}
475
476static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
477{
478 struct user_namespace *ns = seq->private;
479
480 return m_start(seq, ppos, &ns->gid_map);
481}
482
483static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
484{ 72{
485 struct user_namespace *ns = seq->private; 73 struct user_namespace *ns =
486 74 container_of(work, struct user_namespace, destroyer);
487 return m_start(seq, ppos, &ns->projid_map); 75 free_uid(ns->creator);
76 kmem_cache_free(user_ns_cachep, ns);
488} 77}
489 78
490static void *m_next(struct seq_file *seq, void *v, loff_t *pos) 79void free_user_ns(struct kref *kref)
491{ 80{
492 (*pos)++; 81 struct user_namespace *ns =
493 return seq->op->start(seq, pos); 82 container_of(kref, struct user_namespace, kref);
494}
495 83
496static void m_stop(struct seq_file *seq, void *v) 84 INIT_WORK(&ns->destroyer, free_user_ns_work);
497{ 85 schedule_work(&ns->destroyer);
498 return;
499} 86}
87EXPORT_SYMBOL(free_user_ns);
500 88
501struct seq_operations proc_uid_seq_operations = { 89uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
502 .start = uid_m_start,
503 .stop = m_stop,
504 .next = m_next,
505 .show = uid_m_show,
506};
507
508struct seq_operations proc_gid_seq_operations = {
509 .start = gid_m_start,
510 .stop = m_stop,
511 .next = m_next,
512 .show = gid_m_show,
513};
514
515struct seq_operations proc_projid_seq_operations = {
516 .start = projid_m_start,
517 .stop = m_stop,
518 .next = m_next,
519 .show = projid_m_show,
520};
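From userspace, the seq_file output produced by uid_m_show()/gid_m_show() above appears as three right-aligned columns ("%10u %10u %10u": first, lower_first, count). A trivially runnable reader follows; the sample line in the comment is the identity mapping of the initial namespace and is only illustrative.

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/self/uid_map", "r");

	if (!f) {
		perror("/proc/self/uid_map");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "         0          0 4294967295" */
	fclose(f);
	return 0;
}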
521
522static DEFINE_MUTEX(id_map_mutex);
523
524static ssize_t map_write(struct file *file, const char __user *buf,
525 size_t count, loff_t *ppos,
526 int cap_setid,
527 struct uid_gid_map *map,
528 struct uid_gid_map *parent_map)
529{ 90{
530 struct seq_file *seq = file->private_data; 91 struct user_namespace *tmp;
531 struct user_namespace *ns = seq->private;
532 struct uid_gid_map new_map;
533 unsigned idx;
534 struct uid_gid_extent *extent, *last = NULL;
535 unsigned long page = 0;
536 char *kbuf, *pos, *next_line;
537 ssize_t ret = -EINVAL;
538 92
539 /* 93 if (likely(to == cred->user->user_ns))
540 * The id_map_mutex serializes all writes to any given map. 94 return uid;
541 *
542 * Any map is only ever written once.
543 *
544 * An id map fits within 1 cache line on most architectures.
545 *
546 * On read nothing needs to be done unless you are on an
547 * architecture with a crazy cache coherency model like alpha.
548 *
549 * There is a one time data dependency between reading the
550 * count of the extents and the values of the extents. The
551 * desired behavior is to see the values of the extents that
552 * were written before the count of the extents.
553 *
554 * To achieve this, smp_wmb() is used to guarantee the write
555 * order and smp_read_barrier_depends() guarantees that we
556 * don't have crazy architectures returning stale data.
557 *
558 */
559 mutex_lock(&id_map_mutex);
560 95
561 ret = -EPERM;
562 /* Only allow one successful write to the map */
563 if (map->nr_extents != 0)
564 goto out;
565 96
566 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID 97 /* Is cred->user the creator of the target user_ns
567 * over the user namespace in order to set the id mapping. 98 * or the creator of one of its parents?
568 */ 99 */
569 if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) 100 for ( tmp = to; tmp != &init_user_ns;
570 goto out; 101 tmp = tmp->creator->user_ns ) {
571 102 if (cred->user == tmp->creator) {
572 /* Get a buffer */ 103 return (uid_t)0;
573 ret = -ENOMEM;
574 page = __get_free_page(GFP_TEMPORARY);
575 kbuf = (char *) page;
576 if (!page)
577 goto out;
578
579 /* Only allow <= page size writes at the beginning of the file */
580 ret = -EINVAL;
581 if ((*ppos != 0) || (count >= PAGE_SIZE))
582 goto out;
583
584 /* Slurp in the user data */
585 ret = -EFAULT;
586 if (copy_from_user(kbuf, buf, count))
587 goto out;
588 kbuf[count] = '\0';
589
590 /* Parse the user data */
591 ret = -EINVAL;
592 pos = kbuf;
593 new_map.nr_extents = 0;
594 for (;pos; pos = next_line) {
595 extent = &new_map.extent[new_map.nr_extents];
596
597 /* Find the end of line and ensure I don't look past it */
598 next_line = strchr(pos, '\n');
599 if (next_line) {
600 *next_line = '\0';
601 next_line++;
602 if (*next_line == '\0')
603 next_line = NULL;
604 } 104 }
605
606 pos = skip_spaces(pos);
607 extent->first = simple_strtoul(pos, &pos, 10);
608 if (!isspace(*pos))
609 goto out;
610
611 pos = skip_spaces(pos);
612 extent->lower_first = simple_strtoul(pos, &pos, 10);
613 if (!isspace(*pos))
614 goto out;
615
616 pos = skip_spaces(pos);
617 extent->count = simple_strtoul(pos, &pos, 10);
618 if (*pos && !isspace(*pos))
619 goto out;
620
621 /* Verify there is no trailing junk on the line */
622 pos = skip_spaces(pos);
623 if (*pos != '\0')
624 goto out;
625
626 /* Verify we have been given valid starting values */
627 if ((extent->first == (u32) -1) ||
628 (extent->lower_first == (u32) -1 ))
629 goto out;
630
631 /* Verify count is not zero and does not cause the extent to wrap */
632 if ((extent->first + extent->count) <= extent->first)
633 goto out;
634 if ((extent->lower_first + extent->count) <= extent->lower_first)
635 goto out;
636
637 /* For now only accept extents that are strictly in order */
638 if (last &&
639 (((last->first + last->count) > extent->first) ||
640 ((last->lower_first + last->count) > extent->lower_first)))
641 goto out;
642
643 new_map.nr_extents++;
644 last = extent;
645
646 /* Fail if the file contains too many extents */
647 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
648 (next_line != NULL))
649 goto out;
650 } 105 }
651 /* Be very certain the new map actually exists */
652 if (new_map.nr_extents == 0)
653 goto out;
654
655 ret = -EPERM;
656 /* Validate that the user is allowed to use the user ids being mapped to. */
657 if (!new_idmap_permitted(ns, cap_setid, &new_map))
658 goto out;
659
660 /* Map the lower ids from the parent user namespace to the
661 * kernel global id space.
662 */
663 for (idx = 0; idx < new_map.nr_extents; idx++) {
664 u32 lower_first;
665 extent = &new_map.extent[idx];
666 106
667 lower_first = map_id_range_down(parent_map, 107 /* No useful relationship so no mapping */
668 extent->lower_first, 108 return overflowuid;
669 extent->count);
670
671 /* Fail if we can not map the specified extent to
672 * the kernel global id space.
673 */
674 if (lower_first == (u32) -1)
675 goto out;
676
677 extent->lower_first = lower_first;
678 }
679
680 /* Install the map */
681 memcpy(map->extent, new_map.extent,
682 new_map.nr_extents*sizeof(new_map.extent[0]));
683 smp_wmb();
684 map->nr_extents = new_map.nr_extents;
685
686 *ppos = count;
687 ret = count;
688out:
689 mutex_unlock(&id_map_mutex);
690 if (page)
691 free_page(page);
692 return ret;
693} 109}
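The ordering contract spelled out at the top of map_write() -- store the extents first, publish the extent count last, and on the read side order the count load before the extent loads -- can be expressed in portable userspace C11 as a release store paired with an acquire load. This is only an analogue for illustration: the kernel code uses smp_wmb() and smp_read_barrier_depends(), not <stdatomic.h>, and the 5-extent limit below merely mirrors UID_GID_MAP_MAX_EXTENTS.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_MAX_EXTENTS 5

struct demo_extent { uint32_t first, lower_first, count; };

static struct demo_extent extents[DEMO_MAX_EXTENTS];
static atomic_uint nr_extents;		/* zero until the map is published */

/* Writer: fill in the extents, then publish the count (release). */
static void demo_publish(const struct demo_extent *src, unsigned int n)
{
	for (unsigned int i = 0; i < n; i++)
		extents[i] = src[i];
	atomic_store_explicit(&nr_extents, n, memory_order_release);
}

/* Reader: load the count (acquire); only then read the extents. */
static uint32_t demo_lookup(uint32_t id)
{
	unsigned int n = atomic_load_explicit(&nr_extents, memory_order_acquire);

	for (unsigned int i = 0; i < n; i++) {
		uint32_t last = extents[i].first + extents[i].count - 1;

		if (id >= extents[i].first && id <= last)
			return (id - extents[i].first) + extents[i].lower_first;
	}
	return (uint32_t) -1;
}

int main(void)
{
	struct demo_extent e = { .first = 0, .lower_first = 100000, .count = 65536 };

	demo_publish(&e, 1);
	printf("%u\n", demo_lookup(0));		/* 100000 */
	return 0;
}

Release/acquire is a slightly stronger pairing than wmb/read_barrier_depends, but it conveys the same "extent values become visible no later than the count" publication order the comment describes.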
694 110
695ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 111gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
696{ 112{
697 struct seq_file *seq = file->private_data; 113 struct user_namespace *tmp;
698 struct user_namespace *ns = seq->private;
699 struct user_namespace *seq_ns = seq_user_ns(seq);
700
701 if (!ns->parent)
702 return -EPERM;
703 114
704 if ((seq_ns != ns) && (seq_ns != ns->parent)) 115 if (likely(to == cred->user->user_ns))
705 return -EPERM; 116 return gid;
706 117
707 return map_write(file, buf, size, ppos, CAP_SETUID, 118 /* Is cred->user the creator of the target user_ns
708 &ns->uid_map, &ns->parent->uid_map); 119 * or the creator of one of its parents?
709} 120 */
710 121 for ( tmp = to; tmp != &init_user_ns;
711ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 122 tmp = tmp->creator->user_ns ) {
712{ 123 if (cred->user == tmp->creator) {
713 struct seq_file *seq = file->private_data; 124 return (gid_t)0;
714 struct user_namespace *ns = seq->private;
715 struct user_namespace *seq_ns = seq_user_ns(seq);
716
717 if (!ns->parent)
718 return -EPERM;
719
720 if ((seq_ns != ns) && (seq_ns != ns->parent))
721 return -EPERM;
722
723 return map_write(file, buf, size, ppos, CAP_SETGID,
724 &ns->gid_map, &ns->parent->gid_map);
725}
726
727ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
728{
729 struct seq_file *seq = file->private_data;
730 struct user_namespace *ns = seq->private;
731 struct user_namespace *seq_ns = seq_user_ns(seq);
732
733 if (!ns->parent)
734 return -EPERM;
735
736 if ((seq_ns != ns) && (seq_ns != ns->parent))
737 return -EPERM;
738
739 /* Anyone can set any valid project id; no capability needed */
740 return map_write(file, buf, size, ppos, -1,
741 &ns->projid_map, &ns->parent->projid_map);
742}
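What a write to one of these files looks like from userspace: a single "<first> <lower_first> <count>" line, written once, before the map is set. The pid comes from argv and the ids below are placeholders; per new_idmap_permitted() below, an unprivileged writer may only map its own fsuid with a count of 1, and anything else requires CAP_SETUID (or CAP_SETGID) over the parent namespace.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64], buf[64];
	int fd, len;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/uid_map", argv[1]);
	/* map uid 0 inside the namespace to the caller's own uid outside */
	len = snprintf(buf, sizeof(buf), "0 %u 1\n", (unsigned)getuid());

	fd = open(path, O_WRONLY);
	if (fd < 0 || write(fd, buf, len) != len) {
		perror(path);
		return 1;
	}
	close(fd);
	return 0;
}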
743
744static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
745 struct uid_gid_map *new_map)
746{
747 /* Allow mapping to your own filesystem ids */
748 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
749 u32 id = new_map->extent[0].lower_first;
750 if (cap_setid == CAP_SETUID) {
751 kuid_t uid = make_kuid(ns->parent, id);
752 if (uid_eq(uid, current_fsuid()))
753 return true;
754 }
755 else if (cap_setid == CAP_SETGID) {
756 kgid_t gid = make_kgid(ns->parent, id);
757 if (gid_eq(gid, current_fsgid()))
758 return true;
759 } 125 }
760 } 126 }
761 127
762 /* Allow anyone to set a mapping that doesn't require privilege */ 128 /* No useful relationship so no mapping */
763 if (!cap_valid(cap_setid)) 129 return overflowgid;
764 return true;
765
766 /* Allow the specified ids if we have the appropriate capability
767 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
768 */
769 if (ns_capable(ns->parent, cap_setid))
770 return true;
771
772 return false;
773} 130}
774 131
775static void *userns_get(struct task_struct *task)
776{
777 struct user_namespace *user_ns;
778
779 rcu_read_lock();
780 user_ns = get_user_ns(__task_cred(task)->user_ns);
781 rcu_read_unlock();
782
783 return user_ns;
784}
785
786static void userns_put(void *ns)
787{
788 put_user_ns(ns);
789}
790
791static int userns_install(struct nsproxy *nsproxy, void *ns)
792{
793 struct user_namespace *user_ns = ns;
794 struct cred *cred;
795
796 /* Don't allow gaining capabilities by reentering
797 * the same user namespace.
798 */
799 if (user_ns == current_user_ns())
800 return -EINVAL;
801
802 /* Threaded processes may not enter a different user namespace */
803 if (atomic_read(&current->mm->mm_users) > 1)
804 return -EINVAL;
805
806 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
807 return -EPERM;
808
809 cred = prepare_creds();
810 if (!cred)
811 return -ENOMEM;
812
813 put_user_ns(cred->user_ns);
814 set_cred_user_ns(cred, get_user_ns(user_ns));
815
816 return commit_creds(cred);
817}
818
819static unsigned int userns_inum(void *ns)
820{
821 struct user_namespace *user_ns = ns;
822 return user_ns->proc_inum;
823}
824
825const struct proc_ns_operations userns_operations = {
826 .name = "user",
827 .type = CLONE_NEWUSER,
828 .get = userns_get,
829 .put = userns_put,
830 .install = userns_install,
831 .inum = userns_inum,
832};
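These proc_ns_operations back setns(2) on a file descriptor opened from /proc/<pid>/ns/user; userns_install() above runs on that path and enforces the restrictions (no re-entering the current namespace, single-threaded caller, CAP_SYS_ADMIN in the target namespace). A minimal userspace sketch of joining another process's user namespace; the pid is a placeholder and the call fails unless those checks pass.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/ns/user", argv[1]);

	fd = open(path, O_RDONLY);
	if (fd < 0 || setns(fd, CLONE_NEWUSER) < 0) {
		perror("setns");
		return 1;
	}
	close(fd);
	/* from here on we hold the credentials granted by the target namespace */
	return 0;
}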
833
834static __init int user_namespaces_init(void) 132static __init int user_namespaces_init(void)
835{ 133{
836 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); 134 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 08b197e8c48..bff131b9510 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -9,7 +9,7 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/export.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
@@ -32,25 +32,18 @@ static struct uts_namespace *create_uts_ns(void)
32 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
33 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return NULL on error (failure to kmalloc), new ns otherwise
34 */ 34 */
35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, 35static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
36 struct uts_namespace *old_ns) 36 struct uts_namespace *old_ns)
37{ 37{
38 struct uts_namespace *ns; 38 struct uts_namespace *ns;
39 int err;
40 39
41 ns = create_uts_ns(); 40 ns = create_uts_ns();
42 if (!ns) 41 if (!ns)
43 return ERR_PTR(-ENOMEM); 42 return ERR_PTR(-ENOMEM);
44 43
45 err = proc_alloc_inum(&ns->proc_inum);
46 if (err) {
47 kfree(ns);
48 return ERR_PTR(err);
49 }
50
51 down_read(&uts_sem); 44 down_read(&uts_sem);
52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
53 ns->user_ns = get_user_ns(user_ns); 46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
54 up_read(&uts_sem); 47 up_read(&uts_sem);
55 return ns; 48 return ns;
56} 49}
@@ -62,8 +55,9 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
62 * versa. 55 * versa.
63 */ 56 */
64struct uts_namespace *copy_utsname(unsigned long flags, 57struct uts_namespace *copy_utsname(unsigned long flags,
65 struct user_namespace *user_ns, struct uts_namespace *old_ns) 58 struct task_struct *tsk)
66{ 59{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
67 struct uts_namespace *new_ns; 61 struct uts_namespace *new_ns;
68 62
69 BUG_ON(!old_ns); 63 BUG_ON(!old_ns);
@@ -72,7 +66,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
72 if (!(flags & CLONE_NEWUTS)) 66 if (!(flags & CLONE_NEWUTS))
73 return old_ns; 67 return old_ns;
74 68
75 new_ns = clone_uts_ns(user_ns, old_ns); 69 new_ns = clone_uts_ns(tsk, old_ns);
76 70
77 put_uts_ns(old_ns); 71 put_uts_ns(old_ns);
78 return new_ns; 72 return new_ns;
@@ -84,7 +78,6 @@ void free_uts_ns(struct kref *kref)
84 78
85 ns = container_of(kref, struct uts_namespace, kref); 79 ns = container_of(kref, struct uts_namespace, kref);
86 put_user_ns(ns->user_ns); 80 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum);
88 kfree(ns); 81 kfree(ns);
89} 82}
90 83
@@ -109,32 +102,19 @@ static void utsns_put(void *ns)
109 put_uts_ns(ns); 102 put_uts_ns(ns);
110} 103}
111 104
112static int utsns_install(struct nsproxy *nsproxy, void *new) 105static int utsns_install(struct nsproxy *nsproxy, void *ns)
113{ 106{
114 struct uts_namespace *ns = new;
115
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !nsown_capable(CAP_SYS_ADMIN))
118 return -EPERM;
119
120 get_uts_ns(ns); 107 get_uts_ns(ns);
121 put_uts_ns(nsproxy->uts_ns); 108 put_uts_ns(nsproxy->uts_ns);
122 nsproxy->uts_ns = ns; 109 nsproxy->uts_ns = ns;
123 return 0; 110 return 0;
124} 111}
125 112
126static unsigned int utsns_inum(void *vp)
127{
128 struct uts_namespace *ns = vp;
129
130 return ns->proc_inum;
131}
132
133const struct proc_ns_operations utsns_operations = { 113const struct proc_ns_operations utsns_operations = {
134 .name = "uts", 114 .name = "uts",
135 .type = CLONE_NEWUTS, 115 .type = CLONE_NEWUTS,
136 .get = utsns_get, 116 .get = utsns_get,
137 .put = utsns_put, 117 .put = utsns_put,
138 .install = utsns_install, 118 .install = utsns_install,
139 .inum = utsns_inum,
140}; 119};
120
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 63da38c2d82..a2cd77e70d4 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -9,11 +9,10 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/export.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/wait.h>
17 16
18static void *get_uts(ctl_table *table, int write) 17static void *get_uts(ctl_table *table, int write)
19{ 18{
@@ -52,19 +51,12 @@ static int proc_do_uts_string(ctl_table *table, int write,
52 uts_table.data = get_uts(table, write); 51 uts_table.data = get_uts(table, write);
53 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
54 put_uts(table, write, uts_table.data); 53 put_uts(table, write, uts_table.data);
55
56 if (write)
57 proc_sys_poll_notify(table->poll);
58
59 return r; 54 return r;
60} 55}
61#else 56#else
62#define proc_do_uts_string NULL 57#define proc_do_uts_string NULL
63#endif 58#endif
64 59
65static DEFINE_CTL_TABLE_POLL(hostname_poll);
66static DEFINE_CTL_TABLE_POLL(domainname_poll);
67
68static struct ctl_table uts_kern_table[] = { 60static struct ctl_table uts_kern_table[] = {
69 { 61 {
70 .procname = "ostype", 62 .procname = "ostype",
@@ -93,7 +85,6 @@ static struct ctl_table uts_kern_table[] = {
93 .maxlen = sizeof(init_uts_ns.name.nodename), 85 .maxlen = sizeof(init_uts_ns.name.nodename),
94 .mode = 0644, 86 .mode = 0644,
95 .proc_handler = proc_do_uts_string, 87 .proc_handler = proc_do_uts_string,
96 .poll = &hostname_poll,
97 }, 88 },
98 { 89 {
99 .procname = "domainname", 90 .procname = "domainname",
@@ -101,7 +92,6 @@ static struct ctl_table uts_kern_table[] = {
101 .maxlen = sizeof(init_uts_ns.name.domainname), 92 .maxlen = sizeof(init_uts_ns.name.domainname),
102 .mode = 0644, 93 .mode = 0644,
103 .proc_handler = proc_do_uts_string, 94 .proc_handler = proc_do_uts_string,
104 .poll = &domainname_poll,
105 }, 95 },
106 {} 96 {}
107}; 97};
@@ -115,19 +105,6 @@ static struct ctl_table uts_root_table[] = {
115 {} 105 {}
116}; 106};
117 107
118#ifdef CONFIG_PROC_SYSCTL
119/*
120 * Notify userspace about a change in a certain entry of uts_kern_table,
121 * identified by the parameter proc.
122 */
123void uts_proc_notify(enum uts_proc proc)
124{
125 struct ctl_table *table = &uts_kern_table[proc];
126
127 proc_sys_poll_notify(table->poll);
128}
129#endif
130
131static int __init utsname_sysctl_init(void) 108static int __init utsname_sysctl_init(void)
132{ 109{
133 register_sysctl_table(uts_root_table); 110 register_sysctl_table(uts_root_table);
diff --git a/kernel/wait.c b/kernel/wait.c
index 6698e0c04ea..f45ea8d2a1c 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -1,19 +1,19 @@
1/* 1/*
2 * Generic waiting primitives. 2 * Generic waiting primitives.
3 * 3 *
4 * (C) 2004 Nadia Yvette Chambers, Oracle 4 * (C) 2004 William Irwin, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/export.h> 7#include <linux/module.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) 13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class_and_name(&q->lock, key, name); 16 lockdep_set_class(&q->lock, key);
17 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
18} 18}
19 19
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 75a2ab3d0b0..36491cd5b7d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -3,14 +3,15 @@
3 * 3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. 4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 * 5 *
6 * Note: Most of this code is borrowed heavily from the original softlockup 6 * this code detects hard lockups: incidents where on a CPU
7 * detector, so thanks to Ingo for the initial implementation. 7 * the kernel does not respond to anything except NMI.
8 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks 8 *
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
9 * to those contributors as well. 12 * to those contributors as well.
10 */ 13 */
11 14
12#define pr_fmt(fmt) "NMI watchdog: " fmt
13
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/cpu.h> 16#include <linux/cpu.h>
16#include <linux/nmi.h> 17#include <linux/nmi.h>
@@ -22,27 +23,22 @@
22#include <linux/notifier.h> 23#include <linux/notifier.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/sysctl.h> 25#include <linux/sysctl.h>
25#include <linux/smpboot.h>
26 26
27#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
28#include <linux/kvm_para.h>
29#include <linux/perf_event.h> 28#include <linux/perf_event.h>
30 29
31int watchdog_enabled = 1; 30int watchdog_enabled = 1;
32int __read_mostly watchdog_thresh = 10; 31int __read_mostly watchdog_thresh = 10;
33static int __read_mostly watchdog_disabled;
34static u64 __read_mostly sample_period;
35 32
36static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
37static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
38static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); 35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
39static DEFINE_PER_CPU(bool, softlockup_touch_sync); 36static DEFINE_PER_CPU(bool, softlockup_touch_sync);
40static DEFINE_PER_CPU(bool, soft_watchdog_warn); 37static DEFINE_PER_CPU(bool, soft_watchdog_warn);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
43#ifdef CONFIG_HARDLOCKUP_DETECTOR 38#ifdef CONFIG_HARDLOCKUP_DETECTOR
44static DEFINE_PER_CPU(bool, hard_watchdog_warn); 39static DEFINE_PER_CPU(bool, hard_watchdog_warn);
45static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 40static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
46static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
47static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
48#endif 44#endif
@@ -117,16 +113,15 @@ static unsigned long get_timestamp(int this_cpu)
117 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 113 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
118} 114}
119 115
120static void set_sample_period(void) 116static unsigned long get_sample_period(void)
121{ 117{
122 /* 118 /*
123 * convert watchdog_thresh from seconds to ns 119 * convert watchdog_thresh from seconds to ns
124 * the divide by 5 is to give hrtimer several chances (two 120 * the divide by 5 is to give hrtimer 5 chances to
125 * or three with the current relation between the soft 121 * increment before the hardlockup detector generates
126 * and hard thresholds) to increment before the 122 * a warning
127 * hardlockup detector generates a warning
128 */ 123 */
129 sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
130} 125}
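Worked numbers for the period above, assuming the default watchdog_thresh of 10 seconds and get_softlockup_thresh() returning 2 * watchdog_thresh (as it does in kernels of this vintage): 20 * (10^9 / 5) ns = 4 * 10^9 ns, i.e. the 4-second sample period mentioned in the watchdog-thread comment later in this hunk. A quick userspace check of the arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long long nsec_per_sec = 1000000000ULL;
	int watchdog_thresh = 10;	/* default threshold, seconds */

	/* assumes get_softlockup_thresh() == 2 * watchdog_thresh */
	unsigned long long sample_ns = 2ULL * watchdog_thresh * (nsec_per_sec / 5);

	printf("sample period: %llu ns (%.1f s)\n", sample_ns, sample_ns / 1e9);
	return 0;
}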
131 126
132/* Commands for resetting the watchdog */ 127/* Commands for resetting the watchdog */
@@ -252,15 +247,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
252 __this_cpu_write(hard_watchdog_warn, false); 247 __this_cpu_write(hard_watchdog_warn, false);
253 return; 248 return;
254} 249}
255#endif /* CONFIG_HARDLOCKUP_DETECTOR */
256
257static void watchdog_interrupt_count(void) 250static void watchdog_interrupt_count(void)
258{ 251{
259 __this_cpu_inc(hrtimer_interrupts); 252 __this_cpu_inc(hrtimer_interrupts);
260} 253}
261 254#else
262static int watchdog_nmi_enable(unsigned int cpu); 255static inline void watchdog_interrupt_count(void) { return; }
263static void watchdog_nmi_disable(unsigned int cpu); 256#endif /* CONFIG_HARDLOCKUP_DETECTOR */
264 257
265/* watchdog kicker functions */ 258/* watchdog kicker functions */
266static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 259static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
@@ -276,7 +269,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
276 wake_up_process(__this_cpu_read(softlockup_watchdog)); 269 wake_up_process(__this_cpu_read(softlockup_watchdog));
277 270
278 /* .. and repeat */ 271 /* .. and repeat */
279 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); 272 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
280 273
281 if (touch_ts == 0) { 274 if (touch_ts == 0) {
282 if (unlikely(__this_cpu_read(softlockup_touch_sync))) { 275 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -287,9 +280,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
287 __this_cpu_write(softlockup_touch_sync, false); 280 __this_cpu_write(softlockup_touch_sync, false);
288 sched_clock_tick(); 281 sched_clock_tick();
289 } 282 }
290
291 /* Clear the guest paused flag on watchdog reset */
292 kvm_check_and_clear_guest_paused();
293 __touch_watchdog(); 283 __touch_watchdog();
294 return HRTIMER_RESTART; 284 return HRTIMER_RESTART;
295 } 285 }
@@ -302,19 +292,11 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
302 */ 292 */
303 duration = is_softlockup(touch_ts); 293 duration = is_softlockup(touch_ts);
304 if (unlikely(duration)) { 294 if (unlikely(duration)) {
305 /*
306 * If a virtual machine is stopped by the host it can look to
307 * the watchdog like a soft lockup, check to see if the host
308 * stopped the vm before we issue the warning
309 */
310 if (kvm_check_and_clear_guest_paused())
311 return HRTIMER_RESTART;
312
313 /* only warn once */ 295 /* only warn once */
314 if (__this_cpu_read(soft_watchdog_warn) == true) 296 if (__this_cpu_read(soft_watchdog_warn) == true)
315 return HRTIMER_RESTART; 297 return HRTIMER_RESTART;
316 298
317 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 299 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
318 smp_processor_id(), duration, 300 smp_processor_id(), duration,
319 current->comm, task_pid_nr(current)); 301 current->comm, task_pid_nr(current));
320 print_modules(); 302 print_modules();
@@ -333,78 +315,48 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
333 return HRTIMER_RESTART; 315 return HRTIMER_RESTART;
334} 316}
335 317
336static void watchdog_set_prio(unsigned int policy, unsigned int prio)
337{
338 struct sched_param param = { .sched_priority = prio };
339
340 sched_setscheduler(current, policy, &param);
341}
342 318
343static void watchdog_enable(unsigned int cpu) 319/*
320 * The watchdog thread - touches the timestamp.
321 */
322static int watchdog(void *unused)
344{ 323{
324 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
345 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
346 326
347 /* kick off the timer for the hardlockup detector */ 327 sched_setscheduler(current, SCHED_FIFO, &param);
348 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
349 hrtimer->function = watchdog_timer_fn;
350
351 if (!watchdog_enabled) {
352 kthread_park(current);
353 return;
354 }
355 328
356 /* Enable the perf event */ 329 /* initialize timestamp */
357 watchdog_nmi_enable(cpu); 330 __touch_watchdog();
358 331
332 /* kick off the timer for the hardlockup detector */
359 /* done here because hrtimer_start can only pin to smp_processor_id() */ 333 /* done here because hrtimer_start can only pin to smp_processor_id() */
360 hrtimer_start(hrtimer, ns_to_ktime(sample_period), 334 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
361 HRTIMER_MODE_REL_PINNED); 335 HRTIMER_MODE_REL_PINNED);
362 336
363 /* initialize timestamp */ 337 set_current_state(TASK_INTERRUPTIBLE);
364 watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); 338 /*
365 __touch_watchdog(); 339 * Run briefly once per second to reset the softlockup timestamp.
366} 340 * If this gets delayed for more than 60 seconds then the
341 * debug-printout triggers in watchdog_timer_fn().
342 */
343 while (!kthread_should_stop()) {
344 __touch_watchdog();
345 schedule();
367 346
368static void watchdog_disable(unsigned int cpu) 347 if (kthread_should_stop())
369{ 348 break;
370 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
371 349
372 watchdog_set_prio(SCHED_NORMAL, 0); 350 set_current_state(TASK_INTERRUPTIBLE);
373 hrtimer_cancel(hrtimer); 351 }
374 /* disable the perf event */ 352 __set_current_state(TASK_RUNNING);
375 watchdog_nmi_disable(cpu);
376}
377 353
378static int watchdog_should_run(unsigned int cpu) 354 return 0;
379{
380 return __this_cpu_read(hrtimer_interrupts) !=
381 __this_cpu_read(soft_lockup_hrtimer_cnt);
382} 355}
383 356
384/*
385 * The watchdog thread function - touches the timestamp.
386 *
387 * It only runs once every sample_period seconds (4 seconds by
388 * default) to reset the softlockup timestamp. If this gets delayed
389 * for more than 2*watchdog_thresh seconds then the debug-printout
390 * triggers in watchdog_timer_fn().
391 */
392static void watchdog(unsigned int cpu)
393{
394 __this_cpu_write(soft_lockup_hrtimer_cnt,
395 __this_cpu_read(hrtimer_interrupts));
396 __touch_watchdog();
397}
398 357
399#ifdef CONFIG_HARDLOCKUP_DETECTOR 358#ifdef CONFIG_HARDLOCKUP_DETECTOR
400/* 359static int watchdog_nmi_enable(int cpu)
401 * People like the simple clean cpu node info on boot.
402 * Reduce the watchdog noise by only printing messages
403 * that are different from what cpu0 displayed.
404 */
405static unsigned long cpu0_err;
406
407static int watchdog_nmi_enable(unsigned int cpu)
408{ 360{
409 struct perf_event_attr *wd_attr; 361 struct perf_event_attr *wd_attr;
410 struct perf_event *event = per_cpu(watchdog_ev, cpu); 362 struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -422,31 +374,19 @@ static int watchdog_nmi_enable(unsigned int cpu)
422 374
423 /* Try to register using hardware perf events */ 375 /* Try to register using hardware perf events */
424 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 376 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
425
426 /* save cpu0 error for future comparison */
427 if (cpu == 0 && IS_ERR(event))
428 cpu0_err = PTR_ERR(event);
429
430 if (!IS_ERR(event)) { 377 if (!IS_ERR(event)) {
431 /* only print for cpu0 or different than cpu0 */ 378 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
432 if (cpu == 0 || cpu0_err)
433 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
434 goto out_save; 379 goto out_save;
435 } 380 }
436 381
437 /* skip displaying the same error again */
438 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
439 return PTR_ERR(event);
440 382
441 /* vary the KERN level based on the returned errno */ 383 /* vary the KERN level based on the returned errno */
442 if (PTR_ERR(event) == -EOPNOTSUPP) 384 if (PTR_ERR(event) == -EOPNOTSUPP)
443 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); 385 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
444 else if (PTR_ERR(event) == -ENOENT) 386 else if (PTR_ERR(event) == -ENOENT)
445 pr_warning("disabled (cpu%i): hardware events not enabled\n", 387 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
446 cpu);
447 else 388 else
448 pr_err("disabled (cpu%i): unable to create perf event: %ld\n", 389 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
449 cpu, PTR_ERR(event));
450 return PTR_ERR(event); 390 return PTR_ERR(event);
451 391
452 /* success path */ 392 /* success path */
@@ -458,7 +398,7 @@ out:
458 return 0; 398 return 0;
459} 399}
460 400
461static void watchdog_nmi_disable(unsigned int cpu) 401static void watchdog_nmi_disable(int cpu)
462{ 402{
463 struct perf_event *event = per_cpu(watchdog_ev, cpu); 403 struct perf_event *event = per_cpu(watchdog_ev, cpu);
464 404
@@ -472,35 +412,105 @@ static void watchdog_nmi_disable(unsigned int cpu)
472 return; 412 return;
473} 413}
474#else 414#else
475static int watchdog_nmi_enable(unsigned int cpu) { return 0; } 415static int watchdog_nmi_enable(int cpu) { return 0; }
476static void watchdog_nmi_disable(unsigned int cpu) { return; } 416static void watchdog_nmi_disable(int cpu) { return; }
477#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 417#endif /* CONFIG_HARDLOCKUP_DETECTOR */
478 418
479/* prepare/enable/disable routines */ 419/* prepare/enable/disable routines */
480/* sysctl functions */ 420static void watchdog_prepare_cpu(int cpu)
481#ifdef CONFIG_SYSCTL
482static void watchdog_enable_all_cpus(void)
483{ 421{
484 unsigned int cpu; 422 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
485 423
486 if (watchdog_disabled) { 424 WARN_ON(per_cpu(softlockup_watchdog, cpu));
487 watchdog_disabled = 0; 425 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
488 for_each_online_cpu(cpu) 426 hrtimer->function = watchdog_timer_fn;
489 kthread_unpark(per_cpu(softlockup_watchdog, cpu)); 427}
428
429static int watchdog_enable(int cpu)
430{
431 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
432 int err = 0;
433
434 /* enable the perf event */
435 err = watchdog_nmi_enable(cpu);
436
437 /* Regardless of err above, fall through and start softlockup */
438
439 /* create the watchdog thread */
440 if (!p) {
441 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
442 if (IS_ERR(p)) {
443 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
444 if (!err) {
445 /* if hardlockup hasn't already set this */
446 err = PTR_ERR(p);
447 /* and disable the perf event */
448 watchdog_nmi_disable(cpu);
449 }
450 goto out;
451 }
452 kthread_bind(p, cpu);
453 per_cpu(watchdog_touch_ts, cpu) = 0;
454 per_cpu(softlockup_watchdog, cpu) = p;
455 wake_up_process(p);
490 } 456 }
457
458out:
459 return err;
491} 460}
492 461
493static void watchdog_disable_all_cpus(void) 462static void watchdog_disable(int cpu)
494{ 463{
495 unsigned int cpu; 464 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
465 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
466
467 /*
468 * cancel the timer first to stop incrementing the stats
469 * and waking up the kthread
470 */
471 hrtimer_cancel(hrtimer);
472
473 /* disable the perf event */
474 watchdog_nmi_disable(cpu);
496 475
497 if (!watchdog_disabled) { 476 /* stop the watchdog thread */
498 watchdog_disabled = 1; 477 if (p) {
499 for_each_online_cpu(cpu) 478 per_cpu(softlockup_watchdog, cpu) = NULL;
500 kthread_park(per_cpu(softlockup_watchdog, cpu)); 479 kthread_stop(p);
501 } 480 }
502} 481}
503 482
483static void watchdog_enable_all_cpus(void)
484{
485 int cpu;
486
487 watchdog_enabled = 0;
488
489 for_each_online_cpu(cpu)
490 if (!watchdog_enable(cpu))
491 /* if any cpu succeeds, watchdog is considered
492 enabled for the system */
493 watchdog_enabled = 1;
494
495 if (!watchdog_enabled)
496 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
497
498}
499
500static void watchdog_disable_all_cpus(void)
501{
502 int cpu;
503
504 for_each_online_cpu(cpu)
505 watchdog_disable(cpu);
506
507 /* if all watchdogs are disabled, then they are disabled for the system */
508 watchdog_enabled = 0;
509}
510
511
512/* sysctl functions */
513#ifdef CONFIG_SYSCTL
504/* 514/*
505 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh 515 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
506 */ 516 */
@@ -510,38 +520,73 @@ int proc_dowatchdog(struct ctl_table *table, int write,
510{ 520{
511 int ret; 521 int ret;
512 522
513 if (watchdog_disabled < 0)
514 return -ENODEV;
515
516 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 523 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
517 if (ret || !write) 524 if (ret || !write)
518 return ret; 525 goto out;
519 526
520 set_sample_period();
521 if (watchdog_enabled && watchdog_thresh) 527 if (watchdog_enabled && watchdog_thresh)
522 watchdog_enable_all_cpus(); 528 watchdog_enable_all_cpus();
523 else 529 else
524 watchdog_disable_all_cpus(); 530 watchdog_disable_all_cpus();
525 531
532out:
526 return ret; 533 return ret;
527} 534}
528#endif /* CONFIG_SYSCTL */ 535#endif /* CONFIG_SYSCTL */
529 536
530static struct smp_hotplug_thread watchdog_threads = { 537
531 .store = &softlockup_watchdog, 538/*
532 .thread_should_run = watchdog_should_run, 539 * Create/destroy watchdog threads as CPUs come and go:
533 .thread_fn = watchdog, 540 */
534 .thread_comm = "watchdog/%u", 541static int __cpuinit
535 .setup = watchdog_enable, 542cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
536 .park = watchdog_disable, 543{
537 .unpark = watchdog_enable, 544 int hotcpu = (unsigned long)hcpu;
545
546 switch (action) {
547 case CPU_UP_PREPARE:
548 case CPU_UP_PREPARE_FROZEN:
549 watchdog_prepare_cpu(hotcpu);
550 break;
551 case CPU_ONLINE:
552 case CPU_ONLINE_FROZEN:
553 if (watchdog_enabled)
554 watchdog_enable(hotcpu);
555 break;
556#ifdef CONFIG_HOTPLUG_CPU
557 case CPU_UP_CANCELED:
558 case CPU_UP_CANCELED_FROZEN:
559 watchdog_disable(hotcpu);
560 break;
561 case CPU_DEAD:
562 case CPU_DEAD_FROZEN:
563 watchdog_disable(hotcpu);
564 break;
565#endif /* CONFIG_HOTPLUG_CPU */
566 }
567
568 /*
569 * hardlockup and softlockup are not important enough
570 * to block cpu bring up. Just always succeed and
571 * rely on printk output to flag problems.
572 */
573 return NOTIFY_OK;
574}
575
576static struct notifier_block __cpuinitdata cpu_nfb = {
577 .notifier_call = cpu_callback
538}; 578};
539 579
540void __init lockup_detector_init(void) 580void __init lockup_detector_init(void)
541{ 581{
542 set_sample_period(); 582 void *cpu = (void *)(long)smp_processor_id();
543 if (smpboot_register_percpu_thread(&watchdog_threads)) { 583 int err;
544 pr_err("Failed to create watchdog threads, disabled\n"); 584
545 watchdog_disabled = -ENODEV; 585 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
546 } 586 WARN_ON(notifier_to_errno(err));
587
588 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
589 register_cpu_notifier(&cpu_nfb);
590
591 return;
547} 592}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fbc6576a83c..1783aabc612 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -23,7 +23,7 @@
23 * Please read Documentation/workqueue.txt for details. 23 * Please read Documentation/workqueue.txt for details.
24 */ 24 */
25 25
26#include <linux/export.h> 26#include <linux/module.h>
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/init.h> 29#include <linux/init.h>
@@ -45,41 +45,32 @@
45#include "workqueue_sched.h" 45#include "workqueue_sched.h"
46 46
47enum { 47enum {
48 /* 48 /* global_cwq flags */
49 * global_cwq flags 49 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
50 * 50 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
51 * A bound gcwq is either associated or disassociated with its CPU. 51 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
52 * While associated (!DISASSOCIATED), all workers are bound to the 52 GCWQ_FREEZING = 1 << 3, /* freeze in progress */
53 * CPU and none has %WORKER_UNBOUND set and concurrency management 53 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
54 * is in effect.
55 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one.
59 *
60 * Note that DISASSOCIATED can be flipped only while holding
61 * assoc_mutex of all pools on the gcwq to avoid changing binding
62 * state while create_worker() is in progress.
63 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
69 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
70 54
71 /* worker flags */ 55 /* worker flags */
72 WORKER_STARTED = 1 << 0, /* started */ 56 WORKER_STARTED = 1 << 0, /* started */
73 WORKER_DIE = 1 << 1, /* die die die */ 57 WORKER_DIE = 1 << 1, /* die die die */
74 WORKER_IDLE = 1 << 2, /* is idle */ 58 WORKER_IDLE = 1 << 2, /* is idle */
75 WORKER_PREP = 1 << 3, /* preparing to run works */ 59 WORKER_PREP = 1 << 3, /* preparing to run works */
60 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
61 WORKER_REBIND = 1 << 5, /* mom is home, come back */
76 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 62 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
77 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 63 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
78 64
79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | 65 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
80 WORKER_CPU_INTENSIVE, 66 WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
81 67
82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ 68 /* gcwq->trustee_state */
69 TRUSTEE_START = 0, /* start */
70 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
71 TRUSTEE_BUTCHER = 2, /* butcher workers */
72 TRUSTEE_RELEASE = 3, /* release workers */
73 TRUSTEE_DONE = 4, /* trustee is done */
83 74
84 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 75 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
85 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, 76 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
@@ -93,13 +84,13 @@ enum {
93 (min two ticks) */ 84 (min two ticks) */
94 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
95 CREATE_COOLDOWN = HZ, /* time to breathe after fail */ 86 CREATE_COOLDOWN = HZ, /* time to breathe after fail */
87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
96 88
97 /* 89 /*
99 * Rescue workers are used only in emergencies and shared by 91 * Rescue workers are used only in emergencies and shared by
99 * all cpus. Give -20. 91 * all cpus. Give -20.
100 */ 92 */
101 RESCUER_NICE_LEVEL = -20, 93 RESCUER_NICE_LEVEL = -20,
102 HIGHPRI_NICE_LEVEL = -20,
103}; 94};
104 95
105/* 96/*
@@ -124,7 +115,6 @@ enum {
124 */ 115 */
125 116
126struct global_cwq; 117struct global_cwq;
127struct worker_pool;
128 118
129/* 119/*
130 * The poor guys doing the actual heavy lifting. All on-duty workers 120 * The poor guys doing the actual heavy lifting. All on-duty workers
@@ -141,32 +131,12 @@ struct worker {
141 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ 131 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
142 struct list_head scheduled; /* L: scheduled works */ 132 struct list_head scheduled; /* L: scheduled works */
143 struct task_struct *task; /* I: worker task */ 133 struct task_struct *task; /* I: worker task */
144 struct worker_pool *pool; /* I: the associated pool */ 134 struct global_cwq *gcwq; /* I: the associated gcwq */
145 /* 64 bytes boundary on 64bit, 32 on 32bit */ 135 /* 64 bytes boundary on 64bit, 32 on 32bit */
146 unsigned long last_active; /* L: last active timestamp */ 136 unsigned long last_active; /* L: last active timestamp */
147 unsigned int flags; /* X: flags */ 137 unsigned int flags; /* X: flags */
148 int id; /* I: worker id */ 138 int id; /* I: worker id */
149 139 struct work_struct rebind_work; /* L: rebind worker to cpu */
150 /* for rebinding worker to CPU */
151 struct work_struct rebind_work; /* L: for busy worker */
152};
153
154struct worker_pool {
155 struct global_cwq *gcwq; /* I: the owning gcwq */
156 unsigned int flags; /* X: flags */
157
158 struct list_head worklist; /* L: list of pending works */
159 int nr_workers; /* L: total number of workers */
160
161 /* nr_idle includes the ones off idle_list for rebinding */
162 int nr_idle; /* L: currently idle ones */
163
164 struct list_head idle_list; /* X: list of idle workers */
165 struct timer_list idle_timer; /* L: worker idle timeout */
166 struct timer_list mayday_timer; /* L: SOS timer for workers */
167
168 struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */
169 struct ida worker_ida; /* L: for worker IDs */
170}; 140};
171 141
172/* 142/*
@@ -176,15 +146,27 @@ struct worker_pool {
176 */ 146 */
177struct global_cwq { 147struct global_cwq {
178 spinlock_t lock; /* the gcwq lock */ 148 spinlock_t lock; /* the gcwq lock */
149 struct list_head worklist; /* L: list of pending works */
179 unsigned int cpu; /* I: the associated cpu */ 150 unsigned int cpu; /* I: the associated cpu */
180 unsigned int flags; /* L: GCWQ_* flags */ 151 unsigned int flags; /* L: GCWQ_* flags */
181 152
182 /* workers are chained either in busy_hash or pool idle_list */ 153 int nr_workers; /* L: total number of workers */
154 int nr_idle; /* L: currently idle ones */
155
156 /* workers are chained either in the idle_list or busy_hash */
157 struct list_head idle_list; /* X: list of idle workers */
183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; 158 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
184 /* L: hash of busy workers */ 159 /* L: hash of busy workers */
185 160
186 struct worker_pool pools[NR_WORKER_POOLS]; 161 struct timer_list idle_timer; /* L: worker idle timeout */
187 /* normal and highpri pools */ 162 struct timer_list mayday_timer; /* L: SOS timer for workers */
163
164 struct ida worker_ida; /* L: for worker IDs */
165
166 struct task_struct *trustee; /* L: for gcwq shutdown */
167 unsigned int trustee_state; /* L: trustee state */
168 wait_queue_head_t trustee_wait; /* trustee wait */
169 struct worker *first_idle; /* L: first idle worker */
188} ____cacheline_aligned_in_smp; 170} ____cacheline_aligned_in_smp;
189 171
190/* 172/*
@@ -193,7 +175,7 @@ struct global_cwq {
193 * aligned at two's power of the number of flag bits. 175 * aligned at two's power of the number of flag bits.
194 */ 176 */
195struct cpu_workqueue_struct { 177struct cpu_workqueue_struct {
196 struct worker_pool *pool; /* I: the associated pool */ 178 struct global_cwq *gcwq; /* I: the associated gcwq */
197 struct workqueue_struct *wq; /* I: the owning workqueue */ 179 struct workqueue_struct *wq; /* I: the owning workqueue */
198 int work_color; /* L: current color */ 180 int work_color; /* L: current color */
199 int flush_color; /* L: flushing color */ 181 int flush_color; /* L: flushing color */
@@ -260,30 +242,26 @@ struct workqueue_struct {
260 242
261 int nr_drainers; /* W: drain in progress */ 243 int nr_drainers; /* W: drain in progress */
262 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
245 const char *name; /* I: workqueue name */
263#ifdef CONFIG_LOCKDEP 246#ifdef CONFIG_LOCKDEP
264 struct lockdep_map lockdep_map; 247 struct lockdep_map lockdep_map;
265#endif 248#endif
266 char name[]; /* I: workqueue name */
267}; 249};
268 250
269struct workqueue_struct *system_wq __read_mostly; 251struct workqueue_struct *system_wq __read_mostly;
270EXPORT_SYMBOL_GPL(system_wq);
271struct workqueue_struct *system_highpri_wq __read_mostly;
272EXPORT_SYMBOL_GPL(system_highpri_wq);
273struct workqueue_struct *system_long_wq __read_mostly; 252struct workqueue_struct *system_long_wq __read_mostly;
274EXPORT_SYMBOL_GPL(system_long_wq); 253struct workqueue_struct *system_nrt_wq __read_mostly;
275struct workqueue_struct *system_unbound_wq __read_mostly; 254struct workqueue_struct *system_unbound_wq __read_mostly;
276EXPORT_SYMBOL_GPL(system_unbound_wq);
277struct workqueue_struct *system_freezable_wq __read_mostly; 255struct workqueue_struct *system_freezable_wq __read_mostly;
256EXPORT_SYMBOL_GPL(system_wq);
257EXPORT_SYMBOL_GPL(system_long_wq);
258EXPORT_SYMBOL_GPL(system_nrt_wq);
259EXPORT_SYMBOL_GPL(system_unbound_wq);
278EXPORT_SYMBOL_GPL(system_freezable_wq); 260EXPORT_SYMBOL_GPL(system_freezable_wq);
279 261
280#define CREATE_TRACE_POINTS 262#define CREATE_TRACE_POINTS
281#include <trace/events/workqueue.h> 263#include <trace/events/workqueue.h>
282 264
283#define for_each_worker_pool(pool, gcwq) \
284 for ((pool) = &(gcwq)->pools[0]; \
285 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
286
287#define for_each_busy_worker(worker, i, pos, gcwq) \ 265#define for_each_busy_worker(worker, i, pos, gcwq) \
288 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 266 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
289 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) 267 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -464,7 +442,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */
464 * try_to_wake_up(). Put it in a separate cacheline. 442 * try_to_wake_up(). Put it in a separate cacheline.
465 */ 443 */
466static DEFINE_PER_CPU(struct global_cwq, global_cwq); 444static DEFINE_PER_CPU(struct global_cwq, global_cwq);
467static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); 445static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
468 446
469/* 447/*
470 * Global cpu workqueue and nr_running counter for unbound gcwq. The 448 * Global cpu workqueue and nr_running counter for unbound gcwq. The
@@ -472,17 +450,10 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS])
  * workers have WORKER_UNBOUND set.
  */
 static struct global_cwq unbound_global_cwq;
-static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
-	[0 ... NR_WORKER_POOLS - 1]	= ATOMIC_INIT(0),	/* always 0 */
-};
+static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0);	/* always 0 */
 
 static int worker_thread(void *__worker);
 
-static int worker_pool_pri(struct worker_pool *pool)
-{
-	return pool - pool->gcwq->pools;
-}
-
 static struct global_cwq *get_gcwq(unsigned int cpu)
 {
 	if (cpu != WORK_CPU_UNBOUND)
@@ -491,23 +462,25 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
 		return &unbound_global_cwq;
 }
 
-static atomic_t *get_pool_nr_running(struct worker_pool *pool)
+static atomic_t *get_gcwq_nr_running(unsigned int cpu)
 {
-	int cpu = pool->gcwq->cpu;
-	int idx = worker_pool_pri(pool);
-
 	if (cpu != WORK_CPU_UNBOUND)
-		return &per_cpu(pool_nr_running, cpu)[idx];
+		return &per_cpu(gcwq_nr_running, cpu);
 	else
-		return &unbound_pool_nr_running[idx];
+		return &unbound_gcwq_nr_running;
 }
 
 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
 					    struct workqueue_struct *wq)
 {
 	if (!(wq->flags & WQ_UNBOUND)) {
-		if (likely(cpu < nr_cpu_ids))
+		if (likely(cpu < nr_cpu_ids)) {
+#ifdef CONFIG_SMP
 			return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
+#else
+			return wq->cpu_wq.single;
+#endif
+		}
 	} else if (likely(cpu == WORK_CPU_UNBOUND))
 		return wq->cpu_wq.single;
 	return NULL;
@@ -530,24 +503,18 @@ static int work_next_color(int color)
530} 503}
531 504
532/* 505/*
533 * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data 506 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
534 * contain the pointer to the queued cwq. Once execution starts, the flag 507 * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
535 * is cleared and the high bits contain OFFQ flags and CPU number. 508 * cleared and the work data contains the cpu number it was last on.
536 *
537 * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
538 * and clear_work_data() can be used to set the cwq, cpu or clear
539 * work->data. These functions should only be called while the work is
540 * owned - ie. while the PENDING bit is set.
541 * 509 *
542 * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to 510 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
543 * a work. gcwq is available once the work has been queued anywhere after 511 * cwq, cpu or clear work->data. These functions should only be
544 * initialization until it is sync canceled. cwq is available only while 512 * called while the work is owned - ie. while the PENDING bit is set.
545 * the work item is queued.
546 * 513 *
547 * %WORK_OFFQ_CANCELING is used to mark a work item which is being 514 * get_work_[g]cwq() can be used to obtain the gcwq or cwq
548 * canceled. While being canceled, a work item may have its PENDING set 515 * corresponding to a work. gcwq is available once the work has been
549 * but stay off timer and worklist for arbitrarily long and nobody should 516 * queued anywhere after initialization. cwq is available only from
550 * try to steal the PENDING bit. 517 * queueing until execution starts.
551 */ 518 */
552static inline void set_work_data(struct work_struct *work, unsigned long data, 519static inline void set_work_data(struct work_struct *work, unsigned long data,
553 unsigned long flags) 520 unsigned long flags)
@@ -564,22 +531,13 @@ static void set_work_cwq(struct work_struct *work,
564 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); 531 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
565} 532}
566 533
567static void set_work_cpu_and_clear_pending(struct work_struct *work, 534static void set_work_cpu(struct work_struct *work, unsigned int cpu)
568 unsigned int cpu)
569{ 535{
570 /* 536 set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
571 * The following wmb is paired with the implied mb in
572 * test_and_set_bit(PENDING) and ensures all updates to @work made
573 * here are visible to and precede any updates by the next PENDING
574 * owner.
575 */
576 smp_wmb();
577 set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
578} 537}
579 538
580static void clear_work_data(struct work_struct *work) 539static void clear_work_data(struct work_struct *work)
581{ 540{
582 smp_wmb(); /* see set_work_cpu_and_clear_pending() */
583 set_work_data(work, WORK_STRUCT_NO_CPU, 0); 541 set_work_data(work, WORK_STRUCT_NO_CPU, 0);
584} 542}
585 543
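The comment block rewritten above describes how work->data multiplexes a cwq pointer with a few low-order flag bits while a work item is queued. A rough userspace sketch of that packing trick follows; the flag names, the number of flag bits and the fake_cwq type are invented for illustration and are not the kernel's real WORK_STRUCT_* definitions.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_BITS	4				/* low bits reserved for flags */
#define FLAG_MASK	((1UL << FLAG_BITS) - 1)
#define FLAG_PENDING	(1UL << 0)			/* work is pending execution */
#define FLAG_CWQ	(1UL << 1)			/* data holds a cwq pointer */

struct fake_cwq { int dummy; };

/* Pack a cwq pointer and flags into one word.  This only works because the
 * cwq is aligned to at least 1 << FLAG_BITS, so its low bits are zero. */
static uintptr_t pack_cwq(struct fake_cwq *cwq, uintptr_t flags)
{
	uintptr_t p = (uintptr_t)cwq;

	assert((p & FLAG_MASK) == 0);		/* alignment requirement */
	return p | flags;
}

static struct fake_cwq *unpack_cwq(uintptr_t data)
{
	return (data & FLAG_CWQ) ? (struct fake_cwq *)(data & ~FLAG_MASK) : NULL;
}

int main(void)
{
	static struct fake_cwq cwq __attribute__((aligned(1 << FLAG_BITS)));
	uintptr_t data = pack_cwq(&cwq, FLAG_PENDING | FLAG_CWQ);

	printf("cwq recovered: %s\n", unpack_cwq(data) == &cwq ? "yes" : "no");
	return 0;
}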
@@ -600,9 +558,9 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
600 558
601 if (data & WORK_STRUCT_CWQ) 559 if (data & WORK_STRUCT_CWQ)
602 return ((struct cpu_workqueue_struct *) 560 return ((struct cpu_workqueue_struct *)
603 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; 561 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
604 562
605 cpu = data >> WORK_OFFQ_CPU_SHIFT; 563 cpu = data >> WORK_STRUCT_FLAG_BITS;
606 if (cpu == WORK_CPU_NONE) 564 if (cpu == WORK_CPU_NONE)
607 return NULL; 565 return NULL;
608 566
@@ -610,86 +568,61 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
610 return get_gcwq(cpu); 568 return get_gcwq(cpu);
611} 569}
612 570
613static void mark_work_canceling(struct work_struct *work)
614{
615 struct global_cwq *gcwq = get_work_gcwq(work);
616 unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
617
618 set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
619 WORK_STRUCT_PENDING);
620}
621
622static bool work_is_canceling(struct work_struct *work)
623{
624 unsigned long data = atomic_long_read(&work->data);
625
626 return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
627}
628
629/* 571/*
630 * Policy functions. These define the policies on how the global worker 572 * Policy functions. These define the policies on how the global
631 * pools are managed. Unless noted otherwise, these functions assume that 573 * worker pool is managed. Unless noted otherwise, these functions
632 * they're being called with gcwq->lock held. 574 * assume that they're being called with gcwq->lock held.
633 */ 575 */
634 576
635static bool __need_more_worker(struct worker_pool *pool) 577static bool __need_more_worker(struct global_cwq *gcwq)
636{ 578{
637 return !atomic_read(get_pool_nr_running(pool)); 579 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
580 gcwq->flags & GCWQ_HIGHPRI_PENDING;
638} 581}
639 582
640/* 583/*
641 * Need to wake up a worker? Called from anything but currently 584 * Need to wake up a worker? Called from anything but currently
642 * running workers. 585 * running workers.
643 *
644 * Note that, because unbound workers never contribute to nr_running, this
645 * function will always return %true for unbound gcwq as long as the
646 * worklist isn't empty.
647 */ 586 */
648static bool need_more_worker(struct worker_pool *pool) 587static bool need_more_worker(struct global_cwq *gcwq)
649{ 588{
650 return !list_empty(&pool->worklist) && __need_more_worker(pool); 589 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
651} 590}
652 591
653/* Can I start working? Called from busy but !running workers. */ 592/* Can I start working? Called from busy but !running workers. */
654static bool may_start_working(struct worker_pool *pool) 593static bool may_start_working(struct global_cwq *gcwq)
655{ 594{
656 return pool->nr_idle; 595 return gcwq->nr_idle;
657} 596}
658 597
659/* Do I need to keep working? Called from currently running workers. */ 598/* Do I need to keep working? Called from currently running workers. */
660static bool keep_working(struct worker_pool *pool) 599static bool keep_working(struct global_cwq *gcwq)
661{ 600{
662 atomic_t *nr_running = get_pool_nr_running(pool); 601 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
663 602
664 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; 603 return !list_empty(&gcwq->worklist) &&
604 (atomic_read(nr_running) <= 1 ||
605 gcwq->flags & GCWQ_HIGHPRI_PENDING);
665} 606}
666 607
667/* Do we need a new worker? Called from manager. */ 608/* Do we need a new worker? Called from manager. */
668static bool need_to_create_worker(struct worker_pool *pool) 609static bool need_to_create_worker(struct global_cwq *gcwq)
669{ 610{
670 return need_more_worker(pool) && !may_start_working(pool); 611 return need_more_worker(gcwq) && !may_start_working(gcwq);
671} 612}
672 613
673/* Do I need to be the manager? */ 614/* Do I need to be the manager? */
674static bool need_to_manage_workers(struct worker_pool *pool) 615static bool need_to_manage_workers(struct global_cwq *gcwq)
675{ 616{
676 return need_to_create_worker(pool) || 617 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
677 (pool->flags & POOL_MANAGE_WORKERS);
678} 618}
679 619
680/* Do we have too many workers and should some go away? */ 620/* Do we have too many workers and should some go away? */
681static bool too_many_workers(struct worker_pool *pool) 621static bool too_many_workers(struct global_cwq *gcwq)
682{ 622{
683 bool managing = pool->flags & POOL_MANAGING_WORKERS; 623 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
684 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 624 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
685 int nr_busy = pool->nr_workers - nr_idle; 625 int nr_busy = gcwq->nr_workers - nr_idle;
686
687 /*
688 * nr_idle and idle_list may disagree if idle rebinding is in
689 * progress. Never return %true if idle_list is empty.
690 */
691 if (list_empty(&pool->idle_list))
692 return false;
693 626
694 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 627 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
695} 628}
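The policy helpers above gate worker creation and destruction; too_many_workers() in particular tolerates two spare idle workers plus one idle worker per MAX_IDLE_WORKERS_RATIO busy ones. A standalone sketch of that check follows; the ratio value here is a placeholder, since the constant's definition sits outside this hunk.

#include <stdbool.h>
#include <stdio.h>

#define MAX_IDLE_WORKERS_RATIO	4	/* placeholder value for illustration */

/* Mirror of the check in too_many_workers(): allow up to two spare idle
 * workers, plus one idle worker for every MAX_IDLE_WORKERS_RATIO busy ones. */
static bool too_many_workers(int nr_workers, int nr_idle, bool managing)
{
	int idle = nr_idle + (managing ? 1 : 0);	/* manager counts as idle */
	int busy = nr_workers - idle;

	return idle > 2 && (idle - 2) * MAX_IDLE_WORKERS_RATIO >= busy;
}

int main(void)
{
	printf("%d\n", too_many_workers(10, 3, false));	/* 1 spare over the 2 allowed, 7 busy -> 0 */
	printf("%d\n", too_many_workers(10, 8, false));	/* 6 spares, only 2 busy -> 1 */
	return 0;
}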
@@ -699,26 +632,26 @@ static bool too_many_workers(struct worker_pool *pool)
699 */ 632 */
700 633
701/* Return the first worker. Safe with preemption disabled */ 634/* Return the first worker. Safe with preemption disabled */
702static struct worker *first_worker(struct worker_pool *pool) 635static struct worker *first_worker(struct global_cwq *gcwq)
703{ 636{
704 if (unlikely(list_empty(&pool->idle_list))) 637 if (unlikely(list_empty(&gcwq->idle_list)))
705 return NULL; 638 return NULL;
706 639
707 return list_first_entry(&pool->idle_list, struct worker, entry); 640 return list_first_entry(&gcwq->idle_list, struct worker, entry);
708} 641}
709 642
710/** 643/**
711 * wake_up_worker - wake up an idle worker 644 * wake_up_worker - wake up an idle worker
712 * @pool: worker pool to wake worker from 645 * @gcwq: gcwq to wake worker for
713 * 646 *
714 * Wake up the first idle worker of @pool. 647 * Wake up the first idle worker of @gcwq.
715 * 648 *
716 * CONTEXT: 649 * CONTEXT:
717 * spin_lock_irq(gcwq->lock). 650 * spin_lock_irq(gcwq->lock).
718 */ 651 */
719static void wake_up_worker(struct worker_pool *pool) 652static void wake_up_worker(struct global_cwq *gcwq)
720{ 653{
721 struct worker *worker = first_worker(pool); 654 struct worker *worker = first_worker(gcwq);
722 655
723 if (likely(worker)) 656 if (likely(worker))
724 wake_up_process(worker->task); 657 wake_up_process(worker->task);
@@ -739,10 +672,8 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
739{ 672{
740 struct worker *worker = kthread_data(task); 673 struct worker *worker = kthread_data(task);
741 674
742 if (!(worker->flags & WORKER_NOT_RUNNING)) { 675 if (!(worker->flags & WORKER_NOT_RUNNING))
743 WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); 676 atomic_inc(get_gcwq_nr_running(cpu));
744 atomic_inc(get_pool_nr_running(worker->pool));
745 }
746} 677}
747 678
748/** 679/**
@@ -764,8 +695,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
764 unsigned int cpu) 695 unsigned int cpu)
765{ 696{
766 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 697 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
767 struct worker_pool *pool = worker->pool; 698 struct global_cwq *gcwq = get_gcwq(cpu);
768 atomic_t *nr_running = get_pool_nr_running(pool); 699 atomic_t *nr_running = get_gcwq_nr_running(cpu);
769 700
770 if (worker->flags & WORKER_NOT_RUNNING) 701 if (worker->flags & WORKER_NOT_RUNNING)
771 return NULL; 702 return NULL;
@@ -778,14 +709,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
778 * worklist not empty test sequence is in insert_work(). 709 * worklist not empty test sequence is in insert_work().
779 * Please read comment there. 710 * Please read comment there.
780 * 711 *
781 * NOT_RUNNING is clear. This means that we're bound to and 712 * NOT_RUNNING is clear. This means that trustee is not in
782 * running on the local cpu w/ rq lock held and preemption 713 * charge and we're running on the local cpu w/ rq lock held
783 * disabled, which in turn means that none else could be 714 * and preemption disabled, which in turn means that none else
784 * manipulating idle_list, so dereferencing idle_list without gcwq 715 * could be manipulating idle_list, so dereferencing idle_list
785 * lock is safe. 716 * without gcwq lock is safe.
786 */ 717 */
787 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) 718 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
788 to_wakeup = first_worker(pool); 719 to_wakeup = first_worker(gcwq);
789 return to_wakeup ? to_wakeup->task : NULL; 720 return to_wakeup ? to_wakeup->task : NULL;
790} 721}
791 722
@@ -805,7 +736,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
805static inline void worker_set_flags(struct worker *worker, unsigned int flags, 736static inline void worker_set_flags(struct worker *worker, unsigned int flags,
806 bool wakeup) 737 bool wakeup)
807{ 738{
808 struct worker_pool *pool = worker->pool; 739 struct global_cwq *gcwq = worker->gcwq;
809 740
810 WARN_ON_ONCE(worker->task != current); 741 WARN_ON_ONCE(worker->task != current);
811 742
@@ -816,12 +747,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
816 */ 747 */
817 if ((flags & WORKER_NOT_RUNNING) && 748 if ((flags & WORKER_NOT_RUNNING) &&
818 !(worker->flags & WORKER_NOT_RUNNING)) { 749 !(worker->flags & WORKER_NOT_RUNNING)) {
819 atomic_t *nr_running = get_pool_nr_running(pool); 750 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
820 751
821 if (wakeup) { 752 if (wakeup) {
822 if (atomic_dec_and_test(nr_running) && 753 if (atomic_dec_and_test(nr_running) &&
823 !list_empty(&pool->worklist)) 754 !list_empty(&gcwq->worklist))
824 wake_up_worker(pool); 755 wake_up_worker(gcwq);
825 } else 756 } else
826 atomic_dec(nr_running); 757 atomic_dec(nr_running);
827 } 758 }
@@ -841,7 +772,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
841 */ 772 */
842static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 773static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
843{ 774{
844 struct worker_pool *pool = worker->pool; 775 struct global_cwq *gcwq = worker->gcwq;
845 unsigned int oflags = worker->flags; 776 unsigned int oflags = worker->flags;
846 777
847 WARN_ON_ONCE(worker->task != current); 778 WARN_ON_ONCE(worker->task != current);
@@ -855,7 +786,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
855 */ 786 */
856 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 787 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
857 if (!(worker->flags & WORKER_NOT_RUNNING)) 788 if (!(worker->flags & WORKER_NOT_RUNNING))
858 atomic_inc(get_pool_nr_running(pool)); 789 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
859} 790}
860 791
861/** 792/**
@@ -939,203 +870,40 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
939} 870}
940 871
941/** 872/**
942 * move_linked_works - move linked works to a list 873 * gcwq_determine_ins_pos - find insertion position
943 * @work: start of series of works to be scheduled 874 * @gcwq: gcwq of interest
944 * @head: target list to append @work to 875 * @cwq: cwq a work is being queued for
945 * @nextp: out paramter for nested worklist walking
946 *
947 * Schedule linked works starting from @work to @head. Work series to
948 * be scheduled starts at @work and includes any consecutive work with
949 * WORK_STRUCT_LINKED set in its predecessor.
950 *
951 * If @nextp is not NULL, it's updated to point to the next work of
952 * the last scheduled work. This allows move_linked_works() to be
953 * nested inside outer list_for_each_entry_safe().
954 *
955 * CONTEXT:
956 * spin_lock_irq(gcwq->lock).
957 */
958static void move_linked_works(struct work_struct *work, struct list_head *head,
959 struct work_struct **nextp)
960{
961 struct work_struct *n;
962
963 /*
964 * Linked worklist will always end before the end of the list,
965 * use NULL for list head.
966 */
967 list_for_each_entry_safe_from(work, n, NULL, entry) {
968 list_move_tail(&work->entry, head);
969 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
970 break;
971 }
972
973 /*
974 * If we're already inside safe list traversal and have moved
975 * multiple works to the scheduled queue, the next position
976 * needs to be updated.
977 */
978 if (nextp)
979 *nextp = n;
980}
981
982static void cwq_activate_delayed_work(struct work_struct *work)
983{
984 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
985
986 trace_workqueue_activate_work(work);
987 move_linked_works(work, &cwq->pool->worklist, NULL);
988 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
989 cwq->nr_active++;
990}
991
992static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
993{
994 struct work_struct *work = list_first_entry(&cwq->delayed_works,
995 struct work_struct, entry);
996
997 cwq_activate_delayed_work(work);
998}
999
1000/**
1001 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1002 * @cwq: cwq of interest
1003 * @color: color of work which left the queue
1004 * 876 *
1005 * A work either has completed or is removed from pending queue, 877 * A work for @cwq is about to be queued on @gcwq, determine insertion
1006 * decrement nr_in_flight of its cwq and handle workqueue flushing. 878 * position for the work. If @cwq is for HIGHPRI wq, the work is
879 * queued at the head of the queue but in FIFO order with respect to
880 * other HIGHPRI works; otherwise, at the end of the queue. This
881 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
882 * there are HIGHPRI works pending.
1007 * 883 *
1008 * CONTEXT: 884 * CONTEXT:
1009 * spin_lock_irq(gcwq->lock). 885 * spin_lock_irq(gcwq->lock).
1010 */
1011static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1012{
1013 /* ignore uncolored works */
1014 if (color == WORK_NO_COLOR)
1015 return;
1016
1017 cwq->nr_in_flight[color]--;
1018
1019 cwq->nr_active--;
1020 if (!list_empty(&cwq->delayed_works)) {
1021 /* one down, submit a delayed one */
1022 if (cwq->nr_active < cwq->max_active)
1023 cwq_activate_first_delayed(cwq);
1024 }
1025
1026 /* is flush in progress and are we at the flushing tip? */
1027 if (likely(cwq->flush_color != color))
1028 return;
1029
1030 /* are there still in-flight works? */
1031 if (cwq->nr_in_flight[color])
1032 return;
1033
1034 /* this cwq is done, clear flush_color */
1035 cwq->flush_color = -1;
1036
1037 /*
1038 * If this was the last cwq, wake up the first flusher. It
1039 * will handle the rest.
1040 */
1041 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1042 complete(&cwq->wq->first_flusher->done);
1043}
1044
1045/**
1046 * try_to_grab_pending - steal work item from worklist and disable irq
1047 * @work: work item to steal
1048 * @is_dwork: @work is a delayed_work
1049 * @flags: place to store irq state
1050 *
1051 * Try to grab PENDING bit of @work. This function can handle @work in any
1052 * stable state - idle, on timer or on worklist. Return values are
1053 *
1054 * 1 if @work was pending and we successfully stole PENDING
1055 * 0 if @work was idle and we claimed PENDING
1056 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
1057 * -ENOENT if someone else is canceling @work, this state may persist
1058 * for arbitrarily long
1059 * 886 *
1060 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting 887 * RETURNS:
1061 * interrupted while holding PENDING and @work off queue, irq must be 888 * Pointer to inserstion position.
1062 * disabled on entry. This, combined with delayed_work->timer being
1063 * irqsafe, ensures that we return -EAGAIN for finite short period of time.
1064 *
1065 * On successful return, >= 0, irq is disabled and the caller is
1066 * responsible for releasing it using local_irq_restore(*@flags).
1067 *
1068 * This function is safe to call from any context including IRQ handler.
1069 */ 889 */
1070static int try_to_grab_pending(struct work_struct *work, bool is_dwork, 890static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
1071 unsigned long *flags) 891 struct cpu_workqueue_struct *cwq)
1072{ 892{
1073 struct global_cwq *gcwq; 893 struct work_struct *twork;
1074 894
1075 local_irq_save(*flags); 895 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
896 return &gcwq->worklist;
1076 897
1077 /* try to steal the timer if it exists */ 898 list_for_each_entry(twork, &gcwq->worklist, entry) {
1078 if (is_dwork) { 899 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
1079 struct delayed_work *dwork = to_delayed_work(work);
1080 900
1081 /* 901 if (!(tcwq->wq->flags & WQ_HIGHPRI))
1082 * dwork->timer is irqsafe. If del_timer() fails, it's 902 break;
1083 * guaranteed that the timer is not queued anywhere and not
1084 * running on the local CPU.
1085 */
1086 if (likely(del_timer(&dwork->timer)))
1087 return 1;
1088 } 903 }
1089 904
1090 /* try to claim PENDING the normal way */ 905 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
1091 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) 906 return &twork->entry;
1092 return 0;
1093
1094 /*
1095 * The queueing is in progress, or it is already queued. Try to
1096 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1097 */
1098 gcwq = get_work_gcwq(work);
1099 if (!gcwq)
1100 goto fail;
1101
1102 spin_lock(&gcwq->lock);
1103 if (!list_empty(&work->entry)) {
1104 /*
1105 * This work is queued, but perhaps we locked the wrong gcwq.
1106 * In that case we must see the new value after rmb(), see
1107 * insert_work()->wmb().
1108 */
1109 smp_rmb();
1110 if (gcwq == get_work_gcwq(work)) {
1111 debug_work_deactivate(work);
1112
1113 /*
1114 * A delayed work item cannot be grabbed directly
1115 * because it might have linked NO_COLOR work items
1116 * which, if left on the delayed_list, will confuse
1117 * cwq->nr_active management later on and cause
1118 * stall. Make sure the work item is activated
1119 * before grabbing.
1120 */
1121 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1122 cwq_activate_delayed_work(work);
1123
1124 list_del_init(&work->entry);
1125 cwq_dec_nr_in_flight(get_work_cwq(work),
1126 get_work_color(work));
1127
1128 spin_unlock(&gcwq->lock);
1129 return 1;
1130 }
1131 }
1132 spin_unlock(&gcwq->lock);
1133fail:
1134 local_irq_restore(*flags);
1135 if (work_is_canceling(work))
1136 return -ENOENT;
1137 cpu_relax();
1138 return -EAGAIN;
1139} 907}
1140 908
1141/** 909/**
@@ -1155,7 +923,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
1155 struct work_struct *work, struct list_head *head, 923 struct work_struct *work, struct list_head *head,
1156 unsigned int extra_flags) 924 unsigned int extra_flags)
1157{ 925{
1158 struct worker_pool *pool = cwq->pool; 926 struct global_cwq *gcwq = cwq->gcwq;
1159 927
1160 /* we own @work, set data and link */ 928 /* we own @work, set data and link */
1161 set_work_cwq(work, cwq, extra_flags); 929 set_work_cwq(work, cwq, extra_flags);
@@ -1175,8 +943,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
1175 */ 943 */
1176 smp_mb(); 944 smp_mb();
1177 945
1178 if (__need_more_worker(pool)) 946 if (__need_more_worker(gcwq))
1179 wake_up_worker(pool); 947 wake_up_worker(gcwq);
1180} 948}
1181 949
1182/* 950/*
@@ -1218,15 +986,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1218 struct cpu_workqueue_struct *cwq; 986 struct cpu_workqueue_struct *cwq;
1219 struct list_head *worklist; 987 struct list_head *worklist;
1220 unsigned int work_flags; 988 unsigned int work_flags;
1221 unsigned int req_cpu = cpu; 989 unsigned long flags;
1222
1223 /*
1224 * While a work item is PENDING && off queue, a task trying to
1225 * steal the PENDING will busy-loop waiting for it to either get
1226 * queued or lose PENDING. Grabbing PENDING and queueing should
1227 * happen with IRQ disabled.
1228 */
1229 WARN_ON_ONCE(!irqs_disabled());
1230 990
1231 debug_work_activate(work); 991 debug_work_activate(work);
1232 992
@@ -1239,22 +999,21 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1239 if (!(wq->flags & WQ_UNBOUND)) { 999 if (!(wq->flags & WQ_UNBOUND)) {
1240 struct global_cwq *last_gcwq; 1000 struct global_cwq *last_gcwq;
1241 1001
1242 if (cpu == WORK_CPU_UNBOUND) 1002 if (unlikely(cpu == WORK_CPU_UNBOUND))
1243 cpu = raw_smp_processor_id(); 1003 cpu = raw_smp_processor_id();
1244 1004
1245 /* 1005 /*
1246 * It's multi cpu. If @work was previously on a different 1006 * It's multi cpu. If @wq is non-reentrant and @work
1247 * cpu, it might still be running there, in which case the 1007 * was previously on a different cpu, it might still
1248 * work needs to be queued on that cpu to guarantee 1008 * be running there, in which case the work needs to
1249 * non-reentrancy. 1009 * be queued on that cpu to guarantee non-reentrance.
1250 */ 1010 */
1251 gcwq = get_gcwq(cpu); 1011 gcwq = get_gcwq(cpu);
1252 last_gcwq = get_work_gcwq(work); 1012 if (wq->flags & WQ_NON_REENTRANT &&
1253 1013 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
1254 if (last_gcwq && last_gcwq != gcwq) {
1255 struct worker *worker; 1014 struct worker *worker;
1256 1015
1257 spin_lock(&last_gcwq->lock); 1016 spin_lock_irqsave(&last_gcwq->lock, flags);
1258 1017
1259 worker = find_worker_executing_work(last_gcwq, work); 1018 worker = find_worker_executing_work(last_gcwq, work);
1260 1019
@@ -1262,25 +1021,21 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1262 gcwq = last_gcwq; 1021 gcwq = last_gcwq;
1263 else { 1022 else {
1264 /* meh... not running there, queue here */ 1023 /* meh... not running there, queue here */
1265 spin_unlock(&last_gcwq->lock); 1024 spin_unlock_irqrestore(&last_gcwq->lock, flags);
1266 spin_lock(&gcwq->lock); 1025 spin_lock_irqsave(&gcwq->lock, flags);
1267 } 1026 }
1268 } else { 1027 } else
1269 spin_lock(&gcwq->lock); 1028 spin_lock_irqsave(&gcwq->lock, flags);
1270 }
1271 } else { 1029 } else {
1272 gcwq = get_gcwq(WORK_CPU_UNBOUND); 1030 gcwq = get_gcwq(WORK_CPU_UNBOUND);
1273 spin_lock(&gcwq->lock); 1031 spin_lock_irqsave(&gcwq->lock, flags);
1274 } 1032 }
1275 1033
1276 /* gcwq determined, get cwq and queue */ 1034 /* gcwq determined, get cwq and queue */
1277 cwq = get_cwq(gcwq->cpu, wq); 1035 cwq = get_cwq(gcwq->cpu, wq);
1278 trace_workqueue_queue_work(req_cpu, cwq, work); 1036 trace_workqueue_queue_work(cpu, cwq, work);
1279 1037
1280 if (WARN_ON(!list_empty(&work->entry))) { 1038 BUG_ON(!list_empty(&work->entry));
1281 spin_unlock(&gcwq->lock);
1282 return;
1283 }
1284 1039
1285 cwq->nr_in_flight[cwq->work_color]++; 1040 cwq->nr_in_flight[cwq->work_color]++;
1286 work_flags = work_color_to_flags(cwq->work_color); 1041 work_flags = work_color_to_flags(cwq->work_color);
@@ -1288,7 +1043,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1288 if (likely(cwq->nr_active < cwq->max_active)) { 1043 if (likely(cwq->nr_active < cwq->max_active)) {
1289 trace_workqueue_activate_work(work); 1044 trace_workqueue_activate_work(work);
1290 cwq->nr_active++; 1045 cwq->nr_active++;
1291 worklist = &cwq->pool->worklist; 1046 worklist = gcwq_determine_ins_pos(gcwq, cwq);
1292 } else { 1047 } else {
1293 work_flags |= WORK_STRUCT_DELAYED; 1048 work_flags |= WORK_STRUCT_DELAYED;
1294 worklist = &cwq->delayed_works; 1049 worklist = &cwq->delayed_works;
@@ -1296,152 +1051,61 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1296 1051
1297 insert_work(cwq, work, worklist, work_flags); 1052 insert_work(cwq, work, worklist, work_flags);
1298 1053
1299 spin_unlock(&gcwq->lock); 1054 spin_unlock_irqrestore(&gcwq->lock, flags);
1300} 1055}
1301 1056
1302/** 1057/**
1303 * queue_work_on - queue work on specific cpu 1058 * queue_work - queue work on a workqueue
1304 * @cpu: CPU number to execute work on
1305 * @wq: workqueue to use 1059 * @wq: workqueue to use
1306 * @work: work to queue 1060 * @work: work to queue
1307 * 1061 *
1308 * Returns %false if @work was already on a queue, %true otherwise. 1062 * Returns 0 if @work was already on a queue, non-zero otherwise.
1309 * 1063 *
1310 * We queue the work to a specific CPU, the caller must ensure it 1064 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1311 * can't go away. 1065 * it can be processed by another CPU.
1312 */ 1066 */
1313bool queue_work_on(int cpu, struct workqueue_struct *wq, 1067int queue_work(struct workqueue_struct *wq, struct work_struct *work)
1314 struct work_struct *work)
1315{ 1068{
1316 bool ret = false; 1069 int ret;
1317 unsigned long flags;
1318
1319 local_irq_save(flags);
1320 1070
1321 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { 1071 ret = queue_work_on(get_cpu(), wq, work);
1322 __queue_work(cpu, wq, work); 1072 put_cpu();
1323 ret = true;
1324 }
1325 1073
1326 local_irq_restore(flags);
1327 return ret; 1074 return ret;
1328} 1075}
1329EXPORT_SYMBOL_GPL(queue_work_on); 1076EXPORT_SYMBOL_GPL(queue_work);
1330 1077
1331/** 1078/**
1332 * queue_work - queue work on a workqueue 1079 * queue_work_on - queue work on specific cpu
1080 * @cpu: CPU number to execute work on
1333 * @wq: workqueue to use 1081 * @wq: workqueue to use
1334 * @work: work to queue 1082 * @work: work to queue
1335 * 1083 *
1336 * Returns %false if @work was already on a queue, %true otherwise. 1084 * Returns 0 if @work was already on a queue, non-zero otherwise.
1337 * 1085 *
1338 * We queue the work to the CPU on which it was submitted, but if the CPU dies 1086 * We queue the work to a specific CPU, the caller must ensure it
1339 * it can be processed by another CPU. 1087 * can't go away.
1340 */ 1088 */
1341bool queue_work(struct workqueue_struct *wq, struct work_struct *work) 1089int
1090queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
1342{ 1091{
1343 return queue_work_on(WORK_CPU_UNBOUND, wq, work); 1092 int ret = 0;
1344}
1345EXPORT_SYMBOL_GPL(queue_work);
1346
1347void delayed_work_timer_fn(unsigned long __data)
1348{
1349 struct delayed_work *dwork = (struct delayed_work *)__data;
1350 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1351
1352 /* should have been called from irqsafe timer with irq already off */
1353 __queue_work(dwork->cpu, cwq->wq, &dwork->work);
1354}
1355EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
1356
1357static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1358 struct delayed_work *dwork, unsigned long delay)
1359{
1360 struct timer_list *timer = &dwork->timer;
1361 struct work_struct *work = &dwork->work;
1362 unsigned int lcpu;
1363
1364 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1365 timer->data != (unsigned long)dwork);
1366 WARN_ON_ONCE(timer_pending(timer));
1367 WARN_ON_ONCE(!list_empty(&work->entry));
1368 1093
1369 /* 1094 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1370 * If @delay is 0, queue @dwork->work immediately. This is for 1095 __queue_work(cpu, wq, work);
1371 * both optimization and correctness. The earliest @timer can 1096 ret = 1;
1372 * expire is on the closest next tick and delayed_work users depend
1373 * on that there's no such delay when @delay is 0.
1374 */
1375 if (!delay) {
1376 __queue_work(cpu, wq, &dwork->work);
1377 return;
1378 }
1379
1380 timer_stats_timer_set_start_info(&dwork->timer);
1381
1382 /*
1383 * This stores cwq for the moment, for the timer_fn. Note that the
1384 * work's gcwq is preserved to allow reentrance detection for
1385 * delayed works.
1386 */
1387 if (!(wq->flags & WQ_UNBOUND)) {
1388 struct global_cwq *gcwq = get_work_gcwq(work);
1389
1390 /*
1391 * If we cannot get the last gcwq from @work directly,
1392 * select the last CPU such that it avoids unnecessarily
1393 * triggering non-reentrancy check in __queue_work().
1394 */
1395 lcpu = cpu;
1396 if (gcwq)
1397 lcpu = gcwq->cpu;
1398 if (lcpu == WORK_CPU_UNBOUND)
1399 lcpu = raw_smp_processor_id();
1400 } else {
1401 lcpu = WORK_CPU_UNBOUND;
1402 } 1097 }
1403 1098 return ret;
1404 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1405
1406 dwork->cpu = cpu;
1407 timer->expires = jiffies + delay;
1408
1409 if (unlikely(cpu != WORK_CPU_UNBOUND))
1410 add_timer_on(timer, cpu);
1411 else
1412 add_timer(timer);
1413} 1099}
1100EXPORT_SYMBOL_GPL(queue_work_on);
1414 1101
1415/** 1102static void delayed_work_timer_fn(unsigned long __data)
1416 * queue_delayed_work_on - queue work on specific CPU after delay
1417 * @cpu: CPU number to execute work on
1418 * @wq: workqueue to use
1419 * @dwork: work to queue
1420 * @delay: number of jiffies to wait before queueing
1421 *
1422 * Returns %false if @work was already on a queue, %true otherwise. If
1423 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1424 * execution.
1425 */
1426bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1427 struct delayed_work *dwork, unsigned long delay)
1428{ 1103{
1429 struct work_struct *work = &dwork->work; 1104 struct delayed_work *dwork = (struct delayed_work *)__data;
1430 bool ret = false; 1105 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1431 unsigned long flags;
1432
1433 /* read the comment in __queue_work() */
1434 local_irq_save(flags);
1435
1436 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1437 __queue_delayed_work(cpu, wq, dwork, delay);
1438 ret = true;
1439 }
1440 1106
1441 local_irq_restore(flags); 1107 __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
1442 return ret;
1443} 1108}
1444EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1445 1109
1446/** 1110/**
1447 * queue_delayed_work - queue work on a workqueue after delay 1111 * queue_delayed_work - queue work on a workqueue after delay
@@ -1449,67 +1113,72 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1449 * @dwork: delayable work to queue 1113 * @dwork: delayable work to queue
1450 * @delay: number of jiffies to wait before queueing 1114 * @delay: number of jiffies to wait before queueing
1451 * 1115 *
1452 * Equivalent to queue_delayed_work_on() but tries to use the local CPU. 1116 * Returns 0 if @work was already on a queue, non-zero otherwise.
1453 */ 1117 */
1454bool queue_delayed_work(struct workqueue_struct *wq, 1118int queue_delayed_work(struct workqueue_struct *wq,
1455 struct delayed_work *dwork, unsigned long delay) 1119 struct delayed_work *dwork, unsigned long delay)
1456{ 1120{
1457 return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); 1121 if (delay == 0)
1122 return queue_work(wq, &dwork->work);
1123
1124 return queue_delayed_work_on(-1, wq, dwork, delay);
1458} 1125}
1459EXPORT_SYMBOL_GPL(queue_delayed_work); 1126EXPORT_SYMBOL_GPL(queue_delayed_work);
1460 1127
1461/** 1128/**
1462 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU 1129 * queue_delayed_work_on - queue work on specific CPU after delay
1463 * @cpu: CPU number to execute work on 1130 * @cpu: CPU number to execute work on
1464 * @wq: workqueue to use 1131 * @wq: workqueue to use
1465 * @dwork: work to queue 1132 * @dwork: work to queue
1466 * @delay: number of jiffies to wait before queueing 1133 * @delay: number of jiffies to wait before queueing
1467 * 1134 *
1468 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, 1135 * Returns 0 if @work was already on a queue, non-zero otherwise.
1469 * modify @dwork's timer so that it expires after @delay. If @delay is
1470 * zero, @work is guaranteed to be scheduled immediately regardless of its
1471 * current state.
1472 *
1473 * Returns %false if @dwork was idle and queued, %true if @dwork was
1474 * pending and its timer was modified.
1475 *
1476 * This function is safe to call from any context including IRQ handler.
1477 * See try_to_grab_pending() for details.
1478 */ 1136 */
1479bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, 1137int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1480 struct delayed_work *dwork, unsigned long delay) 1138 struct delayed_work *dwork, unsigned long delay)
1481{ 1139{
1482 unsigned long flags; 1140 int ret = 0;
1483 int ret; 1141 struct timer_list *timer = &dwork->timer;
1142 struct work_struct *work = &dwork->work;
1484 1143
1485 do { 1144 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1486 ret = try_to_grab_pending(&dwork->work, true, &flags); 1145 unsigned int lcpu;
1487 } while (unlikely(ret == -EAGAIN));
1488 1146
1489 if (likely(ret >= 0)) { 1147 BUG_ON(timer_pending(timer));
1490 __queue_delayed_work(cpu, wq, dwork, delay); 1148 BUG_ON(!list_empty(&work->entry));
1491 local_irq_restore(flags);
1492 }
1493 1149
1494 /* -ENOENT from try_to_grab_pending() becomes %true */ 1150 timer_stats_timer_set_start_info(&dwork->timer);
1495 return ret;
1496}
1497EXPORT_SYMBOL_GPL(mod_delayed_work_on);
1498 1151
1499/** 1152 /*
1500 * mod_delayed_work - modify delay of or queue a delayed work 1153 * This stores cwq for the moment, for the timer_fn.
1501 * @wq: workqueue to use 1154 * Note that the work's gcwq is preserved to allow
1502 * @dwork: work to queue 1155 * reentrance detection for delayed works.
1503 * @delay: number of jiffies to wait before queueing 1156 */
1504 * 1157 if (!(wq->flags & WQ_UNBOUND)) {
1505 * mod_delayed_work_on() on local CPU. 1158 struct global_cwq *gcwq = get_work_gcwq(work);
1506 */ 1159
1507bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, 1160 if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1508 unsigned long delay) 1161 lcpu = gcwq->cpu;
1509{ 1162 else
1510 return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); 1163 lcpu = raw_smp_processor_id();
1164 } else
1165 lcpu = WORK_CPU_UNBOUND;
1166
1167 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1168
1169 timer->expires = jiffies + delay;
1170 timer->data = (unsigned long)dwork;
1171 timer->function = delayed_work_timer_fn;
1172
1173 if (unlikely(cpu >= 0))
1174 add_timer_on(timer, cpu);
1175 else
1176 add_timer(timer);
1177 ret = 1;
1178 }
1179 return ret;
1511} 1180}
1512EXPORT_SYMBOL_GPL(mod_delayed_work); 1181EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1513 1182
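Both sides of the hunks above expose the same basic queueing API: queue_work(), queue_delayed_work() and their _on variants. A minimal, hypothetical module showing how a driver normally consumes that API follows; the workqueue name and handler are invented, and details such as queue_work()'s return type differ between the two kernel versions shown in this diff.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static void example_fn(struct work_struct *work)
{
	pr_info("example work item ran\n");
}

static DECLARE_WORK(example_work, example_fn);
static DECLARE_DELAYED_WORK(example_dwork, example_fn);

static int __init example_init(void)
{
	example_wq = alloc_workqueue("example", 0, 0);
	if (!example_wq)
		return -ENOMEM;

	queue_work(example_wq, &example_work);			/* run as soon as possible */
	queue_delayed_work(example_wq, &example_dwork, HZ);	/* run after ~1 second */
	return 0;
}

static void __exit example_exit(void)
{
	cancel_delayed_work_sync(&example_dwork);
	flush_workqueue(example_wq);		/* make sure example_work has finished */
	destroy_workqueue(example_wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");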
1514/** 1183/**
1515 * worker_enter_idle - enter idle state 1184 * worker_enter_idle - enter idle state
@@ -1523,8 +1192,7 @@ EXPORT_SYMBOL_GPL(mod_delayed_work);
1523 */ 1192 */
1524static void worker_enter_idle(struct worker *worker) 1193static void worker_enter_idle(struct worker *worker)
1525{ 1194{
1526 struct worker_pool *pool = worker->pool; 1195 struct global_cwq *gcwq = worker->gcwq;
1527 struct global_cwq *gcwq = pool->gcwq;
1528 1196
1529 BUG_ON(worker->flags & WORKER_IDLE); 1197 BUG_ON(worker->flags & WORKER_IDLE);
1530 BUG_ON(!list_empty(&worker->entry) && 1198 BUG_ON(!list_empty(&worker->entry) &&
@@ -1532,24 +1200,22 @@ static void worker_enter_idle(struct worker *worker)
1532 1200
1533 /* can't use worker_set_flags(), also called from start_worker() */ 1201 /* can't use worker_set_flags(), also called from start_worker() */
1534 worker->flags |= WORKER_IDLE; 1202 worker->flags |= WORKER_IDLE;
1535 pool->nr_idle++; 1203 gcwq->nr_idle++;
1536 worker->last_active = jiffies; 1204 worker->last_active = jiffies;
1537 1205
1538 /* idle_list is LIFO */ 1206 /* idle_list is LIFO */
1539 list_add(&worker->entry, &pool->idle_list); 1207 list_add(&worker->entry, &gcwq->idle_list);
1540 1208
1541 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) 1209 if (likely(!(worker->flags & WORKER_ROGUE))) {
1542 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); 1210 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1211 mod_timer(&gcwq->idle_timer,
1212 jiffies + IDLE_WORKER_TIMEOUT);
1213 } else
1214 wake_up_all(&gcwq->trustee_wait);
1543 1215
1544 /* 1216 /* sanity check nr_running */
1545 * Sanity check nr_running. Because gcwq_unbind_fn() releases 1217 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
1546 * gcwq->lock between setting %WORKER_UNBOUND and zapping 1218 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1547 * nr_running, the warning may trigger spuriously. Check iff
1548 * unbind is not in progress.
1549 */
1550 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
1551 pool->nr_workers == pool->nr_idle &&
1552 atomic_read(get_pool_nr_running(pool)));
1553} 1219}
1554 1220
1555/** 1221/**
@@ -1563,11 +1229,11 @@ static void worker_enter_idle(struct worker *worker)
1563 */ 1229 */
1564static void worker_leave_idle(struct worker *worker) 1230static void worker_leave_idle(struct worker *worker)
1565{ 1231{
1566 struct worker_pool *pool = worker->pool; 1232 struct global_cwq *gcwq = worker->gcwq;
1567 1233
1568 BUG_ON(!(worker->flags & WORKER_IDLE)); 1234 BUG_ON(!(worker->flags & WORKER_IDLE));
1569 worker_clr_flags(worker, WORKER_IDLE); 1235 worker_clr_flags(worker, WORKER_IDLE);
1570 pool->nr_idle--; 1236 gcwq->nr_idle--;
1571 list_del_init(&worker->entry); 1237 list_del_init(&worker->entry);
1572} 1238}
1573 1239
@@ -1587,11 +1253,11 @@ static void worker_leave_idle(struct worker *worker)
1587 * verbatim as it's best effort and blocking and gcwq may be 1253 * verbatim as it's best effort and blocking and gcwq may be
1588 * [dis]associated in the meantime. 1254 * [dis]associated in the meantime.
1589 * 1255 *
1590 * This function tries set_cpus_allowed() and locks gcwq and verifies the 1256 * This function tries set_cpus_allowed() and locks gcwq and verifies
1591 * binding against %GCWQ_DISASSOCIATED which is set during 1257 * the binding against GCWQ_DISASSOCIATED which is set during
1592 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker 1258 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1593 * enters idle state or fetches works without dropping lock, it can 1259 * idle state or fetches works without dropping lock, it can guarantee
1594 * guarantee the scheduling requirement described in the first paragraph. 1260 * the scheduling requirement described in the first paragraph.
1595 * 1261 *
1596 * CONTEXT: 1262 * CONTEXT:
1597 * Might sleep. Called without any lock but returns with gcwq->lock 1263 * Might sleep. Called without any lock but returns with gcwq->lock
@@ -1604,7 +1270,7 @@ static void worker_leave_idle(struct worker *worker)
1604static bool worker_maybe_bind_and_lock(struct worker *worker) 1270static bool worker_maybe_bind_and_lock(struct worker *worker)
1605__acquires(&gcwq->lock) 1271__acquires(&gcwq->lock)
1606{ 1272{
1607 struct global_cwq *gcwq = worker->pool->gcwq; 1273 struct global_cwq *gcwq = worker->gcwq;
1608 struct task_struct *task = worker->task; 1274 struct task_struct *task = worker->task;
1609 1275
1610 while (true) { 1276 while (true) {
@@ -1638,117 +1304,22 @@ __acquires(&gcwq->lock)
1638} 1304}
1639 1305
1640/* 1306/*
1641 * Rebind an idle @worker to its CPU. worker_thread() will test 1307 * Function for worker->rebind_work used to rebind rogue busy workers
1642 * list_empty(@worker->entry) before leaving idle and call this function. 1308 * to the associated cpu which is coming back online. This is
1643 */ 1309 * scheduled by cpu up but can race with other cpu hotplug operations
1644static void idle_worker_rebind(struct worker *worker) 1310 * and may be executed twice without intervening cpu down.
1645{
1646 struct global_cwq *gcwq = worker->pool->gcwq;
1647
1648 /* CPU may go down again inbetween, clear UNBOUND only on success */
1649 if (worker_maybe_bind_and_lock(worker))
1650 worker_clr_flags(worker, WORKER_UNBOUND);
1651
1652 /* rebind complete, become available again */
1653 list_add(&worker->entry, &worker->pool->idle_list);
1654 spin_unlock_irq(&gcwq->lock);
1655}
1656
1657/*
1658 * Function for @worker->rebind.work used to rebind unbound busy workers to
1659 * the associated cpu which is coming back online. This is scheduled by
1660 * cpu up but can race with other cpu hotplug operations and may be
1661 * executed twice without intervening cpu down.
1662 */ 1311 */
1663static void busy_worker_rebind_fn(struct work_struct *work) 1312static void worker_rebind_fn(struct work_struct *work)
1664{ 1313{
1665 struct worker *worker = container_of(work, struct worker, rebind_work); 1314 struct worker *worker = container_of(work, struct worker, rebind_work);
1666 struct global_cwq *gcwq = worker->pool->gcwq; 1315 struct global_cwq *gcwq = worker->gcwq;
1667 1316
1668 if (worker_maybe_bind_and_lock(worker)) 1317 if (worker_maybe_bind_and_lock(worker))
1669 worker_clr_flags(worker, WORKER_UNBOUND); 1318 worker_clr_flags(worker, WORKER_REBIND);
1670 1319
1671 spin_unlock_irq(&gcwq->lock); 1320 spin_unlock_irq(&gcwq->lock);
1672} 1321}
1673 1322
1674/**
1675 * rebind_workers - rebind all workers of a gcwq to the associated CPU
1676 * @gcwq: gcwq of interest
1677 *
1678 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
1679 * is different for idle and busy ones.
1680 *
1681 * Idle ones will be removed from the idle_list and woken up. They will
1682 * add themselves back after completing rebind. This ensures that the
1683 * idle_list doesn't contain any unbound workers when re-bound busy workers
1684 * try to perform local wake-ups for concurrency management.
1685 *
1686 * Busy workers can rebind after they finish their current work items.
1687 * Queueing the rebind work item at the head of the scheduled list is
1688 * enough. Note that nr_running will be properly bumped as busy workers
1689 * rebind.
1690 *
1691 * On return, all non-manager workers are scheduled for rebind - see
1692 * manage_workers() for the manager special case. Any idle worker
1693 * including the manager will not appear on @idle_list until rebind is
1694 * complete, making local wake-ups safe.
1695 */
1696static void rebind_workers(struct global_cwq *gcwq)
1697{
1698 struct worker_pool *pool;
1699 struct worker *worker, *n;
1700 struct hlist_node *pos;
1701 int i;
1702
1703 lockdep_assert_held(&gcwq->lock);
1704
1705 for_each_worker_pool(pool, gcwq)
1706 lockdep_assert_held(&pool->assoc_mutex);
1707
1708 /* dequeue and kick idle ones */
1709 for_each_worker_pool(pool, gcwq) {
1710 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1711 /*
1712 * idle workers should be off @pool->idle_list
1713 * until rebind is complete to avoid receiving
1714 * premature local wake-ups.
1715 */
1716 list_del_init(&worker->entry);
1717
1718 /*
1719 * worker_thread() will see the above dequeuing
1720 * and call idle_worker_rebind().
1721 */
1722 wake_up_process(worker->task);
1723 }
1724 }
1725
1726 /* rebind busy workers */
1727 for_each_busy_worker(worker, i, pos, gcwq) {
1728 struct work_struct *rebind_work = &worker->rebind_work;
1729 struct workqueue_struct *wq;
1730
1731 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1732 work_data_bits(rebind_work)))
1733 continue;
1734
1735 debug_work_activate(rebind_work);
1736
1737 /*
1738 * wq doesn't really matter but let's keep @worker->pool
1739 * and @cwq->pool consistent for sanity.
1740 */
1741 if (worker_pool_pri(worker->pool))
1742 wq = system_highpri_wq;
1743 else
1744 wq = system_wq;
1745
1746 insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
1747 worker->scheduled.next,
1748 work_color_to_flags(WORK_NO_COLOR));
1749 }
1750}
1751
1752static struct worker *alloc_worker(void) 1323static struct worker *alloc_worker(void)
1753{ 1324{
1754 struct worker *worker; 1325 struct worker *worker;
@@ -1757,7 +1328,7 @@ static struct worker *alloc_worker(void)
1757 if (worker) { 1328 if (worker) {
1758 INIT_LIST_HEAD(&worker->entry); 1329 INIT_LIST_HEAD(&worker->entry);
1759 INIT_LIST_HEAD(&worker->scheduled); 1330 INIT_LIST_HEAD(&worker->scheduled);
1760 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); 1331 INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1761 /* on creation a worker is in !idle && prep state */ 1332 /* on creation a worker is in !idle && prep state */
1762 worker->flags = WORKER_PREP; 1333 worker->flags = WORKER_PREP;
1763 } 1334 }
@@ -1766,9 +1337,10 @@ static struct worker *alloc_worker(void)
1766 1337
1767/** 1338/**
1768 * create_worker - create a new workqueue worker 1339 * create_worker - create a new workqueue worker
1769 * @pool: pool the new worker will belong to 1340 * @gcwq: gcwq the new worker will belong to
1341 * @bind: whether to set affinity to @cpu or not
1770 * 1342 *
1771 * Create a new worker which is bound to @pool. The returned worker 1343 * Create a new worker which is bound to @gcwq. The returned worker
1772 * can be started by calling start_worker() or destroyed using 1344 * can be started by calling start_worker() or destroyed using
1773 * destroy_worker(). 1345 * destroy_worker().
1774 * 1346 *
@@ -1778,17 +1350,16 @@ static struct worker *alloc_worker(void)
1778 * RETURNS: 1350 * RETURNS:
1779 * Pointer to the newly created worker. 1351 * Pointer to the newly created worker.
1780 */ 1352 */
1781static struct worker *create_worker(struct worker_pool *pool) 1353static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1782{ 1354{
1783 struct global_cwq *gcwq = pool->gcwq; 1355 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1784 const char *pri = worker_pool_pri(pool) ? "H" : "";
1785 struct worker *worker = NULL; 1356 struct worker *worker = NULL;
1786 int id = -1; 1357 int id = -1;
1787 1358
1788 spin_lock_irq(&gcwq->lock); 1359 spin_lock_irq(&gcwq->lock);
1789 while (ida_get_new(&pool->worker_ida, &id)) { 1360 while (ida_get_new(&gcwq->worker_ida, &id)) {
1790 spin_unlock_irq(&gcwq->lock); 1361 spin_unlock_irq(&gcwq->lock);
1791 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) 1362 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1792 goto fail; 1363 goto fail;
1793 spin_lock_irq(&gcwq->lock); 1364 spin_lock_irq(&gcwq->lock);
1794 } 1365 }
@@ -1798,43 +1369,38 @@ static struct worker *create_worker(struct worker_pool *pool)
1798 if (!worker) 1369 if (!worker)
1799 goto fail; 1370 goto fail;
1800 1371
1801 worker->pool = pool; 1372 worker->gcwq = gcwq;
1802 worker->id = id; 1373 worker->id = id;
1803 1374
1804 if (gcwq->cpu != WORK_CPU_UNBOUND) 1375 if (!on_unbound_cpu)
1805 worker->task = kthread_create_on_node(worker_thread, 1376 worker->task = kthread_create_on_node(worker_thread,
1806 worker, cpu_to_node(gcwq->cpu), 1377 worker,
1807 "kworker/%u:%d%s", gcwq->cpu, id, pri); 1378 cpu_to_node(gcwq->cpu),
1379 "kworker/%u:%d", gcwq->cpu, id);
1808 else 1380 else
1809 worker->task = kthread_create(worker_thread, worker, 1381 worker->task = kthread_create(worker_thread, worker,
1810 "kworker/u:%d%s", id, pri); 1382 "kworker/u:%d", id);
1811 if (IS_ERR(worker->task)) 1383 if (IS_ERR(worker->task))
1812 goto fail; 1384 goto fail;
1813 1385
1814 if (worker_pool_pri(pool))
1815 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1816
1817 /* 1386 /*
1818 * Determine CPU binding of the new worker depending on 1387 * A rogue worker will become a regular one if CPU comes
1819 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the 1388 * online later on. Make sure every worker has
1820 * flag remains stable across this function. See the comments 1389 * PF_THREAD_BOUND set.
1821 * above the flag definition for details.
1822 *
1823 * As an unbound worker may later become a regular one if CPU comes
1824 * online, make sure every worker has %PF_THREAD_BOUND set.
1825 */ 1390 */
1826 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { 1391 if (bind && !on_unbound_cpu)
1827 kthread_bind(worker->task, gcwq->cpu); 1392 kthread_bind(worker->task, gcwq->cpu);
1828 } else { 1393 else {
1829 worker->task->flags |= PF_THREAD_BOUND; 1394 worker->task->flags |= PF_THREAD_BOUND;
1830 worker->flags |= WORKER_UNBOUND; 1395 if (on_unbound_cpu)
1396 worker->flags |= WORKER_UNBOUND;
1831 } 1397 }
1832 1398
1833 return worker; 1399 return worker;
1834fail: 1400fail:
1835 if (id >= 0) { 1401 if (id >= 0) {
1836 spin_lock_irq(&gcwq->lock); 1402 spin_lock_irq(&gcwq->lock);
1837 ida_remove(&pool->worker_ida, id); 1403 ida_remove(&gcwq->worker_ida, id);
1838 spin_unlock_irq(&gcwq->lock); 1404 spin_unlock_irq(&gcwq->lock);
1839 } 1405 }
1840 kfree(worker); 1406 kfree(worker);
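create_worker() above follows the common pattern for CPU-affine kernel threads: allocate the thread with kthread_create_on_node() so its stack comes from the target CPU's node, bind it with kthread_bind() before it first runs, then wake it. A minimal sketch of that pattern, independent of the workqueue code and with invented names, is shown below.

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/err.h>

static int example_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* park until stopped */
	return 0;
}

/* Create a thread whose stack is allocated on @cpu's node and which may only
 * run on @cpu.  Hypothetical helper, not part of this patch. */
static struct task_struct *example_create_bound_thread(int cpu)
{
	struct task_struct *task;

	task = kthread_create_on_node(example_thread_fn, NULL,
				      cpu_to_node(cpu), "example/%d", cpu);
	if (IS_ERR(task))
		return task;

	kthread_bind(task, cpu);	/* must happen before the first wakeup */
	wake_up_process(task);
	return task;
}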
@@ -1853,7 +1419,7 @@ fail:
1853static void start_worker(struct worker *worker) 1419static void start_worker(struct worker *worker)
1854{ 1420{
1855 worker->flags |= WORKER_STARTED; 1421 worker->flags |= WORKER_STARTED;
1856 worker->pool->nr_workers++; 1422 worker->gcwq->nr_workers++;
1857 worker_enter_idle(worker); 1423 worker_enter_idle(worker);
1858 wake_up_process(worker->task); 1424 wake_up_process(worker->task);
1859} 1425}
@@ -1869,8 +1435,7 @@ static void start_worker(struct worker *worker)
1869 */ 1435 */
1870static void destroy_worker(struct worker *worker) 1436static void destroy_worker(struct worker *worker)
1871{ 1437{
1872 struct worker_pool *pool = worker->pool; 1438 struct global_cwq *gcwq = worker->gcwq;
1873 struct global_cwq *gcwq = pool->gcwq;
1874 int id = worker->id; 1439 int id = worker->id;
1875 1440
1876 /* sanity check frenzy */ 1441 /* sanity check frenzy */
@@ -1878,9 +1443,9 @@ static void destroy_worker(struct worker *worker)
1878 BUG_ON(!list_empty(&worker->scheduled)); 1443 BUG_ON(!list_empty(&worker->scheduled));
1879 1444
1880 if (worker->flags & WORKER_STARTED) 1445 if (worker->flags & WORKER_STARTED)
1881 pool->nr_workers--; 1446 gcwq->nr_workers--;
1882 if (worker->flags & WORKER_IDLE) 1447 if (worker->flags & WORKER_IDLE)
1883 pool->nr_idle--; 1448 gcwq->nr_idle--;
1884 1449
1885 list_del_init(&worker->entry); 1450 list_del_init(&worker->entry);
1886 worker->flags |= WORKER_DIE; 1451 worker->flags |= WORKER_DIE;
@@ -1891,30 +1456,29 @@ static void destroy_worker(struct worker *worker)
1891 kfree(worker); 1456 kfree(worker);
1892 1457
1893 spin_lock_irq(&gcwq->lock); 1458 spin_lock_irq(&gcwq->lock);
1894 ida_remove(&pool->worker_ida, id); 1459 ida_remove(&gcwq->worker_ida, id);
1895} 1460}
1896 1461
1897static void idle_worker_timeout(unsigned long __pool) 1462static void idle_worker_timeout(unsigned long __gcwq)
1898{ 1463{
1899 struct worker_pool *pool = (void *)__pool; 1464 struct global_cwq *gcwq = (void *)__gcwq;
1900 struct global_cwq *gcwq = pool->gcwq;
1901 1465
1902 spin_lock_irq(&gcwq->lock); 1466 spin_lock_irq(&gcwq->lock);
1903 1467
1904 if (too_many_workers(pool)) { 1468 if (too_many_workers(gcwq)) {
1905 struct worker *worker; 1469 struct worker *worker;
1906 unsigned long expires; 1470 unsigned long expires;
1907 1471
1908 /* idle_list is kept in LIFO order, check the last one */ 1472 /* idle_list is kept in LIFO order, check the last one */
1909 worker = list_entry(pool->idle_list.prev, struct worker, entry); 1473 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1910 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1474 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1911 1475
1912 if (time_before(jiffies, expires)) 1476 if (time_before(jiffies, expires))
1913 mod_timer(&pool->idle_timer, expires); 1477 mod_timer(&gcwq->idle_timer, expires);
1914 else { 1478 else {
1915 /* it's been idle for too long, wake up manager */ 1479 /* it's been idle for too long, wake up manager */
1916 pool->flags |= POOL_MANAGE_WORKERS; 1480 gcwq->flags |= GCWQ_MANAGE_WORKERS;
1917 wake_up_worker(pool); 1481 wake_up_worker(gcwq);
1918 } 1482 }
1919 } 1483 }
1920 1484
@@ -1931,7 +1495,7 @@ static bool send_mayday(struct work_struct *work)
1931 return false; 1495 return false;
1932 1496
1933 /* mayday mayday mayday */ 1497 /* mayday mayday mayday */
1934 cpu = cwq->pool->gcwq->cpu; 1498 cpu = cwq->gcwq->cpu;
1935 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1499 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1936 if (cpu == WORK_CPU_UNBOUND) 1500 if (cpu == WORK_CPU_UNBOUND)
1937 cpu = 0; 1501 cpu = 0;
@@ -1940,38 +1504,37 @@ static bool send_mayday(struct work_struct *work)
1940 return true; 1504 return true;
1941} 1505}
1942 1506
1943static void gcwq_mayday_timeout(unsigned long __pool) 1507static void gcwq_mayday_timeout(unsigned long __gcwq)
1944{ 1508{
1945 struct worker_pool *pool = (void *)__pool; 1509 struct global_cwq *gcwq = (void *)__gcwq;
1946 struct global_cwq *gcwq = pool->gcwq;
1947 struct work_struct *work; 1510 struct work_struct *work;
1948 1511
1949 spin_lock_irq(&gcwq->lock); 1512 spin_lock_irq(&gcwq->lock);
1950 1513
1951 if (need_to_create_worker(pool)) { 1514 if (need_to_create_worker(gcwq)) {
1952 /* 1515 /*
1953 * We've been trying to create a new worker but 1516 * We've been trying to create a new worker but
1954 * haven't been successful. We might be hitting an 1517 * haven't been successful. We might be hitting an
1955 * allocation deadlock. Send distress signals to 1518 * allocation deadlock. Send distress signals to
1956 * rescuers. 1519 * rescuers.
1957 */ 1520 */
1958 list_for_each_entry(work, &pool->worklist, entry) 1521 list_for_each_entry(work, &gcwq->worklist, entry)
1959 send_mayday(work); 1522 send_mayday(work);
1960 } 1523 }
1961 1524
1962 spin_unlock_irq(&gcwq->lock); 1525 spin_unlock_irq(&gcwq->lock);
1963 1526
1964 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1527 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
1965} 1528}
1966 1529
1967/** 1530/**
1968 * maybe_create_worker - create a new worker if necessary 1531 * maybe_create_worker - create a new worker if necessary
1969 * @pool: pool to create a new worker for 1532 * @gcwq: gcwq to create a new worker for
1970 * 1533 *
1971 * Create a new worker for @pool if necessary. @pool is guaranteed to 1534 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
1972 * have at least one idle worker on return from this function. If 1535 * have at least one idle worker on return from this function. If
1973 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is 1536 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1974 * sent to all rescuers with works scheduled on @pool to resolve 1537 * sent to all rescuers with works scheduled on @gcwq to resolve
1975 * possible allocation deadlock. 1538 * possible allocation deadlock.
1976 * 1539 *
1977 * On return, need_to_create_worker() is guaranteed to be false and 1540 * On return, need_to_create_worker() is guaranteed to be false and
@@ -1986,54 +1549,52 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1986 * false if no action was taken and gcwq->lock stayed locked, true 1549 * false if no action was taken and gcwq->lock stayed locked, true
1987 * otherwise. 1550 * otherwise.
1988 */ 1551 */
1989static bool maybe_create_worker(struct worker_pool *pool) 1552static bool maybe_create_worker(struct global_cwq *gcwq)
1990__releases(&gcwq->lock) 1553__releases(&gcwq->lock)
1991__acquires(&gcwq->lock) 1554__acquires(&gcwq->lock)
1992{ 1555{
1993 struct global_cwq *gcwq = pool->gcwq; 1556 if (!need_to_create_worker(gcwq))
1994
1995 if (!need_to_create_worker(pool))
1996 return false; 1557 return false;
1997restart: 1558restart:
1998 spin_unlock_irq(&gcwq->lock); 1559 spin_unlock_irq(&gcwq->lock);
1999 1560
2000 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1561 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
2001 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1562 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
2002 1563
2003 while (true) { 1564 while (true) {
2004 struct worker *worker; 1565 struct worker *worker;
2005 1566
2006 worker = create_worker(pool); 1567 worker = create_worker(gcwq, true);
2007 if (worker) { 1568 if (worker) {
2008 del_timer_sync(&pool->mayday_timer); 1569 del_timer_sync(&gcwq->mayday_timer);
2009 spin_lock_irq(&gcwq->lock); 1570 spin_lock_irq(&gcwq->lock);
2010 start_worker(worker); 1571 start_worker(worker);
2011 BUG_ON(need_to_create_worker(pool)); 1572 BUG_ON(need_to_create_worker(gcwq));
2012 return true; 1573 return true;
2013 } 1574 }
2014 1575
2015 if (!need_to_create_worker(pool)) 1576 if (!need_to_create_worker(gcwq))
2016 break; 1577 break;
2017 1578
2018 __set_current_state(TASK_INTERRUPTIBLE); 1579 __set_current_state(TASK_INTERRUPTIBLE);
2019 schedule_timeout(CREATE_COOLDOWN); 1580 schedule_timeout(CREATE_COOLDOWN);
2020 1581
2021 if (!need_to_create_worker(pool)) 1582 if (!need_to_create_worker(gcwq))
2022 break; 1583 break;
2023 } 1584 }
2024 1585
2025 del_timer_sync(&pool->mayday_timer); 1586 del_timer_sync(&gcwq->mayday_timer);
2026 spin_lock_irq(&gcwq->lock); 1587 spin_lock_irq(&gcwq->lock);
2027 if (need_to_create_worker(pool)) 1588 if (need_to_create_worker(gcwq))
2028 goto restart; 1589 goto restart;
2029 return true; 1590 return true;
2030} 1591}
2031 1592
2032/** 1593/**
2033 * maybe_destroy_worker - destroy workers which have been idle for a while 1594 * maybe_destroy_worker - destroy workers which have been idle for a while
2034 * @pool: pool to destroy workers for 1595 * @gcwq: gcwq to destroy workers for
2035 * 1596 *
2036 * Destroy @pool workers which have been idle for longer than 1597 * Destroy @gcwq workers which have been idle for longer than
2037 * IDLE_WORKER_TIMEOUT. 1598 * IDLE_WORKER_TIMEOUT.
2038 * 1599 *
2039 * LOCKING: 1600 * LOCKING:
@@ -2044,19 +1605,19 @@ restart:
2044 * false if no action was taken and gcwq->lock stayed locked, true 1605 * false if no action was taken and gcwq->lock stayed locked, true
2045 * otherwise. 1606 * otherwise.
2046 */ 1607 */
2047static bool maybe_destroy_workers(struct worker_pool *pool) 1608static bool maybe_destroy_workers(struct global_cwq *gcwq)
2048{ 1609{
2049 bool ret = false; 1610 bool ret = false;
2050 1611
2051 while (too_many_workers(pool)) { 1612 while (too_many_workers(gcwq)) {
2052 struct worker *worker; 1613 struct worker *worker;
2053 unsigned long expires; 1614 unsigned long expires;
2054 1615
2055 worker = list_entry(pool->idle_list.prev, struct worker, entry); 1616 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
2056 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1617 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2057 1618
2058 if (time_before(jiffies, expires)) { 1619 if (time_before(jiffies, expires)) {
2059 mod_timer(&pool->idle_timer, expires); 1620 mod_timer(&gcwq->idle_timer, expires);
2060 break; 1621 break;
2061 } 1622 }
2062 1623
@@ -2089,63 +1650,137 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2089 */ 1650 */
2090static bool manage_workers(struct worker *worker) 1651static bool manage_workers(struct worker *worker)
2091{ 1652{
2092 struct worker_pool *pool = worker->pool; 1653 struct global_cwq *gcwq = worker->gcwq;
2093 bool ret = false; 1654 bool ret = false;
2094 1655
2095 if (pool->flags & POOL_MANAGING_WORKERS) 1656 if (gcwq->flags & GCWQ_MANAGING_WORKERS)
2096 return ret; 1657 return ret;
2097 1658
2098 pool->flags |= POOL_MANAGING_WORKERS; 1659 gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1660 gcwq->flags |= GCWQ_MANAGING_WORKERS;
2099 1661
2100 /* 1662 /*
2101 * To simplify both worker management and CPU hotplug, hold off 1663 * Destroy and then create so that may_start_working() is true
2102 * management while hotplug is in progress. CPU hotplug path can't 1664 * on return.
2103 * grab %POOL_MANAGING_WORKERS to achieve this because that can
2104 * lead to idle worker depletion (all become busy thinking someone
2105 * else is managing) which in turn can result in deadlock under
2106 * extreme circumstances. Use @pool->assoc_mutex to synchronize
2107 * manager against CPU hotplug.
2108 *
2109 * assoc_mutex would always be free unless CPU hotplug is in
2110 * progress. trylock first without dropping @gcwq->lock.
2111 */ 1665 */
2112 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { 1666 ret |= maybe_destroy_workers(gcwq);
2113 spin_unlock_irq(&pool->gcwq->lock); 1667 ret |= maybe_create_worker(gcwq);
2114 mutex_lock(&pool->assoc_mutex);
2115 /*
2116 * CPU hotplug could have happened while we were waiting
2117 * for assoc_mutex. Hotplug itself can't handle us
2118 * because the manager is on neither the idle nor the busy list, and
2119 * @gcwq's state and ours could have deviated.
2120 *
2121 * As hotplug is now excluded via assoc_mutex, we can
2122 * simply try to bind. It will succeed or fail depending
2123 * on @gcwq's current state. Try it and adjust
2124 * %WORKER_UNBOUND accordingly.
2125 */
2126 if (worker_maybe_bind_and_lock(worker))
2127 worker->flags &= ~WORKER_UNBOUND;
2128 else
2129 worker->flags |= WORKER_UNBOUND;
2130 1668
2131 ret = true; 1669 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
2132 }
2133
2134 pool->flags &= ~POOL_MANAGE_WORKERS;
2135 1670
2136 /* 1671 /*
2137 * Destroy and then create so that may_start_working() is true 1672 * The trustee might be waiting to take over the manager
2138 * on return. 1673 * position, tell it we're done.
2139 */ 1674 */
2140 ret |= maybe_destroy_workers(pool); 1675 if (unlikely(gcwq->trustee))
2141 ret |= maybe_create_worker(pool); 1676 wake_up_all(&gcwq->trustee_wait);
2142 1677
2143 pool->flags &= ~POOL_MANAGING_WORKERS;
2144 mutex_unlock(&pool->assoc_mutex);
2145 return ret; 1678 return ret;
2146} 1679}
2147 1680
2148/** 1681/**
1682 * move_linked_works - move linked works to a list
1683 * @work: start of series of works to be scheduled
1684 * @head: target list to append @work to
 1685 * @nextp: out parameter for nested worklist walking
1686 *
1687 * Schedule linked works starting from @work to @head. Work series to
1688 * be scheduled starts at @work and includes any consecutive work with
1689 * WORK_STRUCT_LINKED set in its predecessor.
1690 *
1691 * If @nextp is not NULL, it's updated to point to the next work of
1692 * the last scheduled work. This allows move_linked_works() to be
1693 * nested inside outer list_for_each_entry_safe().
1694 *
1695 * CONTEXT:
1696 * spin_lock_irq(gcwq->lock).
1697 */
1698static void move_linked_works(struct work_struct *work, struct list_head *head,
1699 struct work_struct **nextp)
1700{
1701 struct work_struct *n;
1702
1703 /*
1704 * Linked worklist will always end before the end of the list,
1705 * use NULL for list head.
1706 */
1707 list_for_each_entry_safe_from(work, n, NULL, entry) {
1708 list_move_tail(&work->entry, head);
1709 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1710 break;
1711 }
1712
1713 /*
1714 * If we're already inside safe list traversal and have moved
1715 * multiple works to the scheduled queue, the next position
1716 * needs to be updated.
1717 */
1718 if (nextp)
1719 *nextp = n;
1720}
1721
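move_linked_works() is built on the standard list-splicing idiom: walk from a starting entry with the _safe_from iterator and list_move_tail() each element onto the destination list. A generic sketch of the same idiom, using a hypothetical struct item in place of work_struct and an explicit linked flag in place of WORK_STRUCT_LINKED:

#include <linux/list.h>
#include <linux/types.h>

struct item {                           /* hypothetical element type */
        struct list_head entry;
        bool linked;                    /* stand-in for WORK_STRUCT_LINKED */
};

static LIST_HEAD(source);               /* hypothetical source list */
static LIST_HEAD(target);               /* hypothetical destination list */

/* Move @first and every immediately following "linked" item to target. */
static void move_run(struct item *first)
{
        struct item *n;

        list_for_each_entry_safe_from(first, n, &source, entry) {
                bool more = first->linked;

                list_move_tail(&first->entry, &target);
                if (!more)
                        break;
        }
}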
1722static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1723{
1724 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1725 struct work_struct, entry);
1726 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1727
1728 trace_workqueue_activate_work(work);
1729 move_linked_works(work, pos, NULL);
1730 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1731 cwq->nr_active++;
1732}
1733
1734/**
1735 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1736 * @cwq: cwq of interest
1737 * @color: color of work which left the queue
1738 * @delayed: for a delayed work
1739 *
 1740 * A work has either completed or been removed from the pending queue;
 1741 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1742 *
1743 * CONTEXT:
1744 * spin_lock_irq(gcwq->lock).
1745 */
1746static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1747 bool delayed)
1748{
1749 /* ignore uncolored works */
1750 if (color == WORK_NO_COLOR)
1751 return;
1752
1753 cwq->nr_in_flight[color]--;
1754
1755 if (!delayed) {
1756 cwq->nr_active--;
1757 if (!list_empty(&cwq->delayed_works)) {
1758 /* one down, submit a delayed one */
1759 if (cwq->nr_active < cwq->max_active)
1760 cwq_activate_first_delayed(cwq);
1761 }
1762 }
1763
1764 /* is flush in progress and are we at the flushing tip? */
1765 if (likely(cwq->flush_color != color))
1766 return;
1767
1768 /* are there still in-flight works? */
1769 if (cwq->nr_in_flight[color])
1770 return;
1771
1772 /* this cwq is done, clear flush_color */
1773 cwq->flush_color = -1;
1774
1775 /*
1776 * If this was the last cwq, wake up the first flusher. It
1777 * will handle the rest.
1778 */
1779 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1780 complete(&cwq->wq->first_flusher->done);
1781}
1782
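The flush bookkeeping above reduces to a familiar pattern: a counter of outstanding pieces plus a completion that the last finisher signals, which is exactly what the atomic_dec_and_test()/complete() pair at the end of cwq_dec_nr_in_flight() does for nr_cwqs_to_flush. A standalone sketch of that pattern with made-up names (parts_left, all_done):

#include <linux/atomic.h>
#include <linux/completion.h>

static atomic_t parts_left;                     /* hypothetical outstanding-part count */
static DECLARE_COMPLETION(all_done);            /* hypothetical flusher's completion */

/* Called once per part when it finishes; the last one wakes the waiter. */
static void part_finished(void)
{
        if (atomic_dec_and_test(&parts_left))
                complete(&all_done);
}

/* One-shot flusher: arm the count, start the parts (elided), then wait. */
static void flush_all_parts(int nr_parts)
{
        atomic_set(&parts_left, nr_parts);
        /* ... start nr_parts workers, each ending in part_finished() ... */
        wait_for_completion(&all_done);
}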
1783/**
2149 * process_one_work - process single work 1784 * process_one_work - process single work
2150 * @worker: self 1785 * @worker: self
2151 * @work: work to process 1786 * @work: work to process
@@ -2164,8 +1799,7 @@ __releases(&gcwq->lock)
2164__acquires(&gcwq->lock) 1799__acquires(&gcwq->lock)
2165{ 1800{
2166 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 1801 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
2167 struct worker_pool *pool = worker->pool; 1802 struct global_cwq *gcwq = cwq->gcwq;
2168 struct global_cwq *gcwq = pool->gcwq;
2169 struct hlist_head *bwh = busy_worker_head(gcwq, work); 1803 struct hlist_head *bwh = busy_worker_head(gcwq, work);
2170 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; 1804 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
2171 work_func_t f = work->func; 1805 work_func_t f = work->func;
@@ -2179,20 +1813,9 @@ __acquires(&gcwq->lock)
2179 * lock freed" warnings as well as problems when looking into 1813 * lock freed" warnings as well as problems when looking into
2180 * work->lockdep_map, make a copy and use that here. 1814 * work->lockdep_map, make a copy and use that here.
2181 */ 1815 */
2182 struct lockdep_map lockdep_map; 1816 struct lockdep_map lockdep_map = work->lockdep_map;
2183
2184 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
2185#endif 1817#endif
2186 /* 1818 /*
2187 * Ensure we're on the correct CPU. DISASSOCIATED test is
2188 * necessary to avoid spurious warnings from rescuers servicing the
2189 * unbound or a disassociated gcwq.
2190 */
2191 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2192 !(gcwq->flags & GCWQ_DISASSOCIATED) &&
2193 raw_smp_processor_id() != gcwq->cpu);
2194
2195 /*
2196 * A single work shouldn't be executed concurrently by 1819 * A single work shouldn't be executed concurrently by
2197 * multiple workers on a single cpu. Check whether anyone is 1820 * multiple workers on a single cpu. Check whether anyone is
2198 * already processing the work. If so, defer the work to the 1821 * already processing the work. If so, defer the work to the
@@ -2204,39 +1827,42 @@ __acquires(&gcwq->lock)
2204 return; 1827 return;
2205 } 1828 }
2206 1829
2207 /* claim and dequeue */ 1830 /* claim and process */
2208 debug_work_deactivate(work); 1831 debug_work_deactivate(work);
2209 hlist_add_head(&worker->hentry, bwh); 1832 hlist_add_head(&worker->hentry, bwh);
2210 worker->current_work = work; 1833 worker->current_work = work;
2211 worker->current_cwq = cwq; 1834 worker->current_cwq = cwq;
2212 work_color = get_work_color(work); 1835 work_color = get_work_color(work);
2213 1836
1837 /* record the current cpu number in the work data and dequeue */
1838 set_work_cpu(work, gcwq->cpu);
2214 list_del_init(&work->entry); 1839 list_del_init(&work->entry);
2215 1840
2216 /* 1841 /*
2217 * CPU intensive works don't participate in concurrency 1842 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
2218 * management. They're the scheduler's responsibility. 1843 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
2219 */ 1844 */
2220 if (unlikely(cpu_intensive)) 1845 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
2221 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 1846 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1847 struct work_struct, entry);
2222 1848
2223 /* 1849 if (!list_empty(&gcwq->worklist) &&
2224 * Unbound gcwq isn't concurrency managed and work items should be 1850 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
2225 * executed ASAP. Wake up another worker if necessary. 1851 wake_up_worker(gcwq);
2226 */ 1852 else
2227 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 1853 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
2228 wake_up_worker(pool); 1854 }
2229 1855
2230 /* 1856 /*
2231 * Record the last CPU and clear PENDING which should be the last 1857 * CPU intensive works don't participate in concurrency
2232 * update to @work. Also, do this inside @gcwq->lock so that 1858 * management. They're the scheduler's responsibility.
2233 * PENDING and queued state changes happen together while IRQ is
2234 * disabled.
2235 */ 1859 */
2236 set_work_cpu_and_clear_pending(work, gcwq->cpu); 1860 if (unlikely(cpu_intensive))
1861 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
2237 1862
2238 spin_unlock_irq(&gcwq->lock); 1863 spin_unlock_irq(&gcwq->lock);
2239 1864
1865 work_clear_pending(work);
2240 lock_map_acquire_read(&cwq->wq->lockdep_map); 1866 lock_map_acquire_read(&cwq->wq->lockdep_map);
2241 lock_map_acquire(&lockdep_map); 1867 lock_map_acquire(&lockdep_map);
2242 trace_workqueue_execute_start(work); 1868 trace_workqueue_execute_start(work);
@@ -2250,9 +1876,11 @@ __acquires(&gcwq->lock)
2250 lock_map_release(&cwq->wq->lockdep_map); 1876 lock_map_release(&cwq->wq->lockdep_map);
2251 1877
2252 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 1878 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2253 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" 1879 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
2254 " last function: %pf\n", 1880 "%s/0x%08x/%d\n",
2255 current->comm, preempt_count(), task_pid_nr(current), f); 1881 current->comm, preempt_count(), task_pid_nr(current));
1882 printk(KERN_ERR " last function: ");
1883 print_symbol("%s\n", (unsigned long)f);
2256 debug_show_held_locks(current); 1884 debug_show_held_locks(current);
2257 dump_stack(); 1885 dump_stack();
2258 } 1886 }
@@ -2267,7 +1895,7 @@ __acquires(&gcwq->lock)
2267 hlist_del_init(&worker->hentry); 1895 hlist_del_init(&worker->hentry);
2268 worker->current_work = NULL; 1896 worker->current_work = NULL;
2269 worker->current_cwq = NULL; 1897 worker->current_cwq = NULL;
2270 cwq_dec_nr_in_flight(cwq, work_color); 1898 cwq_dec_nr_in_flight(cwq, work_color, false);
2271} 1899}
2272 1900
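From a driver's point of view, process_one_work() simply invokes work->func(work) in process context, so handlers conventionally recover their own state with container_of(). A minimal, hypothetical example of that convention (struct my_dev and its functions are illustrative, not from this file):

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct my_dev {                         /* hypothetical driver context */
        int unit;
        struct work_struct work;
};

/* Runs as work->func from process_one_work(), in process context. */
static void my_dev_work_fn(struct work_struct *work)
{
        struct my_dev *dev = container_of(work, struct my_dev, work);

        /* sleeping is fine here; do the deferred part of the job */
        pr_info("deferred work for unit %d\n", dev->unit);
}

static void my_dev_init(struct my_dev *dev)
{
        INIT_WORK(&dev->work, my_dev_work_fn);
}

static void my_dev_kick(struct my_dev *dev)
{
        schedule_work(&dev->work);      /* eventually handled by process_one_work() */
}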
2273/** 1901/**
@@ -2304,37 +1932,28 @@ static void process_scheduled_works(struct worker *worker)
2304static int worker_thread(void *__worker) 1932static int worker_thread(void *__worker)
2305{ 1933{
2306 struct worker *worker = __worker; 1934 struct worker *worker = __worker;
2307 struct worker_pool *pool = worker->pool; 1935 struct global_cwq *gcwq = worker->gcwq;
2308 struct global_cwq *gcwq = pool->gcwq;
2309 1936
2310 /* tell the scheduler that this is a workqueue worker */ 1937 /* tell the scheduler that this is a workqueue worker */
2311 worker->task->flags |= PF_WQ_WORKER; 1938 worker->task->flags |= PF_WQ_WORKER;
2312woke_up: 1939woke_up:
2313 spin_lock_irq(&gcwq->lock); 1940 spin_lock_irq(&gcwq->lock);
2314 1941
2315 /* we are off idle list if destruction or rebind is requested */ 1942 /* DIE can be set only while we're idle, checking here is enough */
2316 if (unlikely(list_empty(&worker->entry))) { 1943 if (worker->flags & WORKER_DIE) {
2317 spin_unlock_irq(&gcwq->lock); 1944 spin_unlock_irq(&gcwq->lock);
2318 1945 worker->task->flags &= ~PF_WQ_WORKER;
2319 /* if DIE is set, destruction is requested */ 1946 return 0;
2320 if (worker->flags & WORKER_DIE) {
2321 worker->task->flags &= ~PF_WQ_WORKER;
2322 return 0;
2323 }
2324
2325 /* otherwise, rebind */
2326 idle_worker_rebind(worker);
2327 goto woke_up;
2328 } 1947 }
2329 1948
2330 worker_leave_idle(worker); 1949 worker_leave_idle(worker);
2331recheck: 1950recheck:
2332 /* no more worker necessary? */ 1951 /* no more worker necessary? */
2333 if (!need_more_worker(pool)) 1952 if (!need_more_worker(gcwq))
2334 goto sleep; 1953 goto sleep;
2335 1954
2336 /* do we need to manage? */ 1955 /* do we need to manage? */
2337 if (unlikely(!may_start_working(pool)) && manage_workers(worker)) 1956 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
2338 goto recheck; 1957 goto recheck;
2339 1958
2340 /* 1959 /*
@@ -2353,7 +1972,7 @@ recheck:
2353 1972
2354 do { 1973 do {
2355 struct work_struct *work = 1974 struct work_struct *work =
2356 list_first_entry(&pool->worklist, 1975 list_first_entry(&gcwq->worklist,
2357 struct work_struct, entry); 1976 struct work_struct, entry);
2358 1977
2359 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { 1978 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -2365,11 +1984,11 @@ recheck:
2365 move_linked_works(work, &worker->scheduled, NULL); 1984 move_linked_works(work, &worker->scheduled, NULL);
2366 process_scheduled_works(worker); 1985 process_scheduled_works(worker);
2367 } 1986 }
2368 } while (keep_working(pool)); 1987 } while (keep_working(gcwq));
2369 1988
2370 worker_set_flags(worker, WORKER_PREP, false); 1989 worker_set_flags(worker, WORKER_PREP, false);
2371sleep: 1990sleep:
2372 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) 1991 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
2373 goto recheck; 1992 goto recheck;
2374 1993
2375 /* 1994 /*
@@ -2417,10 +2036,8 @@ static int rescuer_thread(void *__wq)
2417repeat: 2036repeat:
2418 set_current_state(TASK_INTERRUPTIBLE); 2037 set_current_state(TASK_INTERRUPTIBLE);
2419 2038
2420 if (kthread_should_stop()) { 2039 if (kthread_should_stop())
2421 __set_current_state(TASK_RUNNING);
2422 return 0; 2040 return 0;
2423 }
2424 2041
2425 /* 2042 /*
2426 * See whether any cpu is asking for help. Unbounded 2043 * See whether any cpu is asking for help. Unbounded
@@ -2429,15 +2046,14 @@ repeat:
2429 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2046 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2430 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2047 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2431 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2048 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2432 struct worker_pool *pool = cwq->pool; 2049 struct global_cwq *gcwq = cwq->gcwq;
2433 struct global_cwq *gcwq = pool->gcwq;
2434 struct work_struct *work, *n; 2050 struct work_struct *work, *n;
2435 2051
2436 __set_current_state(TASK_RUNNING); 2052 __set_current_state(TASK_RUNNING);
2437 mayday_clear_cpu(cpu, wq->mayday_mask); 2053 mayday_clear_cpu(cpu, wq->mayday_mask);
2438 2054
2439 /* migrate to the target cpu if possible */ 2055 /* migrate to the target cpu if possible */
2440 rescuer->pool = pool; 2056 rescuer->gcwq = gcwq;
2441 worker_maybe_bind_and_lock(rescuer); 2057 worker_maybe_bind_and_lock(rescuer);
2442 2058
2443 /* 2059 /*
@@ -2445,7 +2061,7 @@ repeat:
2445 * process'em. 2061 * process'em.
2446 */ 2062 */
2447 BUG_ON(!list_empty(&rescuer->scheduled)); 2063 BUG_ON(!list_empty(&rescuer->scheduled));
2448 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2064 list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
2449 if (get_work_cwq(work) == cwq) 2065 if (get_work_cwq(work) == cwq)
2450 move_linked_works(work, scheduled, &n); 2066 move_linked_works(work, scheduled, &n);
2451 2067
@@ -2456,8 +2072,8 @@ repeat:
2456 * regular worker; otherwise, we end up with 0 concurrency 2072 * regular worker; otherwise, we end up with 0 concurrency
2457 * and stall the execution. 2073 * and stall the execution.
2458 */ 2074 */
2459 if (keep_working(pool)) 2075 if (keep_working(gcwq))
2460 wake_up_worker(pool); 2076 wake_up_worker(gcwq);
2461 2077
2462 spin_unlock_irq(&gcwq->lock); 2078 spin_unlock_irq(&gcwq->lock);
2463 } 2079 }
@@ -2582,7 +2198,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2582 2198
2583 for_each_cwq_cpu(cpu, wq) { 2199 for_each_cwq_cpu(cpu, wq) {
2584 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2200 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2585 struct global_cwq *gcwq = cwq->pool->gcwq; 2201 struct global_cwq *gcwq = cwq->gcwq;
2586 2202
2587 spin_lock_irq(&gcwq->lock); 2203 spin_lock_irq(&gcwq->lock);
2588 2204
@@ -2798,17 +2414,17 @@ reflush:
2798 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2799 bool drained; 2415 bool drained;
2800 2416
2801 spin_lock_irq(&cwq->pool->gcwq->lock); 2417 spin_lock_irq(&cwq->gcwq->lock);
2802 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2418 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2803 spin_unlock_irq(&cwq->pool->gcwq->lock); 2419 spin_unlock_irq(&cwq->gcwq->lock);
2804 2420
2805 if (drained) 2421 if (drained)
2806 continue; 2422 continue;
2807 2423
2808 if (++flush_cnt == 10 || 2424 if (++flush_cnt == 10 ||
2809 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 2425 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2810 pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", 2426 pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
2811 wq->name, flush_cnt); 2427 wq->name, flush_cnt);
2812 goto reflush; 2428 goto reflush;
2813 } 2429 }
2814 2430
@@ -2819,7 +2435,8 @@ reflush:
2819} 2435}
2820EXPORT_SYMBOL_GPL(drain_workqueue); 2436EXPORT_SYMBOL_GPL(drain_workqueue);
2821 2437
2822static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) 2438static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2439 bool wait_executing)
2823{ 2440{
2824 struct worker *worker = NULL; 2441 struct worker *worker = NULL;
2825 struct global_cwq *gcwq; 2442 struct global_cwq *gcwq;
@@ -2839,14 +2456,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2839 */ 2456 */
2840 smp_rmb(); 2457 smp_rmb();
2841 cwq = get_work_cwq(work); 2458 cwq = get_work_cwq(work);
2842 if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) 2459 if (unlikely(!cwq || gcwq != cwq->gcwq))
2843 goto already_gone; 2460 goto already_gone;
2844 } else { 2461 } else if (wait_executing) {
2845 worker = find_worker_executing_work(gcwq, work); 2462 worker = find_worker_executing_work(gcwq, work);
2846 if (!worker) 2463 if (!worker)
2847 goto already_gone; 2464 goto already_gone;
2848 cwq = worker->current_cwq; 2465 cwq = worker->current_cwq;
2849 } 2466 } else
2467 goto already_gone;
2850 2468
2851 insert_wq_barrier(cwq, barr, work, worker); 2469 insert_wq_barrier(cwq, barr, work, worker);
2852 spin_unlock_irq(&gcwq->lock); 2470 spin_unlock_irq(&gcwq->lock);
@@ -2873,8 +2491,15 @@ already_gone:
2873 * flush_work - wait for a work to finish executing the last queueing instance 2491 * flush_work - wait for a work to finish executing the last queueing instance
2874 * @work: the work to flush 2492 * @work: the work to flush
2875 * 2493 *
2876 * Wait until @work has finished execution. @work is guaranteed to be idle 2494 * Wait until @work has finished execution. This function considers
2877 * on return if it hasn't been requeued since flush started. 2495 * only the last queueing instance of @work. If @work has been
2496 * enqueued across different CPUs on a non-reentrant workqueue or on
2497 * multiple workqueues, @work might still be executing on return on
2498 * some of the CPUs from earlier queueing.
2499 *
2500 * If @work was queued only on a non-reentrant, ordered or unbound
2501 * workqueue, @work is guaranteed to be idle on return if it hasn't
2502 * been requeued since flush started.
2878 * 2503 *
2879 * RETURNS: 2504 * RETURNS:
2880 * %true if flush_work() waited for the work to finish execution, 2505 * %true if flush_work() waited for the work to finish execution,
@@ -2884,39 +2509,140 @@ bool flush_work(struct work_struct *work)
2884{ 2509{
2885 struct wq_barrier barr; 2510 struct wq_barrier barr;
2886 2511
2887 lock_map_acquire(&work->lockdep_map); 2512 if (start_flush_work(work, &barr, true)) {
2888 lock_map_release(&work->lockdep_map); 2513 wait_for_completion(&barr.done);
2514 destroy_work_on_stack(&barr.work);
2515 return true;
2516 } else
2517 return false;
2518}
2519EXPORT_SYMBOL_GPL(flush_work);
2520
2521static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2522{
2523 struct wq_barrier barr;
2524 struct worker *worker;
2889 2525
2890 if (start_flush_work(work, &barr)) { 2526 spin_lock_irq(&gcwq->lock);
2527
2528 worker = find_worker_executing_work(gcwq, work);
2529 if (unlikely(worker))
2530 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2531
2532 spin_unlock_irq(&gcwq->lock);
2533
2534 if (unlikely(worker)) {
2891 wait_for_completion(&barr.done); 2535 wait_for_completion(&barr.done);
2892 destroy_work_on_stack(&barr.work); 2536 destroy_work_on_stack(&barr.work);
2893 return true; 2537 return true;
2894 } else { 2538 } else
2895 return false; 2539 return false;
2540}
2541
2542static bool wait_on_work(struct work_struct *work)
2543{
2544 bool ret = false;
2545 int cpu;
2546
2547 might_sleep();
2548
2549 lock_map_acquire(&work->lockdep_map);
2550 lock_map_release(&work->lockdep_map);
2551
2552 for_each_gcwq_cpu(cpu)
2553 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2554 return ret;
2555}
2556
2557/**
2558 * flush_work_sync - wait until a work has finished execution
2559 * @work: the work to flush
2560 *
2561 * Wait until @work has finished execution. On return, it's
2562 * guaranteed that all queueing instances of @work which happened
2563 * before this function is called are finished. In other words, if
2564 * @work hasn't been requeued since this function was called, @work is
2565 * guaranteed to be idle on return.
2566 *
2567 * RETURNS:
2568 * %true if flush_work_sync() waited for the work to finish execution,
2569 * %false if it was already idle.
2570 */
2571bool flush_work_sync(struct work_struct *work)
2572{
2573 struct wq_barrier barr;
2574 bool pending, waited;
2575
2576 /* we'll wait for executions separately, queue barr only if pending */
2577 pending = start_flush_work(work, &barr, false);
2578
2579 /* wait for executions to finish */
2580 waited = wait_on_work(work);
2581
2582 /* wait for the pending one */
2583 if (pending) {
2584 wait_for_completion(&barr.done);
2585 destroy_work_on_stack(&barr.work);
2896 } 2586 }
2587
2588 return pending || waited;
2897} 2589}
2898EXPORT_SYMBOL_GPL(flush_work); 2590EXPORT_SYMBOL_GPL(flush_work_sync);
2899 2591
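In caller terms, the distinction documented above matters mostly at teardown: flush_work() waits only for the last queueing instance, flush_work_sync() also waits out executions from earlier queueings, and cancel_work_sync() unqueues and waits. A hedged sketch of how a driver might use them, with a hypothetical struct my_dev:

#include <linux/workqueue.h>

struct my_dev {                         /* hypothetical driver context */
        struct work_struct work;
};

static void my_dev_teardown(struct my_dev *dev)
{
        /*
         * New submissions are assumed to be stopped already.  Nothing may
         * be pending or running once @dev is freed; cancel_work_sync()
         * covers both the queued and the executing case.
         */
        cancel_work_sync(&dev->work);
}

static void my_dev_quiesce(struct my_dev *dev)
{
        /* let outstanding work run to completion instead of cancelling it */
        flush_work_sync(&dev->work);
}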
2900static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) 2592/*
2593 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2594 * so this work can't be re-armed in any way.
2595 */
2596static int try_to_grab_pending(struct work_struct *work)
2901{ 2597{
2902 unsigned long flags; 2598 struct global_cwq *gcwq;
2903 int ret; 2599 int ret = -1;
2904 2600
2905 do { 2601 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
2906 ret = try_to_grab_pending(work, is_dwork, &flags); 2602 return 0;
2603
2604 /*
2605 * The queueing is in progress, or it is already queued. Try to
2606 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
2607 */
2608 gcwq = get_work_gcwq(work);
2609 if (!gcwq)
2610 return ret;
2611
2612 spin_lock_irq(&gcwq->lock);
2613 if (!list_empty(&work->entry)) {
2907 /* 2614 /*
2908 * If someone else is canceling, wait for the same event it 2615 * This work is queued, but perhaps we locked the wrong gcwq.
2909 * would be waiting for before retrying. 2616 * In that case we must see the new value after rmb(), see
2617 * insert_work()->wmb().
2910 */ 2618 */
2911 if (unlikely(ret == -ENOENT)) 2619 smp_rmb();
2912 flush_work(work); 2620 if (gcwq == get_work_gcwq(work)) {
2913 } while (unlikely(ret < 0)); 2621 debug_work_deactivate(work);
2622 list_del_init(&work->entry);
2623 cwq_dec_nr_in_flight(get_work_cwq(work),
2624 get_work_color(work),
2625 *work_data_bits(work) & WORK_STRUCT_DELAYED);
2626 ret = 1;
2627 }
2628 }
2629 spin_unlock_irq(&gcwq->lock);
2914 2630
2915 /* tell other tasks trying to grab @work to back off */ 2631 return ret;
2916 mark_work_canceling(work); 2632}
2917 local_irq_restore(flags); 2633
2634static bool __cancel_work_timer(struct work_struct *work,
2635 struct timer_list* timer)
2636{
2637 int ret;
2638
2639 do {
2640 ret = (timer && likely(del_timer(timer)));
2641 if (!ret)
2642 ret = try_to_grab_pending(work);
2643 wait_on_work(work);
2644 } while (unlikely(ret < 0));
2918 2645
2919 flush_work(work);
2920 clear_work_data(work); 2646 clear_work_data(work);
2921 return ret; 2647 return ret;
2922} 2648}
@@ -2941,7 +2667,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
2941 */ 2667 */
2942bool cancel_work_sync(struct work_struct *work) 2668bool cancel_work_sync(struct work_struct *work)
2943{ 2669{
2944 return __cancel_work_timer(work, false); 2670 return __cancel_work_timer(work, NULL);
2945} 2671}
2946EXPORT_SYMBOL_GPL(cancel_work_sync); 2672EXPORT_SYMBOL_GPL(cancel_work_sync);
2947 2673
@@ -2959,44 +2685,33 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
2959 */ 2685 */
2960bool flush_delayed_work(struct delayed_work *dwork) 2686bool flush_delayed_work(struct delayed_work *dwork)
2961{ 2687{
2962 local_irq_disable();
2963 if (del_timer_sync(&dwork->timer)) 2688 if (del_timer_sync(&dwork->timer))
2964 __queue_work(dwork->cpu, 2689 __queue_work(raw_smp_processor_id(),
2965 get_work_cwq(&dwork->work)->wq, &dwork->work); 2690 get_work_cwq(&dwork->work)->wq, &dwork->work);
2966 local_irq_enable();
2967 return flush_work(&dwork->work); 2691 return flush_work(&dwork->work);
2968} 2692}
2969EXPORT_SYMBOL(flush_delayed_work); 2693EXPORT_SYMBOL(flush_delayed_work);
2970 2694
2971/** 2695/**
2972 * cancel_delayed_work - cancel a delayed work 2696 * flush_delayed_work_sync - wait for a dwork to finish
2973 * @dwork: delayed_work to cancel 2697 * @dwork: the delayed work to flush
2974 * 2698 *
2975 * Kill off a pending delayed_work. Returns %true if @dwork was pending 2699 * Delayed timer is cancelled and the pending work is queued for
2976 * and canceled; %false if wasn't pending. Note that the work callback 2700 * execution immediately. Other than timer handling, its behavior
2977 * function may still be running on return, unless it returns %true and the 2701 * is identical to flush_work_sync().
2978 * work doesn't re-arm itself. Explicitly flush or use
2979 * cancel_delayed_work_sync() to wait on it.
2980 * 2702 *
2981 * This function is safe to call from any context including IRQ handler. 2703 * RETURNS:
2704 * %true if flush_work_sync() waited for the work to finish execution,
2705 * %false if it was already idle.
2982 */ 2706 */
2983bool cancel_delayed_work(struct delayed_work *dwork) 2707bool flush_delayed_work_sync(struct delayed_work *dwork)
2984{ 2708{
2985 unsigned long flags; 2709 if (del_timer_sync(&dwork->timer))
2986 int ret; 2710 __queue_work(raw_smp_processor_id(),
2987 2711 get_work_cwq(&dwork->work)->wq, &dwork->work);
2988 do { 2712 return flush_work_sync(&dwork->work);
2989 ret = try_to_grab_pending(&dwork->work, true, &flags);
2990 } while (unlikely(ret == -EAGAIN));
2991
2992 if (unlikely(ret < 0))
2993 return false;
2994
2995 set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
2996 local_irq_restore(flags);
2997 return ret;
2998} 2713}
2999EXPORT_SYMBOL(cancel_delayed_work); 2714EXPORT_SYMBOL(flush_delayed_work_sync);
3000 2715
3001/** 2716/**
3002 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish 2717 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
@@ -3009,55 +2724,39 @@ EXPORT_SYMBOL(cancel_delayed_work);
3009 */ 2724 */
3010bool cancel_delayed_work_sync(struct delayed_work *dwork) 2725bool cancel_delayed_work_sync(struct delayed_work *dwork)
3011{ 2726{
3012 return __cancel_work_timer(&dwork->work, true); 2727 return __cancel_work_timer(&dwork->work, &dwork->timer);
3013} 2728}
3014EXPORT_SYMBOL(cancel_delayed_work_sync); 2729EXPORT_SYMBOL(cancel_delayed_work_sync);
3015 2730
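The delayed-work helpers are normally paired as arm, optionally re-arm from the handler, and stop with cancel_delayed_work_sync(), which also copes with a self-rearming handler. A self-contained sketch using hypothetical names (poll_dwork, POLL_PERIOD):

#include <linux/jiffies.h>
#include <linux/workqueue.h>

#define POLL_PERIOD     msecs_to_jiffies(500)   /* hypothetical period */

static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_dwork, poll_fn);

static void poll_fn(struct work_struct *work)
{
        /* ... poll the hardware (elided) ... */

        /* self-rearm; poll_stop() below breaks this loop */
        schedule_delayed_work(&poll_dwork, POLL_PERIOD);
}

static void poll_start(void)
{
        schedule_delayed_work(&poll_dwork, POLL_PERIOD);
}

static void poll_stop(void)
{
        /* kills the pending timer/work and waits if the handler is running */
        cancel_delayed_work_sync(&poll_dwork);
}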
3016/** 2731/**
3017 * schedule_work_on - put work task on a specific cpu
3018 * @cpu: cpu to put the work task on
3019 * @work: job to be done
3020 *
3021 * This puts a job on a specific cpu
3022 */
3023bool schedule_work_on(int cpu, struct work_struct *work)
3024{
3025 return queue_work_on(cpu, system_wq, work);
3026}
3027EXPORT_SYMBOL(schedule_work_on);
3028
3029/**
3030 * schedule_work - put work task in global workqueue 2732 * schedule_work - put work task in global workqueue
3031 * @work: job to be done 2733 * @work: job to be done
3032 * 2734 *
3033 * Returns %false if @work was already on the kernel-global workqueue and 2735 * Returns zero if @work was already on the kernel-global workqueue and
3034 * %true otherwise. 2736 * non-zero otherwise.
3035 * 2737 *
3036 * This puts a job in the kernel-global workqueue if it was not already 2738 * This puts a job in the kernel-global workqueue if it was not already
3037 * queued and leaves it in the same position on the kernel-global 2739 * queued and leaves it in the same position on the kernel-global
3038 * workqueue otherwise. 2740 * workqueue otherwise.
3039 */ 2741 */
3040bool schedule_work(struct work_struct *work) 2742int schedule_work(struct work_struct *work)
3041{ 2743{
3042 return queue_work(system_wq, work); 2744 return queue_work(system_wq, work);
3043} 2745}
3044EXPORT_SYMBOL(schedule_work); 2746EXPORT_SYMBOL(schedule_work);
3045 2747
3046/** 2748/*
3047 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 2749 * schedule_work_on - put work task on a specific cpu
3048 * @cpu: cpu to use 2750 * @cpu: cpu to put the work task on
3049 * @dwork: job to be done 2751 * @work: job to be done
3050 * @delay: number of jiffies to wait
3051 * 2752 *
3052 * After waiting for a given time this puts a job in the kernel-global 2753 * This puts a job on a specific cpu
3053 * workqueue on the specified CPU.
3054 */ 2754 */
3055bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, 2755int schedule_work_on(int cpu, struct work_struct *work)
3056 unsigned long delay)
3057{ 2756{
3058 return queue_delayed_work_on(cpu, system_wq, dwork, delay); 2757 return queue_work_on(cpu, system_wq, work);
3059} 2758}
3060EXPORT_SYMBOL(schedule_delayed_work_on); 2759EXPORT_SYMBOL(schedule_work_on);
3061 2760
3062/** 2761/**
3063 * schedule_delayed_work - put work task in global workqueue after delay 2762 * schedule_delayed_work - put work task in global workqueue after delay
@@ -3067,13 +2766,30 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
3067 * After waiting for a given time this puts a job in the kernel-global 2766 * After waiting for a given time this puts a job in the kernel-global
3068 * workqueue. 2767 * workqueue.
3069 */ 2768 */
3070bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) 2769int schedule_delayed_work(struct delayed_work *dwork,
2770 unsigned long delay)
3071{ 2771{
3072 return queue_delayed_work(system_wq, dwork, delay); 2772 return queue_delayed_work(system_wq, dwork, delay);
3073} 2773}
3074EXPORT_SYMBOL(schedule_delayed_work); 2774EXPORT_SYMBOL(schedule_delayed_work);
3075 2775
3076/** 2776/**
2777 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2778 * @cpu: cpu to use
2779 * @dwork: job to be done
2780 * @delay: number of jiffies to wait
2781 *
2782 * After waiting for a given time this puts a job in the kernel-global
2783 * workqueue on the specified CPU.
2784 */
2785int schedule_delayed_work_on(int cpu,
2786 struct delayed_work *dwork, unsigned long delay)
2787{
2788 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
2789}
2790EXPORT_SYMBOL(schedule_delayed_work_on);
2791
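As the wrapper bodies above show, the schedule_*() helpers simply target system_wq; queue_work() and queue_work_on() with an explicit workqueue are the general forms. A short equivalence sketch (my caller assumes the work_struct was already initialised with INIT_WORK()):

#include <linux/workqueue.h>

static void submit_on_global_wq(struct work_struct *w)
{
        schedule_work(w);               /* same as queue_work(system_wq, w) */
}

static void submit_on_cpu0(struct work_struct *w)
{
        schedule_work_on(0, w);         /* same as queue_work_on(0, system_wq, w) */
}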
2792/**
3077 * schedule_on_each_cpu - execute a function synchronously on each online CPU 2793 * schedule_on_each_cpu - execute a function synchronously on each online CPU
3078 * @func: the function to call 2794 * @func: the function to call
3079 * 2795 *
@@ -3181,8 +2897,13 @@ static int alloc_cwqs(struct workqueue_struct *wq)
3181 const size_t size = sizeof(struct cpu_workqueue_struct); 2897 const size_t size = sizeof(struct cpu_workqueue_struct);
3182 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 2898 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
3183 __alignof__(unsigned long long)); 2899 __alignof__(unsigned long long));
2900#ifdef CONFIG_SMP
2901 bool percpu = !(wq->flags & WQ_UNBOUND);
2902#else
2903 bool percpu = false;
2904#endif
3184 2905
3185 if (!(wq->flags & WQ_UNBOUND)) 2906 if (percpu)
3186 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 2907 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
3187 else { 2908 else {
3188 void *ptr; 2909 void *ptr;
@@ -3206,7 +2927,13 @@ static int alloc_cwqs(struct workqueue_struct *wq)
3206 2927
3207static void free_cwqs(struct workqueue_struct *wq) 2928static void free_cwqs(struct workqueue_struct *wq)
3208{ 2929{
3209 if (!(wq->flags & WQ_UNBOUND)) 2930#ifdef CONFIG_SMP
2931 bool percpu = !(wq->flags & WQ_UNBOUND);
2932#else
2933 bool percpu = false;
2934#endif
2935
2936 if (percpu)
3210 free_percpu(wq->cpu_wq.pcpu); 2937 free_percpu(wq->cpu_wq.pcpu);
3211 else if (wq->cpu_wq.single) { 2938 else if (wq->cpu_wq.single) {
3212 /* the pointer to free is stored right after the cwq */ 2939 /* the pointer to free is stored right after the cwq */
@@ -3220,35 +2947,21 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
3220 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; 2947 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
3221 2948
3222 if (max_active < 1 || max_active > lim) 2949 if (max_active < 1 || max_active > lim)
3223 pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", 2950 printk(KERN_WARNING "workqueue: max_active %d requested for %s "
3224 max_active, name, 1, lim); 2951 "is out of range, clamping between %d and %d\n",
2952 max_active, name, 1, lim);
3225 2953
3226 return clamp_val(max_active, 1, lim); 2954 return clamp_val(max_active, 1, lim);
3227} 2955}
3228 2956
3229struct workqueue_struct *__alloc_workqueue_key(const char *fmt, 2957struct workqueue_struct *__alloc_workqueue_key(const char *name,
3230 unsigned int flags, 2958 unsigned int flags,
3231 int max_active, 2959 int max_active,
3232 struct lock_class_key *key, 2960 struct lock_class_key *key,
3233 const char *lock_name, ...) 2961 const char *lock_name)
3234{ 2962{
3235 va_list args, args1;
3236 struct workqueue_struct *wq; 2963 struct workqueue_struct *wq;
3237 unsigned int cpu; 2964 unsigned int cpu;
3238 size_t namelen;
3239
3240 /* determine namelen, allocate wq and format name */
3241 va_start(args, lock_name);
3242 va_copy(args1, args);
3243 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
3244
3245 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
3246 if (!wq)
3247 goto err;
3248
3249 vsnprintf(wq->name, namelen, fmt, args1);
3250 va_end(args);
3251 va_end(args1);
3252 2965
3253 /* 2966 /*
3254 * Workqueues which may be used during memory reclaim should 2967 * Workqueues which may be used during memory reclaim should
@@ -3257,10 +2970,20 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3257 if (flags & WQ_MEM_RECLAIM) 2970 if (flags & WQ_MEM_RECLAIM)
3258 flags |= WQ_RESCUER; 2971 flags |= WQ_RESCUER;
3259 2972
2973 /*
2974 * Unbound workqueues aren't concurrency managed and should be
2975 * dispatched to workers immediately.
2976 */
2977 if (flags & WQ_UNBOUND)
2978 flags |= WQ_HIGHPRI;
2979
3260 max_active = max_active ?: WQ_DFL_ACTIVE; 2980 max_active = max_active ?: WQ_DFL_ACTIVE;
3261 max_active = wq_clamp_max_active(max_active, flags, wq->name); 2981 max_active = wq_clamp_max_active(max_active, flags, name);
2982
2983 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
2984 if (!wq)
2985 goto err;
3262 2986
3263 /* init wq */
3264 wq->flags = flags; 2987 wq->flags = flags;
3265 wq->saved_max_active = max_active; 2988 wq->saved_max_active = max_active;
3266 mutex_init(&wq->flush_mutex); 2989 mutex_init(&wq->flush_mutex);
@@ -3268,6 +2991,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3268 INIT_LIST_HEAD(&wq->flusher_queue); 2991 INIT_LIST_HEAD(&wq->flusher_queue);
3269 INIT_LIST_HEAD(&wq->flusher_overflow); 2992 INIT_LIST_HEAD(&wq->flusher_overflow);
3270 2993
2994 wq->name = name;
3271 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 2995 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3272 INIT_LIST_HEAD(&wq->list); 2996 INIT_LIST_HEAD(&wq->list);
3273 2997
@@ -3277,10 +3001,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3277 for_each_cwq_cpu(cpu, wq) { 3001 for_each_cwq_cpu(cpu, wq) {
3278 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3002 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3279 struct global_cwq *gcwq = get_gcwq(cpu); 3003 struct global_cwq *gcwq = get_gcwq(cpu);
3280 int pool_idx = (bool)(flags & WQ_HIGHPRI);
3281 3004
3282 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3005 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
3283 cwq->pool = &gcwq->pools[pool_idx]; 3006 cwq->gcwq = gcwq;
3284 cwq->wq = wq; 3007 cwq->wq = wq;
3285 cwq->flush_color = -1; 3008 cwq->flush_color = -1;
3286 cwq->max_active = max_active; 3009 cwq->max_active = max_active;
@@ -3297,8 +3020,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3297 if (!rescuer) 3020 if (!rescuer)
3298 goto err; 3021 goto err;
3299 3022
3300 rescuer->task = kthread_create(rescuer_thread, wq, "%s", 3023 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
3301 wq->name);
3302 if (IS_ERR(rescuer->task)) 3024 if (IS_ERR(rescuer->task))
3303 goto err; 3025 goto err;
3304 3026
@@ -3377,26 +3099,6 @@ void destroy_workqueue(struct workqueue_struct *wq)
3377EXPORT_SYMBOL_GPL(destroy_workqueue); 3099EXPORT_SYMBOL_GPL(destroy_workqueue);
3378 3100
3379/** 3101/**
3380 * cwq_set_max_active - adjust max_active of a cwq
3381 * @cwq: target cpu_workqueue_struct
3382 * @max_active: new max_active value.
3383 *
3384 * Set @cwq->max_active to @max_active and activate delayed works if
3385 * increased.
3386 *
3387 * CONTEXT:
3388 * spin_lock_irq(gcwq->lock).
3389 */
3390static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active)
3391{
3392 cwq->max_active = max_active;
3393
3394 while (!list_empty(&cwq->delayed_works) &&
3395 cwq->nr_active < cwq->max_active)
3396 cwq_activate_first_delayed(cwq);
3397}
3398
3399/**
3400 * workqueue_set_max_active - adjust max_active of a workqueue 3102 * workqueue_set_max_active - adjust max_active of a workqueue
3401 * @wq: target workqueue 3103 * @wq: target workqueue
3402 * @max_active: new max_active value. 3104 * @max_active: new max_active value.
@@ -3423,7 +3125,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3423 3125
3424 if (!(wq->flags & WQ_FREEZABLE) || 3126 if (!(wq->flags & WQ_FREEZABLE) ||
3425 !(gcwq->flags & GCWQ_FREEZING)) 3127 !(gcwq->flags & GCWQ_FREEZING))
3426 cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); 3128 get_cwq(gcwq->cpu, wq)->max_active = max_active;
3427 3129
3428 spin_unlock_irq(&gcwq->lock); 3130 spin_unlock_irq(&gcwq->lock);
3429 } 3131 }
@@ -3487,7 +3189,7 @@ unsigned int work_busy(struct work_struct *work)
3487 unsigned int ret = 0; 3189 unsigned int ret = 0;
3488 3190
3489 if (!gcwq) 3191 if (!gcwq)
3490 return 0; 3192 return false;
3491 3193
3492 spin_lock_irqsave(&gcwq->lock, flags); 3194 spin_lock_irqsave(&gcwq->lock, flags);
3493 3195
@@ -3512,159 +3214,386 @@ EXPORT_SYMBOL_GPL(work_busy);
3512 * gcwqs serve mix of short, long and very long running works making 3214 * gcwqs serve mix of short, long and very long running works making
3513 * blocked draining impractical. 3215 * blocked draining impractical.
3514 * 3216 *
3515 * This is solved by allowing a gcwq to be disassociated from the CPU 3217 * This is solved by allowing a gcwq to be detached from CPU, running
3516 * running as an unbound one and allowing it to be reattached later if the 3218 * it with unbound (rogue) workers and allowing it to be reattached
3517 * cpu comes back online. 3219 * later if the cpu comes back online. A separate thread is created
3220 * to govern a gcwq in such state and is called the trustee of the
3221 * gcwq.
3222 *
3223 * Trustee states and their descriptions.
3224 *
3225 * START Command state used on startup. On CPU_DOWN_PREPARE, a
3226 * new trustee is started with this state.
3227 *
3228 * IN_CHARGE Once started, trustee will enter this state after
3229 * assuming the manager role and making all existing
3230 * workers rogue. DOWN_PREPARE waits for trustee to
3231 * enter this state. After reaching IN_CHARGE, trustee
3232 * tries to execute the pending worklist until it's empty
3233 * and the state is set to BUTCHER, or the state is set
3234 * to RELEASE.
3235 *
3236 * BUTCHER Command state which is set by the cpu callback after
 3237 * the cpu has gone down. Once this state is set, the trustee
3238 * knows that there will be no new works on the worklist
3239 * and once the worklist is empty it can proceed to
3240 * killing idle workers.
3241 *
3242 * RELEASE Command state which is set by the cpu callback if the
3243 * cpu down has been canceled or it has come online
3244 * again. After recognizing this state, trustee stops
3245 * trying to drain or butcher and clears ROGUE, rebinds
3246 * all remaining workers back to the cpu and releases
3247 * manager role.
3248 *
3249 * DONE Trustee will enter this state after BUTCHER or RELEASE
3250 * is complete.
3251 *
3252 * trustee CPU draining
3253 * took over down complete
3254 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3255 * | | ^
3256 * | CPU is back online v return workers |
3257 * ----------------> RELEASE --------------
3518 */ 3258 */
3519 3259
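The trustee lifecycle in the comment above can be restated as a plain state machine. The sketch below is only a schematic of that diagram; the enum and transition function are illustrative and deliberately use SKETCH_* names rather than the in-tree TRUSTEE_* definitions.

#include <linux/types.h>

/* Schematic of the state diagram above; not the in-tree enum. */
enum trustee_state_sketch {
        SKETCH_START,           /* new trustee started on CPU_DOWN_PREPARE */
        SKETCH_IN_CHARGE,       /* manager role taken, existing workers made rogue */
        SKETCH_BUTCHER,         /* cpu is down: drain worklist, then kill idle workers */
        SKETCH_RELEASE,         /* cpu down cancelled or cpu back online: rebind workers */
        SKETCH_DONE,            /* reached after BUTCHER or RELEASE completes */
};

static enum trustee_state_sketch
sketch_next(enum trustee_state_sketch cur, bool cpu_back_online, bool drained)
{
        switch (cur) {
        case SKETCH_START:
                return SKETCH_IN_CHARGE;        /* trustee took over */
        case SKETCH_IN_CHARGE:
                return cpu_back_online ? SKETCH_RELEASE : SKETCH_BUTCHER;
        case SKETCH_BUTCHER:
                return drained ? SKETCH_DONE : SKETCH_BUTCHER;
        case SKETCH_RELEASE:
        default:
                return SKETCH_DONE;             /* workers returned */
        }
}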
3520/* claim manager positions of all pools */ 3260/**
3521static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) 3261 * trustee_wait_event_timeout - timed event wait for trustee
3522{ 3262 * @cond: condition to wait for
3523 struct worker_pool *pool; 3263 * @timeout: timeout in jiffies
3524 3264 *
3525 for_each_worker_pool(pool, gcwq) 3265 * wait_event_timeout() for trustee to use. Handles locking and
3526 mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); 3266 * checks for RELEASE request.
3527 spin_lock_irq(&gcwq->lock); 3267 *
3528} 3268 * CONTEXT:
3529 3269 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3530/* release manager positions */ 3270 * multiple times. To be used by trustee.
3531static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) 3271 *
3532{ 3272 * RETURNS:
3533 struct worker_pool *pool; 3273 * Positive indicating left time if @cond is satisfied, 0 if timed
3274 * out, -1 if canceled.
3275 */
3276#define trustee_wait_event_timeout(cond, timeout) ({ \
3277 long __ret = (timeout); \
3278 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3279 __ret) { \
3280 spin_unlock_irq(&gcwq->lock); \
3281 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3282 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3283 __ret); \
3284 spin_lock_irq(&gcwq->lock); \
3285 } \
3286 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3287})
3534 3288
3535 spin_unlock_irq(&gcwq->lock); 3289/**
3536 for_each_worker_pool(pool, gcwq) 3290 * trustee_wait_event - event wait for trustee
3537 mutex_unlock(&pool->assoc_mutex); 3291 * @cond: condition to wait for
3538} 3292 *
3293 * wait_event() for trustee to use. Automatically handles locking and
3294 * checks for CANCEL request.
3295 *
3296 * CONTEXT:
3297 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3298 * multiple times. To be used by trustee.
3299 *
3300 * RETURNS:
3301 * 0 if @cond is satisfied, -1 if canceled.
3302 */
3303#define trustee_wait_event(cond) ({ \
3304 long __ret1; \
3305 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3306 __ret1 < 0 ? -1 : 0; \
3307})
3539 3308
3540static void gcwq_unbind_fn(struct work_struct *work) 3309static int __cpuinit trustee_thread(void *__gcwq)
3541{ 3310{
3542 struct global_cwq *gcwq = get_gcwq(smp_processor_id()); 3311 struct global_cwq *gcwq = __gcwq;
3543 struct worker_pool *pool;
3544 struct worker *worker; 3312 struct worker *worker;
3313 struct work_struct *work;
3545 struct hlist_node *pos; 3314 struct hlist_node *pos;
3315 long rc;
3546 int i; 3316 int i;
3547 3317
3548 BUG_ON(gcwq->cpu != smp_processor_id()); 3318 BUG_ON(gcwq->cpu != smp_processor_id());
3549 3319
3550 gcwq_claim_assoc_and_lock(gcwq); 3320 spin_lock_irq(&gcwq->lock);
3551
3552 /* 3321 /*
3553 * We've claimed all manager positions. Make all workers unbound 3322 * Claim the manager position and make all workers rogue.
3554 * and set DISASSOCIATED. Before this, all workers except for the 3323 * Trustee must be bound to the target cpu and can't be
-	 * ones which are still executing works from before the last CPU
-	 * down must be on the cpu.  After this, they may become diasporas.
-	 */
-	for_each_worker_pool(pool, gcwq)
-		list_for_each_entry(worker, &pool->idle_list, entry)
-			worker->flags |= WORKER_UNBOUND;
-
-	for_each_busy_worker(worker, i, pos, gcwq)
-		worker->flags |= WORKER_UNBOUND;
-
-	gcwq->flags |= GCWQ_DISASSOCIATED;
-
-	gcwq_release_assoc_and_unlock(gcwq);
-
-	/*
-	 * Call schedule() so that we cross rq->lock and thus can guarantee
-	 * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
-	 * as scheduler callbacks may be invoked from other cpus.
-	 */
-	schedule();
-
-	/*
-	 * Sched callbacks are disabled now.  Zap nr_running.  After this,
-	 * nr_running stays zero and need_more_worker() and keep_working()
-	 * are always true as long as the worklist is not empty.  @gcwq now
-	 * behaves as unbound (in terms of concurrency management) gcwq
-	 * which is served by workers tied to the CPU.
-	 *
-	 * On return from this function, the current worker would trigger
-	 * unbound chain execution of pending work items if other workers
-	 * didn't already.
-	 */
-	for_each_worker_pool(pool, gcwq)
-		atomic_set(get_pool_nr_running(pool), 0);
-}
+	 * cancelled.
+	 */
+	BUG_ON(gcwq->cpu != smp_processor_id());
+	rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
+	BUG_ON(rc < 0);
+
+	gcwq->flags |= GCWQ_MANAGING_WORKERS;
+
+	list_for_each_entry(worker, &gcwq->idle_list, entry)
+		worker->flags |= WORKER_ROGUE;
+
+	for_each_busy_worker(worker, i, pos, gcwq)
+		worker->flags |= WORKER_ROGUE;
+
+	/*
+	 * Call schedule() so that we cross rq->lock and thus can
+	 * guarantee sched callbacks see the rogue flag.  This is
+	 * necessary as scheduler callbacks may be invoked from other
+	 * cpus.
+	 */
+	spin_unlock_irq(&gcwq->lock);
+	schedule();
+	spin_lock_irq(&gcwq->lock);
+
+	/*
+	 * Sched callbacks are disabled now.  Zap nr_running.  After
+	 * this, nr_running stays zero and need_more_worker() and
+	 * keep_working() are always true as long as the worklist is
+	 * not empty.
+	 */
+	atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
+
+	spin_unlock_irq(&gcwq->lock);
+	del_timer_sync(&gcwq->idle_timer);
+	spin_lock_irq(&gcwq->lock);
+
+	/*
+	 * We're now in charge.  Notify and proceed to drain.  We need
+	 * to keep the gcwq running during the whole CPU down
+	 * procedure as other cpu hotunplug callbacks may need to
+	 * flush currently running tasks.
+	 */
+	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
+	wake_up_all(&gcwq->trustee_wait);
+
+	/*
+	 * The original cpu is in the process of dying and may go away
+	 * anytime now.  When that happens, we and all workers would
+	 * be migrated to other cpus.  Try draining any left work.  We
+	 * want to get it over with ASAP - spam rescuers, wake up as
+	 * many idlers as necessary and create new ones till the
+	 * worklist is empty.  Note that if the gcwq is frozen, there
+	 * may be frozen works in freezable cwqs.  Don't declare
+	 * completion while frozen.
+	 */
+	while (gcwq->nr_workers != gcwq->nr_idle ||
+	       gcwq->flags & GCWQ_FREEZING ||
+	       gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
+		int nr_works = 0;
+
+		list_for_each_entry(work, &gcwq->worklist, entry) {
+			send_mayday(work);
+			nr_works++;
+		}
+
+		list_for_each_entry(worker, &gcwq->idle_list, entry) {
+			if (!nr_works--)
+				break;
+			wake_up_process(worker->task);
+		}
+
+		if (need_to_create_worker(gcwq)) {
+			spin_unlock_irq(&gcwq->lock);
+			worker = create_worker(gcwq, false);
+			spin_lock_irq(&gcwq->lock);
+			if (worker) {
+				worker->flags |= WORKER_ROGUE;
+				start_worker(worker);
+			}
+		}
+
+		/* give a breather */
+		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
+			break;
+	}
+
+	/*
+	 * Either all works have been scheduled and cpu is down, or
+	 * cpu down has already been canceled.  Wait for and butcher
+	 * all workers till we're canceled.
+	 */
+	do {
+		rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
+		while (!list_empty(&gcwq->idle_list))
+			destroy_worker(list_first_entry(&gcwq->idle_list,
+							struct worker, entry));
+	} while (gcwq->nr_workers && rc >= 0);
+
+	/*
+	 * At this point, either draining has completed and no worker
+	 * is left, or cpu down has been canceled or the cpu is being
+	 * brought back up.  There shouldn't be any idle one left.
+	 * Tell the remaining busy ones to rebind once it finishes the
+	 * currently scheduled works by scheduling the rebind_work.
+	 */
+	WARN_ON(!list_empty(&gcwq->idle_list));
+
+	for_each_busy_worker(worker, i, pos, gcwq) {
+		struct work_struct *rebind_work = &worker->rebind_work;
+
+		/*
+		 * Rebind_work may race with future cpu hotplug
+		 * operations.  Use a separate flag to mark that
+		 * rebinding is scheduled.
+		 */
+		worker->flags |= WORKER_REBIND;
+		worker->flags &= ~WORKER_ROGUE;
+
+		/* queue rebind_work, wq doesn't matter, use the default one */
+		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
+				     work_data_bits(rebind_work)))
+			continue;
+
+		debug_work_activate(rebind_work);
+		insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
+			    worker->scheduled.next,
+			    work_color_to_flags(WORK_NO_COLOR));
+	}
+
+	/* relinquish manager role */
+	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+
+	/* notify completion */
+	gcwq->trustee = NULL;
+	gcwq->trustee_state = TRUSTEE_DONE;
+	wake_up_all(&gcwq->trustee_wait);
+	spin_unlock_irq(&gcwq->lock);
+	return 0;
+}
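
The trustee restored above is an ordinary CPU-bound kernel thread: it is created ahead of the hot-unplug, pinned with kthread_bind() and only then woken. A minimal sketch of that kthread pattern, using hypothetical names (my_thread_fn, start_bound_thread) rather than the workqueue internals, could look like:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int my_thread_fn(void *data)
{
	/* per-cpu teardown or draining work would go here */
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static int start_bound_thread(void *data, unsigned int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(my_thread_fn, data, "my_thread/%u", cpu);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);
	kthread_bind(tsk, cpu);		/* bind before the first wakeup */
	wake_up_process(tsk);
	return 0;
}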
-
-/*
- * Workqueues should be brought up before normal priority CPU notifiers.
- * This will be registered high priority CPU notifier.
- */
-static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
-					       unsigned long action,
-					       void *hcpu)
-{
-	unsigned int cpu = (unsigned long)hcpu;
-	struct global_cwq *gcwq = get_gcwq(cpu);
-	struct worker_pool *pool;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_UP_PREPARE:
-		for_each_worker_pool(pool, gcwq) {
-			struct worker *worker;
-
-			if (pool->nr_workers)
-				continue;
-
-			worker = create_worker(pool);
-			if (!worker)
-				return NOTIFY_BAD;
-
-			spin_lock_irq(&gcwq->lock);
-			start_worker(worker);
-			spin_unlock_irq(&gcwq->lock);
-		}
-		break;
-
-	case CPU_DOWN_FAILED:
-	case CPU_ONLINE:
-		gcwq_claim_assoc_and_lock(gcwq);
-		gcwq->flags &= ~GCWQ_DISASSOCIATED;
-		rebind_workers(gcwq);
-		gcwq_release_assoc_and_unlock(gcwq);
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-/*
- * Workqueues should be brought down after normal priority CPU notifiers.
- * This will be registered as low priority CPU notifier.
- */
-static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
-						 unsigned long action,
-						 void *hcpu)
-{
-	unsigned int cpu = (unsigned long)hcpu;
-	struct work_struct unbind_work;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DOWN_PREPARE:
-		/* unbinding should happen on the local CPU */
-		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
-		queue_work_on(cpu, system_highpri_wq, &unbind_work);
-		flush_work(&unbind_work);
-		break;
-	}
-	return NOTIFY_OK;
-}
+
+/**
+ * wait_trustee_state - wait for trustee to enter the specified state
+ * @gcwq: gcwq the trustee of interest belongs to
+ * @state: target state to wait for
+ *
+ * Wait for the trustee to reach @state.  DONE is already matched.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  To be used by cpu_callback.
+ */
+static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
+__releases(&gcwq->lock)
+__acquires(&gcwq->lock)
+{
+	if (!(gcwq->trustee_state == state ||
+	      gcwq->trustee_state == TRUSTEE_DONE)) {
+		spin_unlock_irq(&gcwq->lock);
+		__wait_event(gcwq->trustee_wait,
+			     gcwq->trustee_state == state ||
+			     gcwq->trustee_state == TRUSTEE_DONE);
+		spin_lock_irq(&gcwq->lock);
+	}
+}
+
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
+					    unsigned long action,
+					    void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct global_cwq *gcwq = get_gcwq(cpu);
+	struct task_struct *new_trustee = NULL;
+	struct worker *uninitialized_var(new_worker);
+	unsigned long flags;
+
+	action &= ~CPU_TASKS_FROZEN;
+
+	switch (action) {
+	case CPU_DOWN_PREPARE:
+		new_trustee = kthread_create(trustee_thread, gcwq,
+					     "workqueue_trustee/%d\n", cpu);
+		if (IS_ERR(new_trustee))
+			return notifier_from_errno(PTR_ERR(new_trustee));
+		kthread_bind(new_trustee, cpu);
+		/* fall through */
+	case CPU_UP_PREPARE:
+		BUG_ON(gcwq->first_idle);
+		new_worker = create_worker(gcwq, false);
+		if (!new_worker) {
+			if (new_trustee)
+				kthread_stop(new_trustee);
+			return NOTIFY_BAD;
+		}
+	}
+
+	/* some are called w/ irq disabled, don't disturb irq status */
+	spin_lock_irqsave(&gcwq->lock, flags);
+
+	switch (action) {
+	case CPU_DOWN_PREPARE:
+		/* initialize trustee and tell it to acquire the gcwq */
+		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
+		gcwq->trustee = new_trustee;
+		gcwq->trustee_state = TRUSTEE_START;
+		wake_up_process(gcwq->trustee);
+		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
+		/* fall through */
+	case CPU_UP_PREPARE:
+		BUG_ON(gcwq->first_idle);
+		gcwq->first_idle = new_worker;
+		break;
+
+	case CPU_DYING:
+		/*
+		 * Before this, the trustee and all workers except for
+		 * the ones which are still executing works from
+		 * before the last CPU down must be on the cpu.  After
+		 * this, they'll all be diasporas.
+		 */
+		gcwq->flags |= GCWQ_DISASSOCIATED;
+		break;
+
+	case CPU_POST_DEAD:
+		gcwq->trustee_state = TRUSTEE_BUTCHER;
+		/* fall through */
+	case CPU_UP_CANCELED:
+		destroy_worker(gcwq->first_idle);
+		gcwq->first_idle = NULL;
+		break;
+
+	case CPU_DOWN_FAILED:
+	case CPU_ONLINE:
+		gcwq->flags &= ~GCWQ_DISASSOCIATED;
+		if (gcwq->trustee_state != TRUSTEE_DONE) {
+			gcwq->trustee_state = TRUSTEE_RELEASE;
+			wake_up_process(gcwq->trustee);
+			wait_trustee_state(gcwq, TRUSTEE_DONE);
+		}
+
+		/*
+		 * Trustee is done and there might be no worker left.
+		 * Put the first_idle in and request a real manager to
+		 * take a look.
+		 */
+		spin_unlock_irq(&gcwq->lock);
+		kthread_bind(gcwq->first_idle->task, cpu);
+		spin_lock_irq(&gcwq->lock);
+		gcwq->flags |= GCWQ_MANAGE_WORKERS;
+		start_worker(gcwq->first_idle);
+		gcwq->first_idle = NULL;
+		break;
+	}
+
+	spin_unlock_irqrestore(&gcwq->lock, flags);
+
+	return notifier_from_errno(0);
+}
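
Both sides of this change hang off the same pre-cpuhp-state hotplug mechanism: a notifier_block callback receives CPU_* actions, possibly with CPU_TASKS_FROZEN set. A bare-bones sketch of such a callback, with hypothetical names (my_cpu_callback, my_cpu_init) and no workqueue specifics, might be:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int my_cpu_callback(struct notifier_block *nfb,
			   unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		pr_debug("preparing cpu %u\n", cpu);	/* allocate per-cpu state */
		break;
	case CPU_DOWN_PREPARE:
		pr_debug("quiescing cpu %u\n", cpu);	/* stop using the cpu */
		break;
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		pr_debug("cpu %u usable again\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static int __init my_cpu_init(void)
{
	hotcpu_notifier(my_cpu_callback, 0);	/* 0 = default priority */
	return 0;
}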
 
 #ifdef CONFIG_SMP
 
 struct work_for_cpu {
-	struct work_struct work;
+	struct completion completion;
 	long (*fn)(void *);
 	void *arg;
 	long ret;
 };
 
-static void work_for_cpu_fn(struct work_struct *work)
+static int do_work_for_cpu(void *_wfc)
 {
-	struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
-
+	struct work_for_cpu *wfc = _wfc;
 	wfc->ret = wfc->fn(wfc->arg);
+	complete(&wfc->completion);
+	return 0;
 }
 
 /**
@@ -3679,11 +3608,19 @@ static void work_for_cpu_fn(struct work_struct *work)
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
-	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
+	struct task_struct *sub_thread;
+	struct work_for_cpu wfc = {
+		.completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
+		.fn = fn,
+		.arg = arg,
+	};
 
-	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
-	schedule_work_on(cpu, &wfc.work);
-	flush_work(&wfc.work);
+	sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
+	if (IS_ERR(sub_thread))
+		return PTR_ERR(sub_thread);
+	kthread_bind(sub_thread, cpu);
+	wake_up_process(sub_thread);
+	wait_for_completion(&wfc.completion);
 	return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
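
work_on_cpu() keeps the same interface on both sides of the change: it runs a function on the requested CPU from process context and sleeps until the long result is available (without CONFIG_SMP it simply calls the function directly). A hypothetical caller, with made-up names (square_on_cpu_fn, square_on_cpu), could look like:

#include <linux/workqueue.h>

static long square_on_cpu_fn(void *arg)
{
	long v = *(long *)arg;

	return v * v;			/* executes on the requested CPU */
}

static long square_on_cpu(unsigned int cpu, long v)
{
	/* may sleep; must not be called from atomic context */
	return work_on_cpu(cpu, square_on_cpu_fn, &v);
}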
@@ -3798,7 +3735,6 @@ void thaw_workqueues(void)
 
 	for_each_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
-		struct worker_pool *pool;
 		struct workqueue_struct *wq;
 
 		spin_lock_irq(&gcwq->lock);
@@ -3813,11 +3749,14 @@ void thaw_workqueues(void)
 				continue;
 
 			/* restore max_active and repopulate worklist */
-			cwq_set_max_active(cwq, wq->saved_max_active);
+			cwq->max_active = wq->saved_max_active;
+
+			while (!list_empty(&cwq->delayed_works) &&
+			       cwq->nr_active < cwq->max_active)
+				cwq_activate_first_delayed(cwq);
 		}
 
-		for_each_worker_pool(pool, gcwq)
-			wake_up_worker(pool);
+		wake_up_worker(gcwq);
 
 		spin_unlock_irq(&gcwq->lock);
 	}
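
thaw_workqueues() above re-activates work that was parked while the system was frozen; only workqueues created with WQ_FREEZABLE take part in that freeze/thaw cycle. A sketch of creating and using such a workqueue, with hypothetical names (my_wq, my_work_fn), could be:

#include <linux/workqueue.h>
#include <linux/errno.h>
#include <linux/init.h>

static void my_work_fn(struct work_struct *work)
{
	/* never runs while workqueues are frozen for suspend/hibernate */
}

static DECLARE_WORK(my_work, my_work_fn);
static struct workqueue_struct *my_wq;

static int __init my_wq_init(void)
{
	my_wq = alloc_workqueue("my_freezable", WQ_FREEZABLE, 0);
	if (!my_wq)
		return -ENOMEM;
	/* held back while frozen, re-activated by thaw_workqueues() */
	queue_work(my_wq, &my_work);
	return 0;
}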
@@ -3833,69 +3772,56 @@ static int __init init_workqueues(void)
 	unsigned int cpu;
 	int i;
 
-	/* make sure we have enough bits for OFFQ CPU number */
-	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
-		     WORK_CPU_LAST);
-
-	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
+	cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
 
 	/* initialize gcwqs */
 	for_each_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
-		struct worker_pool *pool;
 
 		spin_lock_init(&gcwq->lock);
+		INIT_LIST_HEAD(&gcwq->worklist);
 		gcwq->cpu = cpu;
 		gcwq->flags |= GCWQ_DISASSOCIATED;
 
+		INIT_LIST_HEAD(&gcwq->idle_list);
 		for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
 			INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
 
-		for_each_worker_pool(pool, gcwq) {
-			pool->gcwq = gcwq;
-			INIT_LIST_HEAD(&pool->worklist);
-			INIT_LIST_HEAD(&pool->idle_list);
-
-			init_timer_deferrable(&pool->idle_timer);
-			pool->idle_timer.function = idle_worker_timeout;
-			pool->idle_timer.data = (unsigned long)pool;
-
-			setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
-				    (unsigned long)pool);
-
-			mutex_init(&pool->assoc_mutex);
-			ida_init(&pool->worker_ida);
-		}
+		init_timer_deferrable(&gcwq->idle_timer);
+		gcwq->idle_timer.function = idle_worker_timeout;
+		gcwq->idle_timer.data = (unsigned long)gcwq;
+
+		setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
+			    (unsigned long)gcwq);
+
+		ida_init(&gcwq->worker_ida);
+
+		gcwq->trustee_state = TRUSTEE_DONE;
+		init_waitqueue_head(&gcwq->trustee_wait);
 	}
 
 	/* create the initial worker */
 	for_each_online_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
-		struct worker_pool *pool;
+		struct worker *worker;
 
 		if (cpu != WORK_CPU_UNBOUND)
 			gcwq->flags &= ~GCWQ_DISASSOCIATED;
-
-		for_each_worker_pool(pool, gcwq) {
-			struct worker *worker;
-
-			worker = create_worker(pool);
-			BUG_ON(!worker);
-			spin_lock_irq(&gcwq->lock);
-			start_worker(worker);
-			spin_unlock_irq(&gcwq->lock);
-		}
+		worker = create_worker(gcwq, true);
+		BUG_ON(!worker);
+		spin_lock_irq(&gcwq->lock);
+		start_worker(worker);
+		spin_unlock_irq(&gcwq->lock);
 	}
 
 	system_wq = alloc_workqueue("events", 0, 0);
-	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
 	system_long_wq = alloc_workqueue("events_long", 0, 0);
+	system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
 	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
 					    WQ_UNBOUND_MAX_ACTIVE);
 	system_freezable_wq = alloc_workqueue("events_freezable",
 					      WQ_FREEZABLE, 0);
-	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
+	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
 	       !system_unbound_wq || !system_freezable_wq);
 	return 0;
 }
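
init_workqueues() ends by creating the shared system workqueues that drivers queue onto. A sketch of typical use of one of them, with a hypothetical work item (heavy_work) and handler (heavy_fn):

#include <linux/workqueue.h>

static void heavy_fn(struct work_struct *work)
{
	/* long-running job: the unbound wq skips per-cpu concurrency management */
}

static DECLARE_WORK(heavy_work, heavy_fn);

static void kick_heavy_work(void)
{
	/* runs on whichever CPU has a free worker first */
	queue_work(system_unbound_wq, &heavy_work);
}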