author    Linus Torvalds <torvalds@linux-foundation.org>  2008-12-30 19:10:19 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2008-12-30 19:10:19 -0500
commit    5f34fe1cfc1bdd8b4711bbe37421fba4ed0d1ed4 (patch)
tree      85b21c8bb0e53005bd970d648ca093acfd0584a3
parent    eca1bf5b4fab56d2feb1572d34d59fcd92ea7df3 (diff)
parent    6638101c1124c19c8a65b1645e4ecd09e0572f3e (diff)
Merge branch 'core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (63 commits)
  stacktrace: provide save_stack_trace_tsk() weak alias
  rcu: provide RCU options on non-preempt architectures too
  printk: fix discarding message when recursion_bug
  futex: clean up futex_(un)lock_pi fault handling
  "Tree RCU": scalable classic RCU implementation
  futex: rename field in futex_q to clarify single waiter semantics
  x86/swiotlb: add default swiotlb_arch_range_needs_mapping
  x86/swiotlb: add default phys<->bus conversion
  x86: unify pci iommu setup and allow swiotlb to compile for 32 bit
  x86: add swiotlb allocation functions
  swiotlb: consolidate swiotlb info message printing
  swiotlb: support bouncing of HighMem pages
  swiotlb: factor out copy to/from device
  swiotlb: add arch hook to force mapping
  swiotlb: allow architectures to override phys<->bus<->phys conversions
  swiotlb: add comment where we handle the overflow of a dma mask on 32 bit
  rcu: fix rcutorture behavior during reboot
  resources: skip sanity check of busy resources
  swiotlb: move some definitions to header
  swiotlb: allow architectures to override swiotlb pool allocation
  ...

Fix up trivial conflicts in arch/x86/kernel/Makefile arch/x86/mm/init_32.c include/linux/hardirq.h as per Ingo's suggestions.
-rw-r--r-- Documentation/RCU/00-INDEX | 2
-rw-r--r-- Documentation/RCU/trace.txt | 413
-rw-r--r-- Documentation/lockstat.txt | 51
-rw-r--r-- arch/powerpc/platforms/pseries/rtasd.c | 4
-rw-r--r-- arch/um/include/asm/system.h | 14
-rw-r--r-- arch/x86/include/asm/dma-mapping.h | 2
-rw-r--r-- arch/x86/include/asm/iommu.h | 2
-rw-r--r-- arch/x86/include/asm/pci.h | 2
-rw-r--r-- arch/x86/include/asm/pci_64.h | 1
-rw-r--r-- arch/x86/include/asm/uaccess.h | 2
-rw-r--r-- arch/x86/include/asm/uaccess_32.h | 8
-rw-r--r-- arch/x86/include/asm/uaccess_64.h | 6
-rw-r--r-- arch/x86/kernel/Makefile | 3
-rw-r--r-- arch/x86/kernel/pci-dma.c | 13
-rw-r--r-- arch/x86/kernel/pci-swiotlb_64.c | 29
-rw-r--r-- arch/x86/lib/usercopy_32.c | 8
-rw-r--r-- arch/x86/lib/usercopy_64.c | 4
-rw-r--r-- arch/x86/mm/init_32.c | 3
-rw-r--r-- include/asm-generic/bug.h | 7
-rw-r--r-- include/linux/bottom_half.h | 1
-rw-r--r-- include/linux/debug_locks.h | 2
-rw-r--r-- include/linux/futex.h | 5
-rw-r--r-- include/linux/hardirq.h | 13
-rw-r--r-- include/linux/kernel.h | 11
-rw-r--r-- include/linux/lockdep.h | 43
-rw-r--r-- include/linux/mutex.h | 2
-rw-r--r-- include/linux/rcuclassic.h | 2
-rw-r--r-- include/linux/rcupdate.h | 10
-rw-r--r-- include/linux/rcutree.h | 329
-rw-r--r-- include/linux/swiotlb.h | 22
-rw-r--r-- include/linux/uaccess.h | 2
-rw-r--r-- init/Kconfig | 86
-rw-r--r-- kernel/Kconfig.preempt | 25
-rw-r--r-- kernel/Makefile | 6
-rw-r--r-- kernel/exit.c | 2
-rw-r--r-- kernel/extable.c | 16
-rw-r--r-- kernel/futex.c | 351
-rw-r--r-- kernel/irq/manage.c | 12
-rw-r--r-- kernel/lockdep.c | 60
-rw-r--r-- kernel/lockdep_proc.c | 28
-rw-r--r-- kernel/mutex.c | 10
-rw-r--r-- kernel/notifier.c | 8
-rw-r--r-- kernel/panic.c | 32
-rw-r--r-- kernel/posix-cpu-timers.c | 10
-rw-r--r-- kernel/printk.c | 2
-rw-r--r-- kernel/rcuclassic.c | 4
-rw-r--r-- kernel/rcupreempt.c | 10
-rw-r--r-- kernel/rcupreempt_trace.c | 10
-rw-r--r-- kernel/rcutorture.c | 66
-rw-r--r-- kernel/rcutree.c | 1535
-rw-r--r-- kernel/rcutree_trace.c | 271
-rw-r--r-- kernel/resource.c | 9
-rw-r--r-- kernel/sched.c | 3
-rw-r--r-- kernel/softirq.c | 19
-rw-r--r-- kernel/softlockup.c | 2
-rw-r--r-- kernel/stacktrace.c | 11
-rw-r--r-- kernel/sys.c | 2
-rw-r--r-- lib/Kconfig.debug | 31
-rw-r--r-- lib/debugobjects.c | 4
-rw-r--r-- lib/swiotlb.c | 255
-rw-r--r-- mm/memory.c | 15
61 files changed, 3424 insertions, 487 deletions
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index 461481dfb7c3..7dc0695a8f90 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -16,6 +16,8 @@ RTFP.txt
16 - List of RCU papers (bibliography) going back to 1980. 16 - List of RCU papers (bibliography) going back to 1980.
17torture.txt 17torture.txt
18 - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) 18 - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
19trace.txt
20 - CONFIG_RCU_TRACE debugfs files and formats
19UP.txt 21UP.txt
20 - RCU on Uniprocessor Systems 22 - RCU on Uniprocessor Systems
21whatisRCU.txt 23whatisRCU.txt
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
new file mode 100644
index 000000000000..068848240a8b
--- /dev/null
+++ b/Documentation/RCU/trace.txt
@@ -0,0 +1,413 @@
1CONFIG_RCU_TRACE debugfs Files and Formats
2
3
4The rcupreempt and rcutree implementations of RCU provide debugfs trace
5output that summarizes counters and state. This information is useful for
6debugging RCU itself, and can sometimes also help to debug abuses of RCU.
7Note that the rcuclassic implementation of RCU does not provide debugfs
8trace output.
9
10The following sections describe the debugfs files and formats for
11preemptable RCU (rcupreempt) and hierarchical RCU (rcutree).
12
13
14Preemptable RCU debugfs Files and Formats
15
16This implementation of RCU provides three debugfs files under the
17top-level directory RCU: rcu/rcuctrs (which displays the per-CPU
18counters used by preemptable RCU), rcu/rcugp (which displays grace-period
19counters), and rcu/rcustats (which displays internal counters for debugging RCU).
20
21The output of "cat rcu/rcuctrs" looks as follows:
22
23CPU last cur F M
24 0 5 -5 0 0
25 1 -1 0 0 0
26 2 0 1 0 0
27 3 0 1 0 0
28 4 0 1 0 0
29 5 0 1 0 0
30 6 0 2 0 0
31 7 0 -1 0 0
32 8 0 1 0 0
33ggp = 26226, state = waitzero
34
35The per-CPU fields are as follows:
36
37o "CPU" gives the CPU number. Offline CPUs are not displayed.
38
39o "last" gives the value of the counter that is being decremented
40 for the current grace period phase. In the example above,
41 the counters sum to 4, indicating that there are four
42 RCU read-side critical sections still running that started
43 before the last counter flip.
44
45o "cur" gives the value of the counter that is currently being
46 both incremented (by rcu_read_lock()) and decremented (by
47 rcu_read_unlock()). In the example above, the counters sum to
48 1, indicating that there is only one RCU read-side critical section
49 still running that started after the last counter flip.
50
51o "F" indicates whether RCU is waiting for this CPU to acknowledge
52 a counter flip. In the above example, RCU is not waiting on any,
53 which is consistent with the state being "waitzero" rather than
54 "waitack".
55
56o "M" indicates whether RCU is waiting for this CPU to execute a
57 memory barrier. In the above example, RCU is not waiting on any,
58 which is consistent with the state being "waitzero" rather than
59 "waitmb".
60
61o "ggp" is the global grace-period counter.
62
63o "state" is the RCU state, which can be one of the following:
64
65 o "idle": there is no grace period in progress.
66
67 o "waitack": RCU just incremented the global grace-period
68 counter, which has the effect of reversing the roles of
69 the "last" and "cur" counters above, and is waiting for
70 all the CPUs to acknowledge the flip. Once the flip has
71 been acknowledged, CPUs will no longer be incrementing
72 what are now the "last" counters, so that their sum will
73 decrease monotonically down to zero.
74
75 o "waitzero": RCU is waiting for the sum of the "last" counters
76 to decrease to zero.
77
78 o "waitmb": RCU is waiting for each CPU to execute a memory
79 barrier, which ensures that instructions from a given CPU's
80 last RCU read-side critical section cannot be reordered
81 with instructions following the memory-barrier instruction.
82
83The output of "cat rcu/rcugp" looks as follows:
84
85oldggp=48870 newggp=48873
86
87Note that reading from this file provokes a synchronize_rcu(). The
88"oldggp" value is that of "ggp" from rcu/rcuctrs above, taken before
89executing the synchronize_rcu(), and the "newggp" value is also the
90"ggp" value, but taken after the synchronize_rcu() command returns.
91
92
93The output of "cat rcu/rcustats" looks as follows:
94
95na=1337955 nl=40 wa=1337915 wl=44 da=1337871 dl=0 dr=1337871 di=1337871
961=50989 e1=6138 i1=49722 ie1=82 g1=49640 a1=315203 ae1=265563 a2=49640
97z1=1401244 ze1=1351605 z2=49639 m1=5661253 me1=5611614 m2=49639
98
99These are counters tracking internal preemptable-RCU events; however,
100some of them may be useful for debugging algorithms using RCU. In
101particular, the "nl", "wl", and "dl" values track the number of RCU
102callbacks in various states. The fields are as follows:
103
104o "na" is the total number of RCU callbacks that have been enqueued
105 since boot.
106
107o "nl" is the number of RCU callbacks waiting for the previous
108 grace period to end so that they can start waiting on the next
109 grace period.
110
111o "wa" is the total number of RCU callbacks that have started waiting
112 for a grace period since boot. "na" should be roughly equal to
113 "nl" plus "wa".
114
115o "wl" is the number of RCU callbacks currently waiting for their
116 grace period to end.
117
118o "da" is the total number of RCU callbacks whose grace periods
119 have completed since boot. "wa" should be roughly equal to
120 "wl" plus "da".
121
122o "dr" is the total number of RCU callbacks that have been removed
123 from the list of callbacks ready to invoke. "dr" should be roughly
124 equal to "da".
125
126o "di" is the total number of RCU callbacks that have been invoked
127 since boot. "di" should be roughly equal to "da", though some
128 early versions of preemptable RCU had a bug so that only the
129 last CPU's count of invocations was displayed, rather than the
130 sum of all CPUs' counts.
131
132o "1" is the number of calls to rcu_try_flip(). This should be
133 roughly equal to the sum of "e1", "i1", "a1", "z1", and "m1"
134 described below. In other words, the number of times that
135 the state machine is visited should be equal to the sum of the
136 number of times that each state is visited plus the number of
137 times that the state-machine lock acquisition failed.
138
139o "e1" is the number of times that rcu_try_flip() was unable to
140 acquire the fliplock.
141
142o "i1" is the number of calls to rcu_try_flip_idle().
143
144o "ie1" is the number of times rcu_try_flip_idle() exited early
145 due to the calling CPU having no work for RCU.
146
147o "g1" is the number of times that rcu_try_flip_idle() decided
148 to start a new grace period. "i1" should be roughly equal to
149 "ie1" plus "g1".
150
151o "a1" is the number of calls to rcu_try_flip_waitack().
152
153o "ae1" is the number of times that rcu_try_flip_waitack() found
154 that at least one CPU had not yet acknowledged the new grace period
155 (AKA "counter flip").
156
157o "a2" is the number of times rcu_try_flip_waitack() found that
158 all CPUs had acknowledged. "a1" should be roughly equal to
159 "ae1" plus "a2". (This particular output was collected on
160 a 128-CPU machine, hence the smaller-than-usual fraction of
161 calls to rcu_try_flip_waitack() finding all CPUs having already
162 acknowledged.)
163
164o "z1" is the number of calls to rcu_try_flip_waitzero().
165
166o "ze1" is the number of times that rcu_try_flip_waitzero() found
167 that not all of the old RCU read-side critical sections had
168 completed.
169
170o "z2" is the number of times that rcu_try_flip_waitzero() finds
171 the sum of the counters equal to zero, in other words, that
172 all of the old RCU read-side critical sections had completed.
173 The value of "z1" should be roughly equal to "ze1" plus
174 "z2".
175
176o "m1" is the number of calls to rcu_try_flip_waitmb().
177
178o "me1" is the number of times that rcu_try_flip_waitmb() finds
179 that at least one CPU has not yet executed a memory barrier.
180
181o "m2" is the number of times that rcu_try_flip_waitmb() finds that
182 all CPUs have executed a memory barrier.
183
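The additive relationships called out above ("na" roughly equal to "nl" plus "wa", and "wa" roughly equal to "wl" plus "da") can be spot-checked from userspace with a small program along these lines. This is a sketch, not part of the patch; it assumes debugfs is mounted at /sys/kernel/debug, a kernel built with preemptable RCU and CONFIG_RCU_TRACE, and it matches the field names shown above by naive substring search.

/* rcustats-check.c: sketch only; keys are matched by naive substring search */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long get_field(const char *buf, const char *key)
{
	char pat[32];
	const char *p;

	snprintf(pat, sizeof(pat), "%s=", key);
	p = strstr(buf, pat);
	return p ? strtol(p + strlen(pat), NULL, 10) : -1;
}

int main(void)
{
	static char buf[4096];
	FILE *fp = fopen("/sys/kernel/debug/rcu/rcustats", "r");
	size_t n;

	if (!fp) {
		perror("rcustats");
		return 1;
	}
	n = fread(buf, 1, sizeof(buf) - 1, fp);
	buf[n] = '\0';
	fclose(fp);

	/* The printed pairs should differ only by callbacks in flight. */
	printf("na=%ld nl+wa=%ld\n", get_field(buf, "na"),
	       get_field(buf, "nl") + get_field(buf, "wa"));
	printf("wa=%ld wl+da=%ld\n", get_field(buf, "wa"),
	       get_field(buf, "wl") + get_field(buf, "da"));
	return 0;
}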
184
185Hierarchical RCU debugfs Files and Formats
186
187This implementation of RCU provides three debugfs files under the
188top-level directory RCU: rcu/rcudata (which displays fields in struct
189rcu_data), rcu/rcugp (which displays grace-period counters), and
190rcu/rcuhier (which displays the struct rcu_node hierarchy).
191
192The output of "cat rcu/rcudata" looks as follows:
193
194rcu:
195 0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10
196 1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10
197 2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10
198 3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10
199 4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10
200 5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10
201 6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10
202 7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10
203rcu_bh:
204 0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10
205 1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10
206 2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10
207 3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10
208 4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
209 5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
210 6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
211 7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
212
213The first section lists the rcu_data structures for rcu, the second for
214rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system.
215The fields are as follows:
216
217o The number at the beginning of each line is the CPU number.
218 CPU numbers followed by an exclamation mark are offline,
219 but have been online at least once since boot. There will be
220 no output for CPUs that have never been online, which can be
221 a good thing in the surprisingly common case where NR_CPUS is
222 substantially larger than the number of actual CPUs.
223
224o "c" is the count of grace periods that this CPU believes have
225 completed. CPUs in dynticks idle mode may lag quite a ways
226 behind, for example, CPU 4 under "rcu" above, which has slept
227 through the past 25 RCU grace periods. It is not unusual to
228 see CPUs lagging by thousands of grace periods.
229
230o "g" is the count of grace periods that this CPU believes have
231 started. Again, CPUs in dynticks idle mode may lag behind.
232 If the "c" and "g" values are equal, this CPU has already
233 reported a quiescent state for the last RCU grace period that
234 it is aware of, otherwise, the CPU believes that it owes RCU a
235 quiescent state.
236
237o "pq" indicates that this CPU has passed through a quiescent state
238 for the current grace period. It is possible for "pq" to be
239 "1" and "c" different than "g", which indicates that although
240 the CPU has passed through a quiescent state, either (1) this
241 CPU has not yet reported that fact, (2) some other CPU has not
242 yet reported for this grace period, or (3) both.
243
244o "pqc" indicates which grace period the last-observed quiescent
245 state for this CPU corresponds to. This is important for handling
246 the race between CPU 0 reporting an extended dynticks-idle
247 quiescent state for CPU 1 and CPU 1 suddenly waking up and
248 reporting its own quiescent state. If CPU 1 was the last CPU
249 for the current grace period, then the CPU that loses this race
250 will attempt to incorrectly mark CPU 1 as having checked in for
251 the next grace period!
252
253o "qp" indicates that RCU still expects a quiescent state from
254 this CPU.
255
256o "rpfq" is the number of rcu_pending() calls on this CPU required
257 to induce this CPU to invoke force_quiescent_state().
258
259o "rp" is low-order four hex digits of the count of how many times
260 rcu_pending() has been invoked on this CPU.
261
262o "dt" is the current value of the dyntick counter that is incremented
263 when entering or leaving dynticks idle state, either by the
264 scheduler or by irq. The number after the "/" is the interrupt
265 nesting depth when in dyntick-idle state, or one greater than
266 the interrupt-nesting depth otherwise.
267
268 This field is displayed only for CONFIG_NO_HZ kernels.
269
270o "dn" is the current value of the dyntick counter that is incremented
271 when entering or leaving dynticks idle state via NMI. If both
272 the "dt" and "dn" values are even, then this CPU is in dynticks
273 idle mode and may be ignored by RCU. If either of these two
274 counters is odd, then RCU must be alert to the possibility of
275 an RCU read-side critical section running on this CPU.
276
277 This field is displayed only for CONFIG_NO_HZ kernels.
278
279o "df" is the number of times that some other CPU has forced a
280 quiescent state on behalf of this CPU due to this CPU being in
281 dynticks-idle state.
282
283 This field is displayed only for CONFIG_NO_HZ kernels.
284
285o "of" is the number of times that some other CPU has forced a
286 quiescent state on behalf of this CPU due to this CPU being
287 offline. In a perfect world, this might never happen, but it
288 turns out that offlining and onlining a CPU can take several grace
289 periods, and so there is likely to be an extended period of time
290 when RCU believes that the CPU is online when it really is not.
291 Please note that erring in the other direction (RCU believing a
292 CPU is offline when it is really alive and kicking) is a fatal
293 error, so it makes sense to err conservatively.
294
295o "ri" is the number of times that RCU has seen fit to send a
296 reschedule IPI to this CPU in order to get it to report a
297 quiescent state.
298
299o "ql" is the number of RCU callbacks currently residing on
300 this CPU. This is the total number of callbacks, regardless
301 of what state they are in (new, waiting for grace period to
302 start, waiting for grace period to end, ready to invoke).
303
304o "b" is the batch limit for this CPU. If more than this number
305 of RCU callbacks is ready to invoke, then the remainder will
306 be deferred.
307
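The even/odd rule given under "dt" and "dn" above reduces to a parity check on the two counters. A hypothetical helper, illustrative only and not taken from the kernel sources:

/* Per the description above: a CPU may be ignored by RCU only when both
 * of its dynticks counters are even, i.e. no irq- or NMI-initiated
 * transition is in progress on that CPU. */
static int cpu_in_dynticks_idle(long dt, long dn)
{
	return ((dt & 1) == 0) && ((dn & 1) == 0);
}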
308
309The output of "cat rcu/rcugp" looks as follows:
310
311rcu: completed=33062 gpnum=33063
312rcu_bh: completed=464 gpnum=464
313
314Again, this output is for both "rcu" and "rcu_bh". The fields are
315taken from the rcu_state structure, and are as follows:
316
317o "completed" is the number of grace periods that have completed.
318 It is comparable to the "c" field from rcu/rcudata in that a
319 CPU whose "c" field matches the value of "completed" is aware
320 that the corresponding RCU grace period has completed.
321
322o "gpnum" is the number of grace periods that have started. It is
323 comparable to the "g" field from rcu/rcudata in that a CPU
324 whose "g" field matches the value of "gpnum" is aware that the
325 corresponding RCU grace period has started.
326
327 If these two fields are equal (as they are for "rcu_bh" above),
328 then there is no grace period in progress, in other words, RCU
329 is idle. On the other hand, if the two fields differ (as they
330 do for "rcu" above), then an RCU grace period is in progress.
331
332
333The output of "cat rcu/rcuhier" looks as follows, with very long lines:
334
335c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
3361/1 0:127 ^0
3373/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3
3383/3f 0:5 ^0 2/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3
339rcu_bh:
340c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
3410/1 0:127 ^0
3420/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3
3430/3f 0:5 ^0 0/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3
344
345This is once again split into "rcu" and "rcu_bh" portions. The fields are
346as follows:
347
348o "c" is exactly the same as "completed" under rcu/rcugp.
349
350o "g" is exactly the same as "gpnum" under rcu/rcugp.
351
352o "s" is the "signaled" state that drives force_quiescent_state()'s
353 state machine.
354
355o "jfq" is the number of jiffies remaining for this grace period
356 before force_quiescent_state() is invoked to help push things
357 along. Note that CPUs in dyntick-idle mode throughout the grace
358 period will not report on their own, but rather must be checked by
359 some other CPU via force_quiescent_state().
360
361o "j" is the low-order four hex digits of the jiffies counter.
362 Yes, Paul did run into a number of problems that turned out to
363 be due to the jiffies counter no longer counting. Why do you ask?
364
365o "nfqs" is the number of calls to force_quiescent_state() since
366 boot.
367
368o "nfqsng" is the number of useless calls to force_quiescent_state(),
369 where there wasn't actually a grace period active. This can
370 happen due to races. The number in parentheses is the difference
371 between "nfqs" and "nfqsng", or the number of times that
372 force_quiescent_state() actually did some real work.
373
374o "fqlh" is the number of calls to force_quiescent_state() that
375 exited immediately (without even being counted in nfqs above)
376 due to contention on ->fqslock.
377
378o Each element of the form "1/1 0:127 ^0" represents one struct
379 rcu_node. Each line represents one level of the hierarchy, from
380 root to leaves. It is best to think of the rcu_data structures
381 as forming yet another level after the leaves. Note that there
382 might be either one, two, or three levels of rcu_node structures,
383 depending on the relationship between CONFIG_RCU_FANOUT and
384 CONFIG_NR_CPUS.
385
386 o The numbers separated by the "/" are the qsmask followed
387 by the qsmaskinit. The qsmask will have one bit
388 set for each entity in the next lower level that
389 has not yet checked in for the current grace period.
390 The qsmaskinit will have one bit for each entity that is
391 currently expected to check in during each grace period.
392 The value of qsmaskinit is assigned to that of qsmask
393 at the beginning of each grace period.
394
395 For example, for "rcu", the qsmask of the first entry
396 of the lowest level is 0x14, meaning that we are still
397 waiting for CPUs 2 and 4 to check in for the current
398 grace period.
399
400 o The numbers separated by the ":" are the range of CPUs
401 served by this struct rcu_node. This can be helpful
402 in working out how the hierarchy is wired together.
403
404 For example, the first entry at the lowest level shows
405 "0:5", indicating that it covers CPUs 0 through 5.
406
407 o The number after the "^" indicates the bit in the
408 next higher level rcu_node structure that this
409 rcu_node structure corresponds to.
410
411 For example, the first entry at the lowest level shows
412 "^0", indicating that it corresponds to bit zero in
413 the first entry at the middle level.
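The shape of the rcuhier output follows from the fanout: each rcu_node covers at most CONFIG_RCU_FANOUT children, so the number of levels grows with the logarithm of CONFIG_NR_CPUS. A rough illustrative helper (not the kernel's geometry code) that reproduces the three-level layout of the 128-CPU example above, which uses leaves of six CPUs:

/* Sketch: how many rcu_node levels are needed for nr_cpus CPUs at the
 * given fanout.  rcu_num_levels(128, 6) == 3, matching the three lines
 * of rcuhier output shown above. */
static int rcu_num_levels(long nr_cpus, long fanout)
{
	int levels = 1;
	long covered = fanout;	/* CPUs reachable with a root-only tree */

	while (covered < nr_cpus) {
		covered *= fanout;
		levels++;
	}
	return levels;
}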
diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt
index 4ba4664ce5c3..9cb9138f7a79 100644
--- a/Documentation/lockstat.txt
+++ b/Documentation/lockstat.txt
@@ -71,35 +71,50 @@ Look at the current lock statistics:
71 71
72# less /proc/lock_stat 72# less /proc/lock_stat
73 73
7401 lock_stat version 0.2 7401 lock_stat version 0.3
7502 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 7502 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
7603 class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total 7603 class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total
7704 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 7704 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
7805 7805
7906 &inode->i_data.tree_lock-W: 15 21657 0.18 1093295.30 11547131054.85 58 10415 0.16 87.51 6387.60 7906 &mm->mmap_sem-W: 233 538 18446744073708 22924.27 607243.51 1342 45806 1.71 8595.89 1180582.34
8007 &inode->i_data.tree_lock-R: 0 0 0.00 0.00 0.00 23302 231198 0.25 8.45 98023.38 8007 &mm->mmap_sem-R: 205 587 18446744073708 28403.36 731975.00 1940 412426 0.58 187825.45 6307502.88
8108 -------------------------- 8108 ---------------
8209 &inode->i_data.tree_lock 0 [<ffffffff8027c08f>] add_to_page_cache+0x5f/0x190 8209 &mm->mmap_sem 487 [<ffffffff8053491f>] do_page_fault+0x466/0x928
8310 8310 &mm->mmap_sem 179 [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
8411 ............................................................................................................................................................................................... 8411 &mm->mmap_sem 279 [<ffffffff80210a57>] sys_mmap+0x75/0xce
8512 8512 &mm->mmap_sem 76 [<ffffffff802a490b>] sys_munmap+0x32/0x59
8613 dcache_lock: 1037 1161 0.38 45.32 774.51 6611 243371 0.15 306.48 77387.24 8613 ---------------
8714 ----------- 8714 &mm->mmap_sem 270 [<ffffffff80210a57>] sys_mmap+0x75/0xce
8815 dcache_lock 180 [<ffffffff802c0d7e>] sys_getcwd+0x11e/0x230 8815 &mm->mmap_sem 431 [<ffffffff8053491f>] do_page_fault+0x466/0x928
8916 dcache_lock 165 [<ffffffff802c002a>] d_alloc+0x15a/0x210 8916 &mm->mmap_sem 138 [<ffffffff802a490b>] sys_munmap+0x32/0x59
9017 dcache_lock 33 [<ffffffff8035818d>] _atomic_dec_and_lock+0x4d/0x70 9017 &mm->mmap_sem 145 [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
9118 dcache_lock 1 [<ffffffff802beef8>] shrink_dcache_parent+0x18/0x130 9118
9219 ...............................................................................................................................................................................................
9320
9421 dcache_lock: 621 623 0.52 118.26 1053.02 6745 91930 0.29 316.29 118423.41
9522 -----------
9623 dcache_lock 179 [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
9724 dcache_lock 113 [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
9825 dcache_lock 99 [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
9926 dcache_lock 104 [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
10027 -----------
10128 dcache_lock 192 [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
10229 dcache_lock 98 [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
10330 dcache_lock 72 [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
10431 dcache_lock 112 [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
92 105
93This excerpt shows the first two lock class statistics. Line 01 shows the 106This excerpt shows the first two lock class statistics. Line 01 shows the
94output version - each time the format changes this will be updated. Line 02-04 107output version - each time the format changes this will be updated. Line 02-04
95show the header with column descriptions. Lines 05-10 and 13-18 show the actual 108show the header with column descriptions. Lines 05-18 and 20-31 show the actual
96statistics. These statistics come in two parts; the actual stats separated by a 109statistics. These statistics come in two parts; the actual stats separated by a
97short separator (line 08, 14) from the contention points. 110short separator (line 08, 13) from the contention points.
98 111
99The first lock (05-10) is a read/write lock, and shows two lines above the 112The first lock (05-18) is a read/write lock, and shows two lines above the
100short separator. The contention points don't match the column descriptors, 113short separator. The contention points don't match the column descriptors,
101they have two: contentions and [<IP>] symbol. 114they have two: contentions and [<IP>] symbol. The second set of contention
115points are the points we're contending with.
102 116
117The integer part of the time values is in microseconds (us).
103 118
104View the top contending locks: 119View the top contending locks:
105 120
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
index f4e55be2eea9..afad9f5ac0ac 100644
--- a/arch/powerpc/platforms/pseries/rtasd.c
+++ b/arch/powerpc/platforms/pseries/rtasd.c
@@ -208,6 +208,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
208 break; 208 break;
209 case ERR_TYPE_KERNEL_PANIC: 209 case ERR_TYPE_KERNEL_PANIC:
210 default: 210 default:
211 WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
211 spin_unlock_irqrestore(&rtasd_log_lock, s); 212 spin_unlock_irqrestore(&rtasd_log_lock, s);
212 return; 213 return;
213 } 214 }
@@ -227,6 +228,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
227 /* Check to see if we need to or have stopped logging */ 228 /* Check to see if we need to or have stopped logging */
228 if (fatal || !logging_enabled) { 229 if (fatal || !logging_enabled) {
229 logging_enabled = 0; 230 logging_enabled = 0;
231 WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
230 spin_unlock_irqrestore(&rtasd_log_lock, s); 232 spin_unlock_irqrestore(&rtasd_log_lock, s);
231 return; 233 return;
232 } 234 }
@@ -249,11 +251,13 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
249 else 251 else
250 rtas_log_start += 1; 252 rtas_log_start += 1;
251 253
254 WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
252 spin_unlock_irqrestore(&rtasd_log_lock, s); 255 spin_unlock_irqrestore(&rtasd_log_lock, s);
253 wake_up_interruptible(&rtas_log_wait); 256 wake_up_interruptible(&rtas_log_wait);
254 break; 257 break;
255 case ERR_TYPE_KERNEL_PANIC: 258 case ERR_TYPE_KERNEL_PANIC:
256 default: 259 default:
260 WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
257 spin_unlock_irqrestore(&rtasd_log_lock, s); 261 spin_unlock_irqrestore(&rtasd_log_lock, s);
258 return; 262 return;
259 } 263 }
diff --git a/arch/um/include/asm/system.h b/arch/um/include/asm/system.h
index 753346e2cdfd..ae5f94d6317d 100644
--- a/arch/um/include/asm/system.h
+++ b/arch/um/include/asm/system.h
@@ -11,21 +11,21 @@ extern int get_signals(void);
11extern void block_signals(void); 11extern void block_signals(void);
12extern void unblock_signals(void); 12extern void unblock_signals(void);
13 13
14#define local_save_flags(flags) do { typecheck(unsigned long, flags); \ 14#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \
15 (flags) = get_signals(); } while(0) 15 (flags) = get_signals(); } while(0)
16#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \ 16#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \
17 set_signals(flags); } while(0) 17 set_signals(flags); } while(0)
18 18
19#define local_irq_save(flags) do { local_save_flags(flags); \ 19#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \
20 local_irq_disable(); } while(0) 20 raw_local_irq_disable(); } while(0)
21 21
22#define local_irq_enable() unblock_signals() 22#define raw_local_irq_enable() unblock_signals()
23#define local_irq_disable() block_signals() 23#define raw_local_irq_disable() block_signals()
24 24
25#define irqs_disabled() \ 25#define irqs_disabled() \
26({ \ 26({ \
27 unsigned long flags; \ 27 unsigned long flags; \
28 local_save_flags(flags); \ 28 raw_local_save_flags(flags); \
29 (flags == 0); \ 29 (flags == 0); \
30}) 30})
31 31
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index dc22c0733282..4035357f5b9d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -65,7 +65,7 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
65 return dma_ops; 65 return dma_ops;
66 else 66 else
67 return dev->archdata.dma_ops; 67 return dev->archdata.dma_ops;
68#endif /* _ASM_X86_DMA_MAPPING_H */ 68#endif
69} 69}
70 70
71/* Make sure we keep the same behaviour */ 71/* Make sure we keep the same behaviour */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 295b13193f4d..a6ee9e6f530f 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -7,8 +7,6 @@ extern struct dma_mapping_ops nommu_dma_ops;
7extern int force_iommu, no_iommu; 7extern int force_iommu, no_iommu;
8extern int iommu_detected; 8extern int iommu_detected;
9 9
10extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
11
12/* 10 seconds */ 10/* 10 seconds */
13#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) 11#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
14 12
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 647781298e7e..66834c41c049 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -84,6 +84,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
84static inline void early_quirks(void) { } 84static inline void early_quirks(void) { }
85#endif 85#endif
86 86
87extern void pci_iommu_alloc(void);
88
87#endif /* __KERNEL__ */ 89#endif /* __KERNEL__ */
88 90
89#ifdef CONFIG_X86_32 91#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index d02d936840a3..4da207982777 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -23,7 +23,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
23 int reg, int len, u32 value); 23 int reg, int len, u32 value);
24 24
25extern void dma32_reserve_bootmem(void); 25extern void dma32_reserve_bootmem(void);
26extern void pci_iommu_alloc(void);
27 26
28/* The PCI address space does equal the physical memory 27/* The PCI address space does equal the physical memory
29 * address space. The networking and block device layers use 28 * address space. The networking and block device layers use
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 580c3ee6c58c..4340055b7559 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -157,6 +157,7 @@ extern int __get_user_bad(void);
157 int __ret_gu; \ 157 int __ret_gu; \
158 unsigned long __val_gu; \ 158 unsigned long __val_gu; \
159 __chk_user_ptr(ptr); \ 159 __chk_user_ptr(ptr); \
160 might_fault(); \
160 switch (sizeof(*(ptr))) { \ 161 switch (sizeof(*(ptr))) { \
161 case 1: \ 162 case 1: \
162 __get_user_x(1, __ret_gu, __val_gu, ptr); \ 163 __get_user_x(1, __ret_gu, __val_gu, ptr); \
@@ -241,6 +242,7 @@ extern void __put_user_8(void);
241 int __ret_pu; \ 242 int __ret_pu; \
242 __typeof__(*(ptr)) __pu_val; \ 243 __typeof__(*(ptr)) __pu_val; \
243 __chk_user_ptr(ptr); \ 244 __chk_user_ptr(ptr); \
245 might_fault(); \
244 __pu_val = x; \ 246 __pu_val = x; \
245 switch (sizeof(*(ptr))) { \ 247 switch (sizeof(*(ptr))) { \
246 case 1: \ 248 case 1: \
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index d095a3aeea1b..5e06259e90e5 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
82static __always_inline unsigned long __must_check 82static __always_inline unsigned long __must_check
83__copy_to_user(void __user *to, const void *from, unsigned long n) 83__copy_to_user(void __user *to, const void *from, unsigned long n)
84{ 84{
85 might_sleep(); 85 might_fault();
86 return __copy_to_user_inatomic(to, from, n); 86 return __copy_to_user_inatomic(to, from, n);
87} 87}
88 88
89static __always_inline unsigned long 89static __always_inline unsigned long
@@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
137static __always_inline unsigned long 137static __always_inline unsigned long
138__copy_from_user(void *to, const void __user *from, unsigned long n) 138__copy_from_user(void *to, const void __user *from, unsigned long n)
139{ 139{
140 might_sleep(); 140 might_fault();
141 if (__builtin_constant_p(n)) { 141 if (__builtin_constant_p(n)) {
142 unsigned long ret; 142 unsigned long ret;
143 143
@@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
159static __always_inline unsigned long __copy_from_user_nocache(void *to, 159static __always_inline unsigned long __copy_from_user_nocache(void *to,
160 const void __user *from, unsigned long n) 160 const void __user *from, unsigned long n)
161{ 161{
162 might_sleep(); 162 might_fault();
163 if (__builtin_constant_p(n)) { 163 if (__builtin_constant_p(n)) {
164 unsigned long ret; 164 unsigned long ret;
165 165
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f8cfd00db450..84210c479fca 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -29,6 +29,8 @@ static __always_inline __must_check
29int __copy_from_user(void *dst, const void __user *src, unsigned size) 29int __copy_from_user(void *dst, const void __user *src, unsigned size)
30{ 30{
31 int ret = 0; 31 int ret = 0;
32
33 might_fault();
32 if (!__builtin_constant_p(size)) 34 if (!__builtin_constant_p(size))
33 return copy_user_generic(dst, (__force void *)src, size); 35 return copy_user_generic(dst, (__force void *)src, size);
34 switch (size) { 36 switch (size) {
@@ -71,6 +73,8 @@ static __always_inline __must_check
71int __copy_to_user(void __user *dst, const void *src, unsigned size) 73int __copy_to_user(void __user *dst, const void *src, unsigned size)
72{ 74{
73 int ret = 0; 75 int ret = 0;
76
77 might_fault();
74 if (!__builtin_constant_p(size)) 78 if (!__builtin_constant_p(size))
75 return copy_user_generic((__force void *)dst, src, size); 79 return copy_user_generic((__force void *)dst, src, size);
76 switch (size) { 80 switch (size) {
@@ -113,6 +117,8 @@ static __always_inline __must_check
113int __copy_in_user(void __user *dst, const void __user *src, unsigned size) 117int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
114{ 118{
115 int ret = 0; 119 int ret = 0;
120
121 might_fault();
116 if (!__builtin_constant_p(size)) 122 if (!__builtin_constant_p(size))
117 return copy_user_generic((__force void *)dst, 123 return copy_user_generic((__force void *)dst,
118 (__force void *)src, size); 124 (__force void *)src, size);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 88dd768eab6d..d364df03c1d6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -109,6 +109,8 @@ obj-$(CONFIG_MICROCODE) += microcode.o
109 109
110obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 110obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
111 111
112obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
113
112### 114###
113# 64 bit specific files 115# 64 bit specific files
114ifeq ($(CONFIG_X86_64),y) 116ifeq ($(CONFIG_X86_64),y)
@@ -122,7 +124,6 @@ ifeq ($(CONFIG_X86_64),y)
122 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 124 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
123 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 125 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
124 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o 126 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
125 obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
126 127
127 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o 128 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
128endif 129endif
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 7a3dfceb90e4..19a1044a0cd9 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -101,11 +101,15 @@ static void __init dma32_free_bootmem(void)
101 dma32_bootmem_ptr = NULL; 101 dma32_bootmem_ptr = NULL;
102 dma32_bootmem_size = 0; 102 dma32_bootmem_size = 0;
103} 103}
104#endif
104 105
105void __init pci_iommu_alloc(void) 106void __init pci_iommu_alloc(void)
106{ 107{
108#ifdef CONFIG_X86_64
107 /* free the range so iommu could get some range less than 4G */ 109 /* free the range so iommu could get some range less than 4G */
108 dma32_free_bootmem(); 110 dma32_free_bootmem();
111#endif
112
109 /* 113 /*
110 * The order of these functions is important for 114 * The order of these functions is important for
111 * fall-back/fail-over reasons 115 * fall-back/fail-over reasons
@@ -121,15 +125,6 @@ void __init pci_iommu_alloc(void)
121 pci_swiotlb_init(); 125 pci_swiotlb_init();
122} 126}
123 127
124unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
125{
126 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
127
128 return size >> PAGE_SHIFT;
129}
130EXPORT_SYMBOL(iommu_nr_pages);
131#endif
132
133void *dma_generic_alloc_coherent(struct device *dev, size_t size, 128void *dma_generic_alloc_coherent(struct device *dev, size_t size,
134 dma_addr_t *dma_addr, gfp_t flag) 129 dma_addr_t *dma_addr, gfp_t flag)
135{ 130{
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 3c539d111abb..242c3440687f 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -3,6 +3,8 @@
3#include <linux/pci.h> 3#include <linux/pci.h>
4#include <linux/cache.h> 4#include <linux/cache.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/swiotlb.h>
7#include <linux/bootmem.h>
6#include <linux/dma-mapping.h> 8#include <linux/dma-mapping.h>
7 9
8#include <asm/iommu.h> 10#include <asm/iommu.h>
@@ -11,6 +13,31 @@
11 13
12int swiotlb __read_mostly; 14int swiotlb __read_mostly;
13 15
16void *swiotlb_alloc_boot(size_t size, unsigned long nslabs)
17{
18 return alloc_bootmem_low_pages(size);
19}
20
21void *swiotlb_alloc(unsigned order, unsigned long nslabs)
22{
23 return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
24}
25
26dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr)
27{
28 return paddr;
29}
30
31phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
32{
33 return baddr;
34}
35
36int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
37{
38 return 0;
39}
40
14static dma_addr_t 41static dma_addr_t
15swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, 42swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
16 int direction) 43 int direction)
@@ -50,8 +77,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
50void __init pci_swiotlb_init(void) 77void __init pci_swiotlb_init(void)
51{ 78{
52 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 79 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
80#ifdef CONFIG_X86_64
53 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) 81 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
54 swiotlb = 1; 82 swiotlb = 1;
83#endif
55 if (swiotlb_force) 84 if (swiotlb_force)
56 swiotlb = 1; 85 swiotlb = 1;
57 if (swiotlb) { 86 if (swiotlb) {
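The x86 conversions added above are identity mappings; the point of making them overridable (see the "allow architectures to override phys<->bus<->phys conversions" commit in this series) is that another architecture can substitute its own translation. A hypothetical example with an invented constant offset, not taken from any real port:

dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr)
{
	return paddr + ARCH_BUS_OFFSET;		/* ARCH_BUS_OFFSET is made up */
}

phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
{
	return baddr - ARCH_BUS_OFFSET;
}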
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 9e68075544f6..4a20b2f9a381 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
39#define __do_strncpy_from_user(dst, src, count, res) \ 39#define __do_strncpy_from_user(dst, src, count, res) \
40do { \ 40do { \
41 int __d0, __d1, __d2; \ 41 int __d0, __d1, __d2; \
42 might_sleep(); \ 42 might_fault(); \
43 __asm__ __volatile__( \ 43 __asm__ __volatile__( \
44 " testl %1,%1\n" \ 44 " testl %1,%1\n" \
45 " jz 2f\n" \ 45 " jz 2f\n" \
@@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user);
126#define __do_clear_user(addr,size) \ 126#define __do_clear_user(addr,size) \
127do { \ 127do { \
128 int __d0; \ 128 int __d0; \
129 might_sleep(); \ 129 might_fault(); \
130 __asm__ __volatile__( \ 130 __asm__ __volatile__( \
131 "0: rep; stosl\n" \ 131 "0: rep; stosl\n" \
132 " movl %2,%0\n" \ 132 " movl %2,%0\n" \
@@ -155,7 +155,7 @@ do { \
155unsigned long 155unsigned long
156clear_user(void __user *to, unsigned long n) 156clear_user(void __user *to, unsigned long n)
157{ 157{
158 might_sleep(); 158 might_fault();
159 if (access_ok(VERIFY_WRITE, to, n)) 159 if (access_ok(VERIFY_WRITE, to, n))
160 __do_clear_user(to, n); 160 __do_clear_user(to, n);
161 return n; 161 return n;
@@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n)
197 unsigned long mask = -__addr_ok(s); 197 unsigned long mask = -__addr_ok(s);
198 unsigned long res, tmp; 198 unsigned long res, tmp;
199 199
200 might_sleep(); 200 might_fault();
201 201
202 __asm__ __volatile__( 202 __asm__ __volatile__(
203 " testl %0, %0\n" 203 " testl %0, %0\n"
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index f4df6e7c718b..64d6c84e6353 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -15,7 +15,7 @@
15#define __do_strncpy_from_user(dst,src,count,res) \ 15#define __do_strncpy_from_user(dst,src,count,res) \
16do { \ 16do { \
17 long __d0, __d1, __d2; \ 17 long __d0, __d1, __d2; \
18 might_sleep(); \ 18 might_fault(); \
19 __asm__ __volatile__( \ 19 __asm__ __volatile__( \
20 " testq %1,%1\n" \ 20 " testq %1,%1\n" \
21 " jz 2f\n" \ 21 " jz 2f\n" \
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);
64unsigned long __clear_user(void __user *addr, unsigned long size) 64unsigned long __clear_user(void __user *addr, unsigned long size)
65{ 65{
66 long __d0; 66 long __d0;
67 might_sleep(); 67 might_fault();
68 /* no memory constraint because it doesn't change any memory gcc knows 68 /* no memory constraint because it doesn't change any memory gcc knows
69 about */ 69 about */
70 asm volatile( 70 asm volatile(
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 800e1d94c1b5..8655b5bb0963 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -21,6 +21,7 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/pci.h>
24#include <linux/pfn.h> 25#include <linux/pfn.h>
25#include <linux/poison.h> 26#include <linux/poison.h>
26#include <linux/bootmem.h> 27#include <linux/bootmem.h>
@@ -967,6 +968,8 @@ void __init mem_init(void)
967 int codesize, reservedpages, datasize, initsize; 968 int codesize, reservedpages, datasize, initsize;
968 int tmp; 969 int tmp;
969 970
971 pci_iommu_alloc();
972
970#ifdef CONFIG_FLATMEM 973#ifdef CONFIG_FLATMEM
971 BUG_ON(!mem_map); 974 BUG_ON(!mem_map);
972#endif 975#endif
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 4c794d73fb84..8af276361bf2 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -41,15 +41,14 @@ struct bug_entry {
41 41
42#ifndef __WARN 42#ifndef __WARN
43#ifndef __ASSEMBLY__ 43#ifndef __ASSEMBLY__
44extern void warn_on_slowpath(const char *file, const int line);
45extern void warn_slowpath(const char *file, const int line, 44extern void warn_slowpath(const char *file, const int line,
46 const char *fmt, ...) __attribute__((format(printf, 3, 4))); 45 const char *fmt, ...) __attribute__((format(printf, 3, 4)));
47#define WANT_WARN_ON_SLOWPATH 46#define WANT_WARN_ON_SLOWPATH
48#endif 47#endif
49#define __WARN() warn_on_slowpath(__FILE__, __LINE__) 48#define __WARN() warn_slowpath(__FILE__, __LINE__, NULL)
50#define __WARN_printf(arg...) warn_slowpath(__FILE__, __LINE__, arg) 49#define __WARN_printf(arg...) warn_slowpath(__FILE__, __LINE__, arg)
51#else 50#else
52#define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0) 51#define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0)
53#endif 52#endif
54 53
55#ifndef WARN_ON 54#ifndef WARN_ON
diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index 777dbf695d44..27b1bcffe408 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -2,7 +2,6 @@
2#define _LINUX_BH_H 2#define _LINUX_BH_H
3 3
4extern void local_bh_disable(void); 4extern void local_bh_disable(void);
5extern void __local_bh_enable(void);
6extern void _local_bh_enable(void); 5extern void _local_bh_enable(void);
7extern void local_bh_enable(void); 6extern void local_bh_enable(void);
8extern void local_bh_enable_ip(unsigned long ip); 7extern void local_bh_enable_ip(unsigned long ip);
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 4aaa4afb1cb9..096476f1fb35 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -17,7 +17,7 @@ extern int debug_locks_off(void);
17({ \ 17({ \
18 int __ret = 0; \ 18 int __ret = 0; \
19 \ 19 \
20 if (unlikely(c)) { \ 20 if (!oops_in_progress && unlikely(c)) { \
21 if (debug_locks_off() && !debug_locks_silent) \ 21 if (debug_locks_off() && !debug_locks_silent) \
22 WARN_ON(1); \ 22 WARN_ON(1); \
23 __ret = 1; \ 23 __ret = 1; \
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 586ab56a3ec3..3bf5bb5a34f9 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -25,7 +25,8 @@ union ktime;
25#define FUTEX_WAKE_BITSET 10 25#define FUTEX_WAKE_BITSET 10
26 26
27#define FUTEX_PRIVATE_FLAG 128 27#define FUTEX_PRIVATE_FLAG 128
28#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG 28#define FUTEX_CLOCK_REALTIME 256
29#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
29 30
30#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) 31#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
31#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) 32#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
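With FUTEX_CLOCK_REALTIME folded into FUTEX_CMD_MASK, code that decodes the op argument strips both modifier bits before switching on the command. Roughly as below; the decode itself is outside this excerpt and the variable names are illustrative:

	int cmd = op & FUTEX_CMD_MASK;		/* command with modifier bits stripped */
	int is_private = op & FUTEX_PRIVATE_FLAG;	/* use per-process futex keys */
	int use_realtime = op & FUTEX_CLOCK_REALTIME;	/* timeout against CLOCK_REALTIME */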
@@ -164,6 +165,8 @@ union futex_key {
164 } both; 165 } both;
165}; 166};
166 167
168#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
169
167#ifdef CONFIG_FUTEX 170#ifdef CONFIG_FUTEX
168extern void exit_robust_list(struct task_struct *curr); 171extern void exit_robust_list(struct task_struct *curr);
169extern void exit_pi_state_list(struct task_struct *curr); 172extern void exit_pi_state_list(struct task_struct *curr);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 89a56d79e4c6..f83288347dda 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -119,13 +119,17 @@ static inline void account_system_vtime(struct task_struct *tsk)
119} 119}
120#endif 120#endif
121 121
122#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ) 122#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU)
123extern void rcu_irq_enter(void); 123extern void rcu_irq_enter(void);
124extern void rcu_irq_exit(void); 124extern void rcu_irq_exit(void);
125extern void rcu_nmi_enter(void);
126extern void rcu_nmi_exit(void);
125#else 127#else
126# define rcu_irq_enter() do { } while (0) 128# define rcu_irq_enter() do { } while (0)
127# define rcu_irq_exit() do { } while (0) 129# define rcu_irq_exit() do { } while (0)
128#endif /* CONFIG_PREEMPT_RCU */ 130# define rcu_nmi_enter() do { } while (0)
131# define rcu_nmi_exit() do { } while (0)
132#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */
129 133
130/* 134/*
131 * It is safe to do non-atomic ops on ->hardirq_context, 135 * It is safe to do non-atomic ops on ->hardirq_context,
@@ -135,7 +139,6 @@ extern void rcu_irq_exit(void);
135 */ 139 */
136#define __irq_enter() \ 140#define __irq_enter() \
137 do { \ 141 do { \
138 rcu_irq_enter(); \
139 account_system_vtime(current); \ 142 account_system_vtime(current); \
140 add_preempt_count(HARDIRQ_OFFSET); \ 143 add_preempt_count(HARDIRQ_OFFSET); \
141 trace_hardirq_enter(); \ 144 trace_hardirq_enter(); \
@@ -154,7 +157,6 @@ extern void irq_enter(void);
154 trace_hardirq_exit(); \ 157 trace_hardirq_exit(); \
155 account_system_vtime(current); \ 158 account_system_vtime(current); \
156 sub_preempt_count(HARDIRQ_OFFSET); \ 159 sub_preempt_count(HARDIRQ_OFFSET); \
157 rcu_irq_exit(); \
158 } while (0) 160 } while (0)
159 161
160/* 162/*
@@ -166,11 +168,14 @@ extern void irq_exit(void);
166 do { \ 168 do { \
167 ftrace_nmi_enter(); \ 169 ftrace_nmi_enter(); \
168 lockdep_off(); \ 170 lockdep_off(); \
171 rcu_nmi_enter(); \
169 __irq_enter(); \ 172 __irq_enter(); \
170 } while (0) 173 } while (0)
174
171#define nmi_exit() \ 175#define nmi_exit() \
172 do { \ 176 do { \
173 __irq_exit(); \ 177 __irq_exit(); \
178 rcu_nmi_exit(); \
174 lockdep_on(); \ 179 lockdep_on(); \
175 ftrace_nmi_exit(); \ 180 ftrace_nmi_exit(); \
176 } while (0) 181 } while (0)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 6002ae76785c..ca9ff6411dfa 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -141,6 +141,15 @@ extern int _cond_resched(void);
141 (__x < 0) ? -__x : __x; \ 141 (__x < 0) ? -__x : __x; \
142 }) 142 })
143 143
144#ifdef CONFIG_PROVE_LOCKING
145void might_fault(void);
146#else
147static inline void might_fault(void)
148{
149 might_sleep();
150}
151#endif
152
144extern struct atomic_notifier_head panic_notifier_list; 153extern struct atomic_notifier_head panic_notifier_list;
145extern long (*panic_blink)(long time); 154extern long (*panic_blink)(long time);
146NORET_TYPE void panic(const char * fmt, ...) 155NORET_TYPE void panic(const char * fmt, ...)
@@ -188,6 +197,8 @@ extern unsigned long long memparse(const char *ptr, char **retptr);
188extern int core_kernel_text(unsigned long addr); 197extern int core_kernel_text(unsigned long addr);
189extern int __kernel_text_address(unsigned long addr); 198extern int __kernel_text_address(unsigned long addr);
190extern int kernel_text_address(unsigned long addr); 199extern int kernel_text_address(unsigned long addr);
200extern int func_ptr_is_kernel_text(void *ptr);
201
191struct pid; 202struct pid;
192extern struct pid *session_of_pgrp(struct pid *pgrp); 203extern struct pid *session_of_pgrp(struct pid *pgrp);
193 204
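The mm/memory.c side of might_fault() is not shown in this excerpt; under CONFIG_PROVE_LOCKING the idea is to keep the might_sleep() check and additionally tell lockdep that a user access may take mmap_sem for reading. A hedged sketch (the in_atomic() guard and the exact annotation are assumptions here, not quoted from the patch):

void might_fault(void)
{
	might_sleep();
	/* A page fault on a user address can end up taking mmap_sem. */
	if (!in_atomic() && current->mm)
		might_lock_read(&current->mm->mmap_sem);
}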
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 29aec6e10020..37a0361f4685 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -73,6 +73,8 @@ struct lock_class_key {
73 struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; 73 struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES];
74}; 74};
75 75
76#define LOCKSTAT_POINTS 4
77
76/* 78/*
77 * The lock-class itself: 79 * The lock-class itself:
78 */ 80 */
@@ -119,7 +121,8 @@ struct lock_class {
119 int name_version; 121 int name_version;
120 122
121#ifdef CONFIG_LOCK_STAT 123#ifdef CONFIG_LOCK_STAT
122 unsigned long contention_point[4]; 124 unsigned long contention_point[LOCKSTAT_POINTS];
125 unsigned long contending_point[LOCKSTAT_POINTS];
123#endif 126#endif
124}; 127};
125 128
@@ -144,6 +147,7 @@ enum bounce_type {
144 147
145struct lock_class_stats { 148struct lock_class_stats {
146 unsigned long contention_point[4]; 149 unsigned long contention_point[4];
150 unsigned long contending_point[4];
147 struct lock_time read_waittime; 151 struct lock_time read_waittime;
148 struct lock_time write_waittime; 152 struct lock_time write_waittime;
149 struct lock_time read_holdtime; 153 struct lock_time read_holdtime;
@@ -165,6 +169,7 @@ struct lockdep_map {
165 const char *name; 169 const char *name;
166#ifdef CONFIG_LOCK_STAT 170#ifdef CONFIG_LOCK_STAT
167 int cpu; 171 int cpu;
172 unsigned long ip;
168#endif 173#endif
169}; 174};
170 175
@@ -309,8 +314,15 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
309extern void lock_release(struct lockdep_map *lock, int nested, 314extern void lock_release(struct lockdep_map *lock, int nested,
310 unsigned long ip); 315 unsigned long ip);
311 316
312extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, 317extern void lock_set_class(struct lockdep_map *lock, const char *name,
313 unsigned long ip); 318 struct lock_class_key *key, unsigned int subclass,
319 unsigned long ip);
320
321static inline void lock_set_subclass(struct lockdep_map *lock,
322 unsigned int subclass, unsigned long ip)
323{
324 lock_set_class(lock, lock->name, lock->key, subclass, ip);
325}
314 326
315# define INIT_LOCKDEP .lockdep_recursion = 0, 327# define INIT_LOCKDEP .lockdep_recursion = 0,
316 328
@@ -328,6 +340,7 @@ static inline void lockdep_on(void)
328 340
329# define lock_acquire(l, s, t, r, c, n, i) do { } while (0) 341# define lock_acquire(l, s, t, r, c, n, i) do { } while (0)
330# define lock_release(l, n, i) do { } while (0) 342# define lock_release(l, n, i) do { } while (0)
343# define lock_set_class(l, n, k, s, i) do { } while (0)
331# define lock_set_subclass(l, s, i) do { } while (0) 344# define lock_set_subclass(l, s, i) do { } while (0)
332# define lockdep_init() do { } while (0) 345# define lockdep_init() do { } while (0)
333# define lockdep_info() do { } while (0) 346# define lockdep_info() do { } while (0)
@@ -356,7 +369,7 @@ struct lock_class_key { };
356#ifdef CONFIG_LOCK_STAT 369#ifdef CONFIG_LOCK_STAT
357 370
358extern void lock_contended(struct lockdep_map *lock, unsigned long ip); 371extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
359extern void lock_acquired(struct lockdep_map *lock); 372extern void lock_acquired(struct lockdep_map *lock, unsigned long ip);
360 373
361#define LOCK_CONTENDED(_lock, try, lock) \ 374#define LOCK_CONTENDED(_lock, try, lock) \
362do { \ 375do { \
@@ -364,13 +377,13 @@ do { \
364 lock_contended(&(_lock)->dep_map, _RET_IP_); \ 377 lock_contended(&(_lock)->dep_map, _RET_IP_); \
365 lock(_lock); \ 378 lock(_lock); \
366 } \ 379 } \
367 lock_acquired(&(_lock)->dep_map); \ 380 lock_acquired(&(_lock)->dep_map, _RET_IP_); \
368} while (0) 381} while (0)
369 382
370#else /* CONFIG_LOCK_STAT */ 383#else /* CONFIG_LOCK_STAT */
371 384
372#define lock_contended(lockdep_map, ip) do {} while (0) 385#define lock_contended(lockdep_map, ip) do {} while (0)
373#define lock_acquired(lockdep_map) do {} while (0) 386#define lock_acquired(lockdep_map, ip) do {} while (0)
374 387
375#define LOCK_CONTENDED(_lock, try, lock) \ 388#define LOCK_CONTENDED(_lock, try, lock) \
376 lock(_lock) 389 lock(_lock)
@@ -481,4 +494,22 @@ static inline void print_irqtrace_events(struct task_struct *curr)
481# define lock_map_release(l) do { } while (0) 494# define lock_map_release(l) do { } while (0)
482#endif 495#endif
483 496
497#ifdef CONFIG_PROVE_LOCKING
498# define might_lock(lock) \
499do { \
500 typecheck(struct lockdep_map *, &(lock)->dep_map); \
501 lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_); \
502 lock_release(&(lock)->dep_map, 0, _THIS_IP_); \
503} while (0)
504# define might_lock_read(lock) \
505do { \
506 typecheck(struct lockdep_map *, &(lock)->dep_map); \
507 lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_); \
508 lock_release(&(lock)->dep_map, 0, _THIS_IP_); \
509} while (0)
510#else
511# define might_lock(lock) do { } while (0)
512# define might_lock_read(lock) do { } while (0)
513#endif
514
484#endif /* __LINUX_LOCKDEP_H */ 515#endif /* __LINUX_LOCKDEP_H */
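might_lock() and might_lock_read() perform a fake acquire/release on PROVE_LOCKING builds, so a lock that is only taken on a rare slow path is still recorded as a dependency on every call. A hedged sketch of the intended usage; struct my_obj and my_update() are illustrative, not kernel code:

#include <linux/mutex.h>
#include <linux/lockdep.h>

struct my_obj {
        struct mutex lock;
        int dirty;
        int value;
};

static void my_update(struct my_obj *obj, int value)
{
        might_lock(&obj->lock);         /* dependency recorded even on the fast path */

        if (!obj->dirty)
                return;                 /* common case: lock never taken */

        mutex_lock(&obj->lock);
        obj->value = value;
        obj->dirty = 0;
        mutex_unlock(&obj->lock);
}

The unlocked obj->dirty test is just the usual opportunistic fast path; the point of the sketch is that lockdep still learns about obj->lock even when my_update() returns early.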
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index bc6da10ceee0..7a0e5c4f8072 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -144,6 +144,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
144/* 144/*
145 * NOTE: mutex_trylock() follows the spin_trylock() convention, 145 * NOTE: mutex_trylock() follows the spin_trylock() convention,
146 * not the down_trylock() convention! 146 * not the down_trylock() convention!
147 *
148 * Returns 1 if the mutex has been acquired successfully, and 0 on contention.
147 */ 149 */
148extern int mutex_trylock(struct mutex *lock); 150extern int mutex_trylock(struct mutex *lock);
149extern void mutex_unlock(struct mutex *lock); 151extern void mutex_unlock(struct mutex *lock);
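The added comment pins down the return convention: 1 means the mutex was acquired, 0 means it was contended, which is the opposite of down_trylock(). A small hedged example; stats_lock and try_bump_stats() are illustrative names:

#include <linux/mutex.h>

static DEFINE_MUTEX(stats_lock);
static unsigned long stats_counter;

/* Returns 1 if the counter was bumped, 0 if the lock was contended. */
static int try_bump_stats(void)
{
        if (!mutex_trylock(&stats_lock))
                return 0;               /* 0 from mutex_trylock(): someone else holds it */
        stats_counter++;
        mutex_unlock(&stats_lock);
        return 1;
}

Code converted from semaphores must not treat the return value like down_trylock(), which returns 0 on success.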
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 5f89b62e6983..301dda829e37 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -41,7 +41,7 @@
41#include <linux/seqlock.h> 41#include <linux/seqlock.h>
42 42
43#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 43#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
44#define RCU_SECONDS_TILL_STALL_CHECK ( 3 * HZ) /* for rcp->jiffies_stall */ 44#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rcp->jiffies_stall */
45#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ 45#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */
46#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 46#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
47 47
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 895dc9c1088c..1168fbcea8d4 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,11 +52,15 @@ struct rcu_head {
52 void (*func)(struct rcu_head *head); 52 void (*func)(struct rcu_head *head);
53}; 53};
54 54
55#ifdef CONFIG_CLASSIC_RCU 55#if defined(CONFIG_CLASSIC_RCU)
56#include <linux/rcuclassic.h> 56#include <linux/rcuclassic.h>
57#else /* #ifdef CONFIG_CLASSIC_RCU */ 57#elif defined(CONFIG_TREE_RCU)
58#include <linux/rcutree.h>
59#elif defined(CONFIG_PREEMPT_RCU)
58#include <linux/rcupreempt.h> 60#include <linux/rcupreempt.h>
59#endif /* #else #ifdef CONFIG_CLASSIC_RCU */ 61#else
62#error "Unknown RCU implementation specified to kernel configuration"
63#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */
60 64
61#define RCU_HEAD_INIT { .next = NULL, .func = NULL } 65#define RCU_HEAD_INIT { .next = NULL, .func = NULL }
62#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT 66#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
new file mode 100644
index 000000000000..d4368b7975c3
--- /dev/null
+++ b/include/linux/rcutree.h
@@ -0,0 +1,329 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Author: Dipankar Sarma <dipankar@in.ibm.com>
21 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical algorithm
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 *
26 * For detailed explanation of Read-Copy Update mechanism see -
27 * Documentation/RCU
28 */
29
30#ifndef __LINUX_RCUTREE_H
31#define __LINUX_RCUTREE_H
32
33#include <linux/cache.h>
34#include <linux/spinlock.h>
35#include <linux/threads.h>
36#include <linux/percpu.h>
37#include <linux/cpumask.h>
38#include <linux/seqlock.h>
39
40/*
41 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
42 * In theory, it should be possible to add more levels straightforwardly.
43 * In practice, this has not been tested, so there is probably some
44 * bug somewhere.
45 */
46#define MAX_RCU_LVLS 3
47#define RCU_FANOUT (CONFIG_RCU_FANOUT)
48#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
49#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
50
51#if NR_CPUS <= RCU_FANOUT
52# define NUM_RCU_LVLS 1
53# define NUM_RCU_LVL_0 1
54# define NUM_RCU_LVL_1 (NR_CPUS)
55# define NUM_RCU_LVL_2 0
56# define NUM_RCU_LVL_3 0
57#elif NR_CPUS <= RCU_FANOUT_SQ
58# define NUM_RCU_LVLS 2
59# define NUM_RCU_LVL_0 1
60# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
61# define NUM_RCU_LVL_2 (NR_CPUS)
62# define NUM_RCU_LVL_3 0
63#elif NR_CPUS <= RCU_FANOUT_CUBE
64# define NUM_RCU_LVLS 3
65# define NUM_RCU_LVL_0 1
66# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
67# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
68# define NUM_RCU_LVL_3 NR_CPUS
69#else
70# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
71#endif /* #if (NR_CPUS) <= RCU_FANOUT */
72
73#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
74#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
75
76/*
77 * Dynticks per-CPU state.
78 */
79struct rcu_dynticks {
80 int dynticks_nesting; /* Track nesting level, sort of. */
81 int dynticks; /* Even value for dynticks-idle, else odd. */
82 int dynticks_nmi; /* Even value for either dynticks-idle or */
83 /* not in nmi handler, else odd. So this */
84 /* remains even for nmi from irq handler. */
85};
86
87/*
88 * Definition for node within the RCU grace-period-detection hierarchy.
89 */
90struct rcu_node {
91 spinlock_t lock;
92 unsigned long qsmask; /* CPUs or groups that need to switch in */
93 /* order for current grace period to proceed.*/
94 unsigned long qsmaskinit;
95 /* Per-GP initialization for qsmask. */
96 unsigned long grpmask; /* Mask to apply to parent qsmask. */
97 int grplo; /* lowest-numbered CPU or group here. */
98 int grphi; /* highest-numbered CPU or group here. */
99 u8 grpnum; /* CPU/group number for next level up. */
100 u8 level; /* root is at level 0. */
101 struct rcu_node *parent;
102} ____cacheline_internodealigned_in_smp;
103
104/* Index values for nxttail array in struct rcu_data. */
105#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
106#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
107#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
108#define RCU_NEXT_TAIL 3
109#define RCU_NEXT_SIZE 4
110
111/* Per-CPU data for read-copy update. */
112struct rcu_data {
113 /* 1) quiescent-state and grace-period handling : */
114 long completed; /* Track rsp->completed gp number */
115 /* in order to detect GP end. */
116 long gpnum; /* Highest gp number that this CPU */
117 /* is aware of having started. */
118 long passed_quiesc_completed;
119 /* Value of completed at time of qs. */
120 bool passed_quiesc; /* User-mode/idle loop etc. */
121 bool qs_pending; /* Core waits for quiesc state. */
122 bool beenonline; /* CPU online at least once. */
123 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
124 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
125
126 /* 2) batch handling */
127 /*
128 * If nxtlist is not NULL, it is partitioned as follows.
129 * Any of the partitions might be empty, in which case the
130 * pointer to that partition will be equal to the pointer for
131 * the following partition. When the list is empty, all of
132 * the nxttail elements point to nxtlist, which is NULL.
133 *
134 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
135 * Entries that might have arrived after current GP ended
136 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
137 * Entries known to have arrived before current GP ended
138 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
139 * Entries that batch # <= ->completed - 1: waiting for current GP
140 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
141 * Entries that batch # <= ->completed
142 * The grace period for these entries has completed, and
143 * the other grace-period-completed entries may be moved
144 * here temporarily in rcu_process_callbacks().
145 */
146 struct rcu_head *nxtlist;
147 struct rcu_head **nxttail[RCU_NEXT_SIZE];
148 long qlen; /* # of queued callbacks */
149 long blimit; /* Upper limit on a processed batch */
150
151#ifdef CONFIG_NO_HZ
152 /* 3) dynticks interface. */
153 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
154 int dynticks_snap; /* Per-GP tracking for dynticks. */
155 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
156#endif /* #ifdef CONFIG_NO_HZ */
157
158 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
159#ifdef CONFIG_NO_HZ
160 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
161#endif /* #ifdef CONFIG_NO_HZ */
162 unsigned long offline_fqs; /* Kicked due to being offline. */
163 unsigned long resched_ipi; /* Sent a resched IPI. */
164
165 /* 5) state to allow this CPU to force_quiescent_state on others */
166 long n_rcu_pending; /* rcu_pending() calls since boot. */
167 long n_rcu_pending_force_qs; /* when to force quiescent states. */
168
169 int cpu;
170};
171
172/* Values for signaled field in struct rcu_state. */
173#define RCU_GP_INIT 0 /* Grace period being initialized. */
174#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
175#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
176#ifdef CONFIG_NO_HZ
177#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
178#else /* #ifdef CONFIG_NO_HZ */
179#define RCU_SIGNAL_INIT RCU_FORCE_QS
180#endif /* #else #ifdef CONFIG_NO_HZ */
181
182#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
183#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
184#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
185#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
186#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
187 /* to take at least one */
188 /* scheduling clock irq */
189 /* before ratting on them. */
190
191#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
192
193/*
194 * RCU global state, including node hierarchy. This hierarchy is
195 * represented in "heap" form in a dense array. The root (first level)
196 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
197 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
198 * and the third level in ->node[m+1] and following (->node[m+1] referenced
199 * by ->level[2]). The number of levels is determined by the number of
200 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
201 * consisting of a single rcu_node.
202 */
203struct rcu_state {
204 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
205 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
206 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
207 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
208 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
209
210 /* The following fields are guarded by the root rcu_node's lock. */
211
212 u8 signaled ____cacheline_internodealigned_in_smp;
213 /* Force QS state. */
214 long gpnum; /* Current gp number. */
215 long completed; /* # of last completed gp. */
216 spinlock_t onofflock; /* exclude on/offline and */
217 /* starting new GP. */
218 spinlock_t fqslock; /* Only one task forcing */
219 /* quiescent states. */
220 unsigned long jiffies_force_qs; /* Time at which to invoke */
221 /* force_quiescent_state(). */
222 unsigned long n_force_qs; /* Number of calls to */
223 /* force_quiescent_state(). */
224 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
225 /* due to lock unavailable. */
226 unsigned long n_force_qs_ngp; /* Number of calls leaving */
227 /* due to no GP active. */
228#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
229 unsigned long gp_start; /* Time at which GP started, */
230 /* but in jiffies. */
231 unsigned long jiffies_stall; /* Time at which to check */
232 /* for CPU stalls. */
233#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
234#ifdef CONFIG_NO_HZ
235 long dynticks_completed; /* Value of completed @ snap. */
236#endif /* #ifdef CONFIG_NO_HZ */
237};
238
239extern struct rcu_state rcu_state;
240DECLARE_PER_CPU(struct rcu_data, rcu_data);
241
242extern struct rcu_state rcu_bh_state;
243DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
244
245/*
246 * Increment the quiescent state counter.
247 * The counter is a bit degenerated: We do not need to know
248 * how many quiescent states passed, just if there was at least
249 * one since the start of the grace period. Thus just a flag.
250 */
251static inline void rcu_qsctr_inc(int cpu)
252{
253 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
254 rdp->passed_quiesc = 1;
255 rdp->passed_quiesc_completed = rdp->completed;
256}
257static inline void rcu_bh_qsctr_inc(int cpu)
258{
259 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
260 rdp->passed_quiesc = 1;
261 rdp->passed_quiesc_completed = rdp->completed;
262}
263
264extern int rcu_pending(int cpu);
265extern int rcu_needs_cpu(int cpu);
266
267#ifdef CONFIG_DEBUG_LOCK_ALLOC
268extern struct lockdep_map rcu_lock_map;
269# define rcu_read_acquire() \
270 lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
271# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_)
272#else
273# define rcu_read_acquire() do { } while (0)
274# define rcu_read_release() do { } while (0)
275#endif
276
277static inline void __rcu_read_lock(void)
278{
279 preempt_disable();
280 __acquire(RCU);
281 rcu_read_acquire();
282}
283static inline void __rcu_read_unlock(void)
284{
285 rcu_read_release();
286 __release(RCU);
287 preempt_enable();
288}
289static inline void __rcu_read_lock_bh(void)
290{
291 local_bh_disable();
292 __acquire(RCU_BH);
293 rcu_read_acquire();
294}
295static inline void __rcu_read_unlock_bh(void)
296{
297 rcu_read_release();
298 __release(RCU_BH);
299 local_bh_enable();
300}
301
302#define __synchronize_sched() synchronize_rcu()
303
304#define call_rcu_sched(head, func) call_rcu(head, func)
305
306static inline void rcu_init_sched(void)
307{
308}
309
310extern void __rcu_init(void);
311extern void rcu_check_callbacks(int cpu, int user);
312extern void rcu_restart_cpu(int cpu);
313
314extern long rcu_batches_completed(void);
315extern long rcu_batches_completed_bh(void);
316
317#ifdef CONFIG_NO_HZ
318void rcu_enter_nohz(void);
319void rcu_exit_nohz(void);
320#else /* CONFIG_NO_HZ */
321static inline void rcu_enter_nohz(void)
322{
323}
324static inline void rcu_exit_nohz(void)
325{
326}
327#endif /* CONFIG_NO_HZ */
328
329#endif /* __LINUX_RCUTREE_H */
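To make the hierarchy sizing above concrete, assume NR_CPUS=4096 and CONFIG_RCU_FANOUT=64 (the 64-bit default in the init/Kconfig hunk below). NR_CPUS then fits within RCU_FANOUT_SQ, so there are two rcu_node levels above the per-CPU rcu_data structures. A small stand-alone C program repeating the header's arithmetic, with the values hard-coded for this assumed configuration:

#include <stdio.h>

#define NR_CPUS         4096            /* assumed build-time values */
#define RCU_FANOUT      64

int main(void)
{
        long fanout_sq = (long)RCU_FANOUT * RCU_FANOUT;         /* 4096 */
        long lvl0 = 1;                                          /* root rcu_node */
        long lvl1 = (NR_CPUS + RCU_FANOUT - 1) / RCU_FANOUT;    /* leaf rcu_nodes */
        long lvl2 = NR_CPUS;                                    /* per-CPU rcu_data */

        /* Mirrors the NR_CPUS <= RCU_FANOUT_SQ branch of the header. */
        if (NR_CPUS <= fanout_sq)
                printf("levels=2, rcu_node count=%ld\n",
                       lvl0 + lvl1 + lvl2 - NR_CPUS);           /* prints 65 */
        return 0;
}

That is one root rcu_node fanning out to 64 leaf rcu_nodes, each covering 64 CPUs, matching NUM_RCU_NODES = RCU_SUM - NR_CPUS = 65.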
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b18ec5533e8c..325af1de0351 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -7,9 +7,31 @@ struct device;
7struct dma_attrs; 7struct dma_attrs;
8struct scatterlist; 8struct scatterlist;
9 9
10/*
11 * Maximum allowable number of contiguous slabs to map,
12 * must be a power of 2. What is the appropriate value?
13 * The complexity of {map,unmap}_single is linearly dependent on this value.
14 */
15#define IO_TLB_SEGSIZE 128
16
17
18/*
19 * log of the size of each IO TLB slab. The number of slabs is command line
20 * controllable.
21 */
22#define IO_TLB_SHIFT 11
23
10extern void 24extern void
11swiotlb_init(void); 25swiotlb_init(void);
12 26
27extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs);
28extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
29
30extern dma_addr_t swiotlb_phys_to_bus(phys_addr_t address);
31extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address);
32
33extern int swiotlb_arch_range_needs_mapping(void *ptr, size_t size);
34
13extern void 35extern void
14*swiotlb_alloc_coherent(struct device *hwdev, size_t size, 36*swiotlb_alloc_coherent(struct device *hwdev, size_t size,
15 dma_addr_t *dma_handle, gfp_t flags); 37 dma_addr_t *dma_handle, gfp_t flags);
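The two constants moved into this header determine the bounce-buffer granularity: each IO TLB slab is 1 << IO_TLB_SHIFT bytes, and a single mapping may span at most IO_TLB_SEGSIZE contiguous slabs. A stand-alone sketch that just restates the arithmetic (not kernel code):

#include <stdio.h>

#define IO_TLB_SHIFT    11
#define IO_TLB_SEGSIZE  128

int main(void)
{
        unsigned long slab_bytes = 1UL << IO_TLB_SHIFT;          /* 2048 = 2 KiB */
        unsigned long max_single = slab_bytes * IO_TLB_SEGSIZE;  /* 262144 = 256 KiB */

        printf("slab=%lu bytes, largest single bounce mapping=%lu bytes\n",
               slab_bytes, max_single);
        return 0;
}

With these constants, a single bounced mapping tops out at 256 KiB.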
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index fec6decfb983..6b58367d145e 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to,
78 \ 78 \
79 set_fs(KERNEL_DS); \ 79 set_fs(KERNEL_DS); \
80 pagefault_disable(); \ 80 pagefault_disable(); \
81 ret = __get_user(retval, (__force typeof(retval) __user *)(addr)); \ 81 ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \
82 pagefault_enable(); \ 82 pagefault_enable(); \
83 set_fs(old_fs); \ 83 set_fs(old_fs); \
84 ret; \ 84 ret; \
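The hunk switches this helper from __get_user() to __copy_from_user_inatomic(), so the probed object no longer has to be one of the fixed sizes __get_user() handles. A hedged sketch of the general pattern in the style of that macro; my_probe_read() is an illustrative name, and the set_fs()/pagefault_disable() dance assumes the same era of kernel APIs:

#include <linux/uaccess.h>

static long my_probe_read(void *dst, const void *src, size_t size)
{
        long ret;
        mm_segment_t old_fs = get_fs();

        set_fs(KERNEL_DS);              /* treat the kernel pointer as "user" */
        pagefault_disable();            /* fault path won't sleep; bad addresses just leave bytes uncopied */
        ret = __copy_from_user_inatomic(dst,
                        (__force const void __user *)src, size);
        pagefault_enable();
        set_fs(old_fs);

        return ret ? -EFAULT : 0;
}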
diff --git a/init/Kconfig b/init/Kconfig
index 8a63c404ef44..13627191a60d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -936,10 +936,90 @@ source "block/Kconfig"
936config PREEMPT_NOTIFIERS 936config PREEMPT_NOTIFIERS
937 bool 937 bool
938 938
939choice
940 prompt "RCU Implementation"
941 default CLASSIC_RCU
942
939config CLASSIC_RCU 943config CLASSIC_RCU
940 def_bool !PREEMPT_RCU 944 bool "Classic RCU"
941 help 945 help
942 This option selects the classic RCU implementation that is 946 This option selects the classic RCU implementation that is
943 designed for best read-side performance on non-realtime 947 designed for best read-side performance on non-realtime
944 systems. Classic RCU is the default. Note that the 948 systems.
945 PREEMPT_RCU symbol is used to select/deselect this option. 949
950 Select this option if you are unsure.
951
952config TREE_RCU
953 bool "Tree-based hierarchical RCU"
954 help
955 This option selects the RCU implementation that is
956 designed for very large SMP systems with hundreds or
957 thousands of CPUs.
958
959config PREEMPT_RCU
960 bool "Preemptible RCU"
961 depends on PREEMPT
962 help
963 This option reduces the latency of the kernel by making certain
964 RCU sections preemptible. Normally RCU code is non-preemptible; if
965 this option is selected, read-only RCU sections become
966 preemptible. This helps latency, but may expose bugs due to
967 now-naive assumptions about each RCU read-side critical section
968 remaining on a given CPU through its execution.
969
970endchoice
971
972config RCU_TRACE
973 bool "Enable tracing for RCU"
974 depends on TREE_RCU || PREEMPT_RCU
975 help
976 This option provides tracing in RCU, which presents stats
977 in debugfs for debugging the RCU implementation.
978
979 Say Y here if you want to enable RCU tracing.
980 Say N if you are unsure.
981
982config RCU_FANOUT
983 int "Tree-based hierarchical RCU fanout value"
984 range 2 64 if 64BIT
985 range 2 32 if !64BIT
986 depends on TREE_RCU
987 default 64 if 64BIT
988 default 32 if !64BIT
989 help
990 This option controls the fanout of hierarchical implementations
991 of RCU, allowing RCU to work efficiently on machines with
992 large numbers of CPUs. This value must be at least the cube
993 root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
994 systems and up to 262,144 for 64-bit systems.
995
996 Select a specific number if testing RCU itself.
997 Take the default if unsure.
998
999config RCU_FANOUT_EXACT
1000 bool "Disable tree-based hierarchical RCU auto-balancing"
1001 depends on TREE_RCU
1002 default n
1003 help
1004 This option forces use of the exact RCU_FANOUT value specified,
1005 regardless of imbalances in the hierarchy. This is useful for
1006 testing RCU itself, and might one day be useful on systems with
1007 strong NUMA behavior.
1008
1009 Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
1010
1011 Say N if unsure.
1012
1013config TREE_RCU_TRACE
1014 def_bool RCU_TRACE && TREE_RCU
1015 select DEBUG_FS
1016 help
1017 This option provides tracing for the TREE_RCU implementation,
1018 permitting Makefile to trivially select kernel/rcutree_trace.c.
1019
1020config PREEMPT_RCU_TRACE
1021 def_bool RCU_TRACE && PREEMPT_RCU
1022 select DEBUG_FS
1023 help
1024 This option provides tracing for the PREEMPT_RCU implementation,
1025 permitting Makefile to trivially select kernel/rcupreempt_trace.c.
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03dc1fc..bf987b95b356 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,28 +52,3 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_RCU
56 bool "Preemptible RCU"
57 depends on PREEMPT
58 default n
59 help
60 This option reduces the latency of the kernel by making certain
61 RCU sections preemptible. Normally RCU code is non-preemptible, if
62 this option is selected then read-only RCU sections become
63 preemptible. This helps latency, but may expose bugs due to
64 now-naive assumptions about each RCU read-side critical section
65 remaining on a given CPU through its execution.
66
67 Say N if you are unsure.
68
69config RCU_TRACE
70 bool "Enable tracing for RCU - currently stats in debugfs"
71 depends on PREEMPT_RCU
72 select DEBUG_FS
73 default y
74 help
75 This option provides tracing in RCU which presents stats
76 in debugfs for debugging RCU implementation.
77
78 Say Y here if you want to enable RCU tracing
79 Say N if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 027edda63511..e1c5bf3365c0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -73,10 +73,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
73obj-$(CONFIG_SECCOMP) += seccomp.o 73obj-$(CONFIG_SECCOMP) += seccomp.o
74obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 74obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
75obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o 75obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
76obj-$(CONFIG_TREE_RCU) += rcutree.o
76obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 77obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
77ifeq ($(CONFIG_PREEMPT_RCU),y) 78obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
78obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o 79obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
79endif
80obj-$(CONFIG_RELAY) += relay.o 80obj-$(CONFIG_RELAY) += relay.o
81obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 81obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
82obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 82obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/exit.c b/kernel/exit.c
index c7422ca92038..a946221879d7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1328,10 +1328,10 @@ static int wait_task_zombie(struct task_struct *p, int options,
1328 * group, which consolidates times for all threads in the 1328 * group, which consolidates times for all threads in the
1329 * group including the group leader. 1329 * group including the group leader.
1330 */ 1330 */
1331 thread_group_cputime(p, &cputime);
1331 spin_lock_irq(&p->parent->sighand->siglock); 1332 spin_lock_irq(&p->parent->sighand->siglock);
1332 psig = p->parent->signal; 1333 psig = p->parent->signal;
1333 sig = p->signal; 1334 sig = p->signal;
1334 thread_group_cputime(p, &cputime);
1335 psig->cutime = 1335 psig->cutime =
1336 cputime_add(psig->cutime, 1336 cputime_add(psig->cutime,
1337 cputime_add(cputime.utime, 1337 cputime_add(cputime.utime,
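The reorder above moves the thread_group_cputime() summation out from under the parent's siglock, so the spinlock is held only to fold the result into psig. A hedged sketch of that compute-then-publish pattern; struct my_accounting and my_fold_in() are illustrative, not kernel structures:

#include <linux/spinlock.h>

struct my_accounting {
        spinlock_t lock;
        unsigned long long total;
};

/* 'sample' was gathered before taking the lock (cf. thread_group_cputime()
 * in the hunk above); the critical section only does the cheap addition. */
static void my_fold_in(struct my_accounting *acct, unsigned long long sample)
{
        spin_lock_irq(&acct->lock);
        acct->total += sample;
        spin_unlock_irq(&acct->lock);
}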
diff --git a/kernel/extable.c b/kernel/extable.c
index feb0317cf09a..e136ed8d82ba 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -67,3 +67,19 @@ int kernel_text_address(unsigned long addr)
67 return 1; 67 return 1;
68 return module_text_address(addr) != NULL; 68 return module_text_address(addr) != NULL;
69} 69}
70
71/*
72 * On some architectures (PPC64, IA64) function pointers
73 * are actually only tokens to some data that then holds the
74 * real function address. As a result, to find if a function
75 * pointer is part of the kernel text, we need to do some
76 * special dereferencing first.
77 */
78int func_ptr_is_kernel_text(void *ptr)
79{
80 unsigned long addr;
81 addr = (unsigned long) dereference_function_descriptor(ptr);
82 if (core_kernel_text(addr))
83 return 1;
84 return module_text_address(addr) != NULL;
85}
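func_ptr_is_kernel_text() answers "is this function pointer backed by kernel or module text?" even on ABIs where a function pointer is really a descriptor (PPC64, IA64). A hedged sketch of a caller validating a callback before storing it; my_hook_t and my_register_hook() are illustrative names:

#include <linux/kernel.h>
#include <linux/errno.h>

typedef void (*my_hook_t)(void);

static my_hook_t my_hook;

static int my_register_hook(my_hook_t fn)
{
        /* Resolves any function descriptor first, then checks core and module text. */
        if (!func_ptr_is_kernel_text((void *)fn))
                return -EINVAL;

        my_hook = fn;
        return 0;
}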
diff --git a/kernel/futex.c b/kernel/futex.c
index 4fe790e89d0f..7c6cbabe52b3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -92,11 +92,12 @@ struct futex_pi_state {
92 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
94 * The order of wakeup is always to make the first condition true, then 94 * The order of wakeup is always to make the first condition true, then
95 * wake up q->waiters, then make the second condition true. 95 * wake up q->waiter, then make the second condition true.
96 */ 96 */
97struct futex_q { 97struct futex_q {
98 struct plist_node list; 98 struct plist_node list;
99 wait_queue_head_t waiters; 99 /* There can only be a single waiter */
100 wait_queue_head_t waiter;
100 101
101 /* Which hash list lock to use: */ 102 /* Which hash list lock to use: */
102 spinlock_t *lock_ptr; 103 spinlock_t *lock_ptr;
@@ -123,24 +124,6 @@ struct futex_hash_bucket {
123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 124static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
124 125
125/* 126/*
126 * Take mm->mmap_sem, when futex is shared
127 */
128static inline void futex_lock_mm(struct rw_semaphore *fshared)
129{
130 if (fshared)
131 down_read(fshared);
132}
133
134/*
135 * Release mm->mmap_sem, when the futex is shared
136 */
137static inline void futex_unlock_mm(struct rw_semaphore *fshared)
138{
139 if (fshared)
140 up_read(fshared);
141}
142
143/*
144 * We hash on the keys returned from get_futex_key (see below). 127 * We hash on the keys returned from get_futex_key (see below).
145 */ 128 */
146static struct futex_hash_bucket *hash_futex(union futex_key *key) 129static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -161,6 +144,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
161 && key1->both.offset == key2->both.offset); 144 && key1->both.offset == key2->both.offset);
162} 145}
163 146
147/*
148 * Take a reference to the resource addressed by a key.
149 * Can be called while holding spinlocks.
150 *
151 */
152static void get_futex_key_refs(union futex_key *key)
153{
154 if (!key->both.ptr)
155 return;
156
157 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
158 case FUT_OFF_INODE:
159 atomic_inc(&key->shared.inode->i_count);
160 break;
161 case FUT_OFF_MMSHARED:
162 atomic_inc(&key->private.mm->mm_count);
163 break;
164 }
165}
166
167/*
168 * Drop a reference to the resource addressed by a key.
169 * The hash bucket spinlock must not be held.
170 */
171static void drop_futex_key_refs(union futex_key *key)
172{
173 if (!key->both.ptr)
174 return;
175
176 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
177 case FUT_OFF_INODE:
178 iput(key->shared.inode);
179 break;
180 case FUT_OFF_MMSHARED:
181 mmdrop(key->private.mm);
182 break;
183 }
184}
185
164/** 186/**
165 * get_futex_key - Get parameters which are the keys for a futex. 187 * get_futex_key - Get parameters which are the keys for a futex.
166 * @uaddr: virtual address of the futex 188 * @uaddr: virtual address of the futex
@@ -179,12 +201,10 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
179 * For other futexes, it points to &current->mm->mmap_sem and 201 * For other futexes, it points to &current->mm->mmap_sem and
180 * caller must have taken the reader lock, but NOT any spinlocks. 202 * caller must have taken the reader lock, but NOT any spinlocks.
181 */ 203 */
182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 204static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
183 union futex_key *key)
184{ 205{
185 unsigned long address = (unsigned long)uaddr; 206 unsigned long address = (unsigned long)uaddr;
186 struct mm_struct *mm = current->mm; 207 struct mm_struct *mm = current->mm;
187 struct vm_area_struct *vma;
188 struct page *page; 208 struct page *page;
189 int err; 209 int err;
190 210
@@ -208,100 +228,50 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
208 return -EFAULT; 228 return -EFAULT;
209 key->private.mm = mm; 229 key->private.mm = mm;
210 key->private.address = address; 230 key->private.address = address;
231 get_futex_key_refs(key);
211 return 0; 232 return 0;
212 } 233 }
213 /*
214 * The futex is hashed differently depending on whether
215 * it's in a shared or private mapping. So check vma first.
216 */
217 vma = find_extend_vma(mm, address);
218 if (unlikely(!vma))
219 return -EFAULT;
220 234
221 /* 235again:
222 * Permissions. 236 err = get_user_pages_fast(address, 1, 0, &page);
223 */ 237 if (err < 0)
224 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 238 return err;
225 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 239
240 lock_page(page);
241 if (!page->mapping) {
242 unlock_page(page);
243 put_page(page);
244 goto again;
245 }
226 246
227 /* 247 /*
228 * Private mappings are handled in a simple way. 248 * Private mappings are handled in a simple way.
229 * 249 *
230 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 250 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
231 * it's a read-only handle, it's expected that futexes attach to 251 * it's a read-only handle, it's expected that futexes attach to
232 * the object not the particular process. Therefore we use 252 * the object not the particular process.
233 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
234 * mappings of _writable_ handles.
235 */ 253 */
236 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 254 if (PageAnon(page)) {
237 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ 255 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
238 key->private.mm = mm; 256 key->private.mm = mm;
239 key->private.address = address; 257 key->private.address = address;
240 return 0; 258 } else {
259 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
260 key->shared.inode = page->mapping->host;
261 key->shared.pgoff = page->index;
241 } 262 }
242 263
243 /* 264 get_futex_key_refs(key);
244 * Linear file mappings are also simple.
245 */
246 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
247 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
248 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
249 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
250 + vma->vm_pgoff);
251 return 0;
252 }
253 265
254 /* 266 unlock_page(page);
255 * We could walk the page table to read the non-linear 267 put_page(page);
256 * pte, and get the page index without fetching the page 268 return 0;
257 * from swap. But that's a lot of code to duplicate here
258 * for a rare case, so we simply fetch the page.
259 */
260 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
261 if (err >= 0) {
262 key->shared.pgoff =
263 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
264 put_page(page);
265 return 0;
266 }
267 return err;
268}
269
270/*
271 * Take a reference to the resource addressed by a key.
272 * Can be called while holding spinlocks.
273 *
274 */
275static void get_futex_key_refs(union futex_key *key)
276{
277 if (key->both.ptr == NULL)
278 return;
279 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
280 case FUT_OFF_INODE:
281 atomic_inc(&key->shared.inode->i_count);
282 break;
283 case FUT_OFF_MMSHARED:
284 atomic_inc(&key->private.mm->mm_count);
285 break;
286 }
287} 269}
288 270
289/* 271static inline
290 * Drop a reference to the resource addressed by a key. 272void put_futex_key(int fshared, union futex_key *key)
291 * The hash bucket spinlock must not be held.
292 */
293static void drop_futex_key_refs(union futex_key *key)
294{ 273{
295 if (!key->both.ptr) 274 drop_futex_key_refs(key);
296 return;
297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
298 case FUT_OFF_INODE:
299 iput(key->shared.inode);
300 break;
301 case FUT_OFF_MMSHARED:
302 mmdrop(key->private.mm);
303 break;
304 }
305} 275}
306 276
307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 277static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@ -328,10 +298,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
328 298
329/* 299/*
330 * Fault handling. 300 * Fault handling.
331 * if fshared is non NULL, current->mm->mmap_sem is already held
332 */ 301 */
333static int futex_handle_fault(unsigned long address, 302static int futex_handle_fault(unsigned long address, int attempt)
334 struct rw_semaphore *fshared, int attempt)
335{ 303{
336 struct vm_area_struct * vma; 304 struct vm_area_struct * vma;
337 struct mm_struct *mm = current->mm; 305 struct mm_struct *mm = current->mm;
@@ -340,8 +308,7 @@ static int futex_handle_fault(unsigned long address,
340 if (attempt > 2) 308 if (attempt > 2)
341 return ret; 309 return ret;
342 310
343 if (!fshared) 311 down_read(&mm->mmap_sem);
344 down_read(&mm->mmap_sem);
345 vma = find_vma(mm, address); 312 vma = find_vma(mm, address);
346 if (vma && address >= vma->vm_start && 313 if (vma && address >= vma->vm_start &&
347 (vma->vm_flags & VM_WRITE)) { 314 (vma->vm_flags & VM_WRITE)) {
@@ -361,8 +328,7 @@ static int futex_handle_fault(unsigned long address,
361 current->min_flt++; 328 current->min_flt++;
362 } 329 }
363 } 330 }
364 if (!fshared) 331 up_read(&mm->mmap_sem);
365 up_read(&mm->mmap_sem);
366 return ret; 332 return ret;
367} 333}
368 334
@@ -385,6 +351,7 @@ static int refill_pi_state_cache(void)
385 /* pi_mutex gets initialized later */ 351 /* pi_mutex gets initialized later */
386 pi_state->owner = NULL; 352 pi_state->owner = NULL;
387 atomic_set(&pi_state->refcount, 1); 353 atomic_set(&pi_state->refcount, 1);
354 pi_state->key = FUTEX_KEY_INIT;
388 355
389 current->pi_state_cache = pi_state; 356 current->pi_state_cache = pi_state;
390 357
@@ -469,7 +436,7 @@ void exit_pi_state_list(struct task_struct *curr)
469 struct list_head *next, *head = &curr->pi_state_list; 436 struct list_head *next, *head = &curr->pi_state_list;
470 struct futex_pi_state *pi_state; 437 struct futex_pi_state *pi_state;
471 struct futex_hash_bucket *hb; 438 struct futex_hash_bucket *hb;
472 union futex_key key; 439 union futex_key key = FUTEX_KEY_INIT;
473 440
474 if (!futex_cmpxchg_enabled) 441 if (!futex_cmpxchg_enabled)
475 return; 442 return;
@@ -614,7 +581,7 @@ static void wake_futex(struct futex_q *q)
614 * The lock in wake_up_all() is a crucial memory barrier after the 581 * The lock in wake_up_all() is a crucial memory barrier after the
615 * plist_del() and also before assigning to q->lock_ptr. 582 * plist_del() and also before assigning to q->lock_ptr.
616 */ 583 */
617 wake_up_all(&q->waiters); 584 wake_up(&q->waiter);
618 /* 585 /*
619 * The waiting task can free the futex_q as soon as this is written, 586 * The waiting task can free the futex_q as soon as this is written,
620 * without taking any locks. This must come last. 587 * without taking any locks. This must come last.
@@ -726,20 +693,17 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
726 * Wake up all waiters hashed on the physical page that is mapped 693 * Wake up all waiters hashed on the physical page that is mapped
727 * to this virtual address: 694 * to this virtual address:
728 */ 695 */
729static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 696static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
730 int nr_wake, u32 bitset)
731{ 697{
732 struct futex_hash_bucket *hb; 698 struct futex_hash_bucket *hb;
733 struct futex_q *this, *next; 699 struct futex_q *this, *next;
734 struct plist_head *head; 700 struct plist_head *head;
735 union futex_key key; 701 union futex_key key = FUTEX_KEY_INIT;
736 int ret; 702 int ret;
737 703
738 if (!bitset) 704 if (!bitset)
739 return -EINVAL; 705 return -EINVAL;
740 706
741 futex_lock_mm(fshared);
742
743 ret = get_futex_key(uaddr, fshared, &key); 707 ret = get_futex_key(uaddr, fshared, &key);
744 if (unlikely(ret != 0)) 708 if (unlikely(ret != 0))
745 goto out; 709 goto out;
@@ -767,7 +731,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
767 731
768 spin_unlock(&hb->lock); 732 spin_unlock(&hb->lock);
769out: 733out:
770 futex_unlock_mm(fshared); 734 put_futex_key(fshared, &key);
771 return ret; 735 return ret;
772} 736}
773 737
@@ -776,19 +740,16 @@ out:
776 * to this virtual address: 740 * to this virtual address:
777 */ 741 */
778static int 742static int
779futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, 743futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
780 u32 __user *uaddr2,
781 int nr_wake, int nr_wake2, int op) 744 int nr_wake, int nr_wake2, int op)
782{ 745{
783 union futex_key key1, key2; 746 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
784 struct futex_hash_bucket *hb1, *hb2; 747 struct futex_hash_bucket *hb1, *hb2;
785 struct plist_head *head; 748 struct plist_head *head;
786 struct futex_q *this, *next; 749 struct futex_q *this, *next;
787 int ret, op_ret, attempt = 0; 750 int ret, op_ret, attempt = 0;
788 751
789retryfull: 752retryfull:
790 futex_lock_mm(fshared);
791
792 ret = get_futex_key(uaddr1, fshared, &key1); 753 ret = get_futex_key(uaddr1, fshared, &key1);
793 if (unlikely(ret != 0)) 754 if (unlikely(ret != 0))
794 goto out; 755 goto out;
@@ -833,18 +794,12 @@ retry:
833 */ 794 */
834 if (attempt++) { 795 if (attempt++) {
835 ret = futex_handle_fault((unsigned long)uaddr2, 796 ret = futex_handle_fault((unsigned long)uaddr2,
836 fshared, attempt); 797 attempt);
837 if (ret) 798 if (ret)
838 goto out; 799 goto out;
839 goto retry; 800 goto retry;
840 } 801 }
841 802
842 /*
843 * If we would have faulted, release mmap_sem,
844 * fault it in and start all over again.
845 */
846 futex_unlock_mm(fshared);
847
848 ret = get_user(dummy, uaddr2); 803 ret = get_user(dummy, uaddr2);
849 if (ret) 804 if (ret)
850 return ret; 805 return ret;
@@ -880,7 +835,8 @@ retry:
880 if (hb1 != hb2) 835 if (hb1 != hb2)
881 spin_unlock(&hb2->lock); 836 spin_unlock(&hb2->lock);
882out: 837out:
883 futex_unlock_mm(fshared); 838 put_futex_key(fshared, &key2);
839 put_futex_key(fshared, &key1);
884 840
885 return ret; 841 return ret;
886} 842}
@@ -889,19 +845,16 @@ out:
889 * Requeue all waiters hashed on one physical page to another 845 * Requeue all waiters hashed on one physical page to another
890 * physical page. 846 * physical page.
891 */ 847 */
892static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, 848static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
893 u32 __user *uaddr2,
894 int nr_wake, int nr_requeue, u32 *cmpval) 849 int nr_wake, int nr_requeue, u32 *cmpval)
895{ 850{
896 union futex_key key1, key2; 851 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
897 struct futex_hash_bucket *hb1, *hb2; 852 struct futex_hash_bucket *hb1, *hb2;
898 struct plist_head *head1; 853 struct plist_head *head1;
899 struct futex_q *this, *next; 854 struct futex_q *this, *next;
900 int ret, drop_count = 0; 855 int ret, drop_count = 0;
901 856
902 retry: 857 retry:
903 futex_lock_mm(fshared);
904
905 ret = get_futex_key(uaddr1, fshared, &key1); 858 ret = get_futex_key(uaddr1, fshared, &key1);
906 if (unlikely(ret != 0)) 859 if (unlikely(ret != 0))
907 goto out; 860 goto out;
@@ -924,12 +877,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
924 if (hb1 != hb2) 877 if (hb1 != hb2)
925 spin_unlock(&hb2->lock); 878 spin_unlock(&hb2->lock);
926 879
927 /*
928 * If we would have faulted, release mmap_sem, fault
929 * it in and start all over again.
930 */
931 futex_unlock_mm(fshared);
932
933 ret = get_user(curval, uaddr1); 880 ret = get_user(curval, uaddr1);
934 881
935 if (!ret) 882 if (!ret)
@@ -981,7 +928,8 @@ out_unlock:
981 drop_futex_key_refs(&key1); 928 drop_futex_key_refs(&key1);
982 929
983out: 930out:
984 futex_unlock_mm(fshared); 931 put_futex_key(fshared, &key2);
932 put_futex_key(fshared, &key1);
985 return ret; 933 return ret;
986} 934}
987 935
@@ -990,7 +938,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
990{ 938{
991 struct futex_hash_bucket *hb; 939 struct futex_hash_bucket *hb;
992 940
993 init_waitqueue_head(&q->waiters); 941 init_waitqueue_head(&q->waiter);
994 942
995 get_futex_key_refs(&q->key); 943 get_futex_key_refs(&q->key);
996 hb = hash_futex(&q->key); 944 hb = hash_futex(&q->key);
@@ -1103,8 +1051,7 @@ static void unqueue_me_pi(struct futex_q *q)
1103 * private futexes. 1051 * private futexes.
1104 */ 1052 */
1105static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1053static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1106 struct task_struct *newowner, 1054 struct task_struct *newowner, int fshared)
1107 struct rw_semaphore *fshared)
1108{ 1055{
1109 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1056 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1110 struct futex_pi_state *pi_state = q->pi_state; 1057 struct futex_pi_state *pi_state = q->pi_state;
@@ -1183,7 +1130,7 @@ retry:
1183handle_fault: 1130handle_fault:
1184 spin_unlock(q->lock_ptr); 1131 spin_unlock(q->lock_ptr);
1185 1132
1186 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); 1133 ret = futex_handle_fault((unsigned long)uaddr, attempt++);
1187 1134
1188 spin_lock(q->lock_ptr); 1135 spin_lock(q->lock_ptr);
1189 1136
@@ -1203,12 +1150,13 @@ handle_fault:
1203 * In case we must use restart_block to restart a futex_wait, 1150 * In case we must use restart_block to restart a futex_wait,
1204 * we encode in the 'flags' shared capability 1151 * we encode in the 'flags' shared capability
1205 */ 1152 */
1206#define FLAGS_SHARED 1 1153#define FLAGS_SHARED 0x01
1154#define FLAGS_CLOCKRT 0x02
1207 1155
1208static long futex_wait_restart(struct restart_block *restart); 1156static long futex_wait_restart(struct restart_block *restart);
1209 1157
1210static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1158static int futex_wait(u32 __user *uaddr, int fshared,
1211 u32 val, ktime_t *abs_time, u32 bitset) 1159 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1212{ 1160{
1213 struct task_struct *curr = current; 1161 struct task_struct *curr = current;
1214 DECLARE_WAITQUEUE(wait, curr); 1162 DECLARE_WAITQUEUE(wait, curr);
@@ -1225,8 +1173,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1225 q.pi_state = NULL; 1173 q.pi_state = NULL;
1226 q.bitset = bitset; 1174 q.bitset = bitset;
1227 retry: 1175 retry:
1228 futex_lock_mm(fshared); 1176 q.key = FUTEX_KEY_INIT;
1229
1230 ret = get_futex_key(uaddr, fshared, &q.key); 1177 ret = get_futex_key(uaddr, fshared, &q.key);
1231 if (unlikely(ret != 0)) 1178 if (unlikely(ret != 0))
1232 goto out_release_sem; 1179 goto out_release_sem;
@@ -1258,12 +1205,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1258 if (unlikely(ret)) { 1205 if (unlikely(ret)) {
1259 queue_unlock(&q, hb); 1206 queue_unlock(&q, hb);
1260 1207
1261 /*
1262 * If we would have faulted, release mmap_sem, fault it in and
1263 * start all over again.
1264 */
1265 futex_unlock_mm(fshared);
1266
1267 ret = get_user(uval, uaddr); 1208 ret = get_user(uval, uaddr);
1268 1209
1269 if (!ret) 1210 if (!ret)
@@ -1278,12 +1219,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1278 queue_me(&q, hb); 1219 queue_me(&q, hb);
1279 1220
1280 /* 1221 /*
1281 * Now the futex is queued and we have checked the data, we
1282 * don't want to hold mmap_sem while we sleep.
1283 */
1284 futex_unlock_mm(fshared);
1285
1286 /*
1287 * There might have been scheduling since the queue_me(), as we 1222 * There might have been scheduling since the queue_me(), as we
1288 * cannot hold a spinlock across the get_user() in case it 1223 * cannot hold a spinlock across the get_user() in case it
1289 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1224 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
@@ -1294,7 +1229,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1294 1229
1295 /* add_wait_queue is the barrier after __set_current_state. */ 1230 /* add_wait_queue is the barrier after __set_current_state. */
1296 __set_current_state(TASK_INTERRUPTIBLE); 1231 __set_current_state(TASK_INTERRUPTIBLE);
1297 add_wait_queue(&q.waiters, &wait); 1232 add_wait_queue(&q.waiter, &wait);
1298 /* 1233 /*
1299 * !plist_node_empty() is safe here without any lock. 1234 * !plist_node_empty() is safe here without any lock.
1300 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1235 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
@@ -1307,8 +1242,10 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1307 slack = current->timer_slack_ns; 1242 slack = current->timer_slack_ns;
1308 if (rt_task(current)) 1243 if (rt_task(current))
1309 slack = 0; 1244 slack = 0;
1310 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, 1245 hrtimer_init_on_stack(&t.timer,
1311 HRTIMER_MODE_ABS); 1246 clockrt ? CLOCK_REALTIME :
1247 CLOCK_MONOTONIC,
1248 HRTIMER_MODE_ABS);
1312 hrtimer_init_sleeper(&t, current); 1249 hrtimer_init_sleeper(&t, current);
1313 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); 1250 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
1314 1251
@@ -1363,6 +1300,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1363 1300
1364 if (fshared) 1301 if (fshared)
1365 restart->futex.flags |= FLAGS_SHARED; 1302 restart->futex.flags |= FLAGS_SHARED;
1303 if (clockrt)
1304 restart->futex.flags |= FLAGS_CLOCKRT;
1366 return -ERESTART_RESTARTBLOCK; 1305 return -ERESTART_RESTARTBLOCK;
1367 } 1306 }
1368 1307
@@ -1370,7 +1309,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1370 queue_unlock(&q, hb); 1309 queue_unlock(&q, hb);
1371 1310
1372 out_release_sem: 1311 out_release_sem:
1373 futex_unlock_mm(fshared); 1312 put_futex_key(fshared, &q.key);
1374 return ret; 1313 return ret;
1375} 1314}
1376 1315
@@ -1378,15 +1317,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1378static long futex_wait_restart(struct restart_block *restart) 1317static long futex_wait_restart(struct restart_block *restart)
1379{ 1318{
1380 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1319 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1381 struct rw_semaphore *fshared = NULL; 1320 int fshared = 0;
1382 ktime_t t; 1321 ktime_t t;
1383 1322
1384 t.tv64 = restart->futex.time; 1323 t.tv64 = restart->futex.time;
1385 restart->fn = do_no_restart_syscall; 1324 restart->fn = do_no_restart_syscall;
1386 if (restart->futex.flags & FLAGS_SHARED) 1325 if (restart->futex.flags & FLAGS_SHARED)
1387 fshared = &current->mm->mmap_sem; 1326 fshared = 1;
1388 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1327 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1389 restart->futex.bitset); 1328 restart->futex.bitset,
1329 restart->futex.flags & FLAGS_CLOCKRT);
1390} 1330}
1391 1331
1392 1332
@@ -1396,7 +1336,7 @@ static long futex_wait_restart(struct restart_block *restart)
1396 * if there are waiters then it will block, it does PI, etc. (Due to 1336 * if there are waiters then it will block, it does PI, etc. (Due to
1397 * races the kernel might see a 0 value of the futex too.) 1337 * races the kernel might see a 0 value of the futex too.)
1398 */ 1338 */
1399static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, 1339static int futex_lock_pi(u32 __user *uaddr, int fshared,
1400 int detect, ktime_t *time, int trylock) 1340 int detect, ktime_t *time, int trylock)
1401{ 1341{
1402 struct hrtimer_sleeper timeout, *to = NULL; 1342 struct hrtimer_sleeper timeout, *to = NULL;
@@ -1419,8 +1359,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1419 1359
1420 q.pi_state = NULL; 1360 q.pi_state = NULL;
1421 retry: 1361 retry:
1422 futex_lock_mm(fshared); 1362 q.key = FUTEX_KEY_INIT;
1423
1424 ret = get_futex_key(uaddr, fshared, &q.key); 1363 ret = get_futex_key(uaddr, fshared, &q.key);
1425 if (unlikely(ret != 0)) 1364 if (unlikely(ret != 0))
1426 goto out_release_sem; 1365 goto out_release_sem;
@@ -1509,7 +1448,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1509 * exit to complete. 1448 * exit to complete.
1510 */ 1449 */
1511 queue_unlock(&q, hb); 1450 queue_unlock(&q, hb);
1512 futex_unlock_mm(fshared);
1513 cond_resched(); 1451 cond_resched();
1514 goto retry; 1452 goto retry;
1515 1453
@@ -1541,12 +1479,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1541 */ 1479 */
1542 queue_me(&q, hb); 1480 queue_me(&q, hb);
1543 1481
1544 /*
1545 * Now the futex is queued and we have checked the data, we
1546 * don't want to hold mmap_sem while we sleep.
1547 */
1548 futex_unlock_mm(fshared);
1549
1550 WARN_ON(!q.pi_state); 1482 WARN_ON(!q.pi_state);
1551 /* 1483 /*
1552 * Block on the PI mutex: 1484 * Block on the PI mutex:
@@ -1559,7 +1491,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1559 ret = ret ? 0 : -EWOULDBLOCK; 1491 ret = ret ? 0 : -EWOULDBLOCK;
1560 } 1492 }
1561 1493
1562 futex_lock_mm(fshared);
1563 spin_lock(q.lock_ptr); 1494 spin_lock(q.lock_ptr);
1564 1495
1565 if (!ret) { 1496 if (!ret) {
@@ -1625,7 +1556,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1625 1556
1626 /* Unqueue and drop the lock */ 1557 /* Unqueue and drop the lock */
1627 unqueue_me_pi(&q); 1558 unqueue_me_pi(&q);
1628 futex_unlock_mm(fshared);
1629 1559
1630 if (to) 1560 if (to)
1631 destroy_hrtimer_on_stack(&to->timer); 1561 destroy_hrtimer_on_stack(&to->timer);
@@ -1635,34 +1565,30 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1635 queue_unlock(&q, hb); 1565 queue_unlock(&q, hb);
1636 1566
1637 out_release_sem: 1567 out_release_sem:
1638 futex_unlock_mm(fshared); 1568 put_futex_key(fshared, &q.key);
1639 if (to) 1569 if (to)
1640 destroy_hrtimer_on_stack(&to->timer); 1570 destroy_hrtimer_on_stack(&to->timer);
1641 return ret; 1571 return ret;
1642 1572
1643 uaddr_faulted: 1573 uaddr_faulted:
1644 /* 1574 /*
1645 * We have to r/w *(int __user *)uaddr, but we can't modify it 1575 * We have to r/w *(int __user *)uaddr, and we have to modify it
1646 * non-atomically. Therefore, if get_user below is not 1576 * atomically. Therefore, if we continue to fault after get_user()
1647 * enough, we need to handle the fault ourselves, while 1577 * below, we need to handle the fault ourselves, while still holding
1648 * still holding the mmap_sem. 1578 * the mmap_sem. This can occur if the uaddr is under contention as
1649 * 1579 * we have to drop the mmap_sem in order to call get_user().
1650 * ... and hb->lock. :-) --ANK
1651 */ 1580 */
1652 queue_unlock(&q, hb); 1581 queue_unlock(&q, hb);
1653 1582
1654 if (attempt++) { 1583 if (attempt++) {
1655 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1584 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1656 attempt);
1657 if (ret) 1585 if (ret)
1658 goto out_release_sem; 1586 goto out_release_sem;
1659 goto retry_unlocked; 1587 goto retry_unlocked;
1660 } 1588 }
1661 1589
1662 futex_unlock_mm(fshared);
1663
1664 ret = get_user(uval, uaddr); 1590 ret = get_user(uval, uaddr);
1665 if (!ret && (uval != -EFAULT)) 1591 if (!ret)
1666 goto retry; 1592 goto retry;
1667 1593
1668 if (to) 1594 if (to)
@@ -1675,13 +1601,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1675 * This is the in-kernel slowpath: we look up the PI state (if any), 1601 * This is the in-kernel slowpath: we look up the PI state (if any),
1676 * and do the rt-mutex unlock. 1602 * and do the rt-mutex unlock.
1677 */ 1603 */
1678static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) 1604static int futex_unlock_pi(u32 __user *uaddr, int fshared)
1679{ 1605{
1680 struct futex_hash_bucket *hb; 1606 struct futex_hash_bucket *hb;
1681 struct futex_q *this, *next; 1607 struct futex_q *this, *next;
1682 u32 uval; 1608 u32 uval;
1683 struct plist_head *head; 1609 struct plist_head *head;
1684 union futex_key key; 1610 union futex_key key = FUTEX_KEY_INIT;
1685 int ret, attempt = 0; 1611 int ret, attempt = 0;
1686 1612
1687retry: 1613retry:
@@ -1692,10 +1618,6 @@ retry:
1692 */ 1618 */
1693 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1619 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1694 return -EPERM; 1620 return -EPERM;
1695 /*
1696 * First take all the futex related locks:
1697 */
1698 futex_lock_mm(fshared);
1699 1621
1700 ret = get_futex_key(uaddr, fshared, &key); 1622 ret = get_futex_key(uaddr, fshared, &key);
1701 if (unlikely(ret != 0)) 1623 if (unlikely(ret != 0))
@@ -1754,34 +1676,30 @@ retry_unlocked:
1754out_unlock: 1676out_unlock:
1755 spin_unlock(&hb->lock); 1677 spin_unlock(&hb->lock);
1756out: 1678out:
1757 futex_unlock_mm(fshared); 1679 put_futex_key(fshared, &key);
1758 1680
1759 return ret; 1681 return ret;
1760 1682
1761pi_faulted: 1683pi_faulted:
1762 /* 1684 /*
1763 * We have to r/w *(int __user *)uaddr, but we can't modify it 1685 * We have to r/w *(int __user *)uaddr, and we have to modify it
1764 * non-atomically. Therefore, if get_user below is not 1686 * atomically. Therefore, if we continue to fault after get_user()
1765 * enough, we need to handle the fault ourselves, while 1687 * below, we need to handle the fault ourselves, while still holding
1766 * still holding the mmap_sem. 1688 * the mmap_sem. This can occur if the uaddr is under contention as
1767 * 1689 * we have to drop the mmap_sem in order to call get_user().
1768 * ... and hb->lock. --ANK
1769 */ 1690 */
1770 spin_unlock(&hb->lock); 1691 spin_unlock(&hb->lock);
1771 1692
1772 if (attempt++) { 1693 if (attempt++) {
1773 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1694 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1774 attempt);
1775 if (ret) 1695 if (ret)
1776 goto out; 1696 goto out;
1777 uval = 0; 1697 uval = 0;
1778 goto retry_unlocked; 1698 goto retry_unlocked;
1779 } 1699 }
1780 1700
1781 futex_unlock_mm(fshared);
1782
1783 ret = get_user(uval, uaddr); 1701 ret = get_user(uval, uaddr);
1784 if (!ret && (uval != -EFAULT)) 1702 if (!ret)
1785 goto retry; 1703 goto retry;
1786 1704
1787 return ret; 1705 return ret;
@@ -1908,8 +1826,7 @@ retry:
1908 * PI futexes happens in exit_pi_state(): 1826 * PI futexes happens in exit_pi_state():
1909 */ 1827 */
1910 if (!pi && (uval & FUTEX_WAITERS)) 1828 if (!pi && (uval & FUTEX_WAITERS))
1911 futex_wake(uaddr, &curr->mm->mmap_sem, 1, 1829 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
1912 FUTEX_BITSET_MATCH_ANY);
1913 } 1830 }
1914 return 0; 1831 return 0;
1915} 1832}
@@ -2003,18 +1920,22 @@ void exit_robust_list(struct task_struct *curr)
2003long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 1920long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2004 u32 __user *uaddr2, u32 val2, u32 val3) 1921 u32 __user *uaddr2, u32 val2, u32 val3)
2005{ 1922{
2006 int ret = -ENOSYS; 1923 int clockrt, ret = -ENOSYS;
2007 int cmd = op & FUTEX_CMD_MASK; 1924 int cmd = op & FUTEX_CMD_MASK;
2008 struct rw_semaphore *fshared = NULL; 1925 int fshared = 0;
2009 1926
2010 if (!(op & FUTEX_PRIVATE_FLAG)) 1927 if (!(op & FUTEX_PRIVATE_FLAG))
2011 fshared = &current->mm->mmap_sem; 1928 fshared = 1;
1929
1930 clockrt = op & FUTEX_CLOCK_REALTIME;
1931 if (clockrt && cmd != FUTEX_WAIT_BITSET)
1932 return -ENOSYS;
2012 1933
2013 switch (cmd) { 1934 switch (cmd) {
2014 case FUTEX_WAIT: 1935 case FUTEX_WAIT:
2015 val3 = FUTEX_BITSET_MATCH_ANY; 1936 val3 = FUTEX_BITSET_MATCH_ANY;
2016 case FUTEX_WAIT_BITSET: 1937 case FUTEX_WAIT_BITSET:
2017 ret = futex_wait(uaddr, fshared, val, timeout, val3); 1938 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
2018 break; 1939 break;
2019 case FUTEX_WAKE: 1940 case FUTEX_WAKE:
2020 val3 = FUTEX_BITSET_MATCH_ANY; 1941 val3 = FUTEX_BITSET_MATCH_ANY;
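
The do_futex() hunk above replaces the old mmap_sem pointer with a plain fshared flag derived from FUTEX_PRIVATE_FLAG and permits FUTEX_CLOCK_REALTIME only together with FUTEX_WAIT_BITSET. A minimal userspace sketch of how those op bits reach the kernel through the raw futex syscall (constants come from <linux/futex.h>; the one-second absolute timeout is purely illustrative):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdint.h>
    #include <errno.h>
    #include <string.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    static long futex_wait_bitset(uint32_t *uaddr, uint32_t expected,
                                  const struct timespec *abs_timeout)
    {
            /* Private futex + CLOCK_REALTIME absolute timeout, the combination
             * that this change allows only for FUTEX_WAIT_BITSET. */
            int op = FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME;

            return syscall(SYS_futex, uaddr, op, expected, abs_timeout,
                           NULL, FUTEX_BITSET_MATCH_ANY);
    }

    int main(void)
    {
            uint32_t futex_word = 1;        /* does not match 'expected' below */
            struct timespec ts;

            clock_gettime(CLOCK_REALTIME, &ts);
            ts.tv_sec += 1;                 /* absolute timeout, 1s from now */

            /* Returns immediately with EAGAIN because *uaddr != expected. */
            if (futex_wait_bitset(&futex_word, 0, &ts) == -1)
                    printf("futex: %s\n", strerror(errno));
            return 0;
    }
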
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 801addda3c43..e9d1c8205a3b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -673,6 +673,18 @@ int request_irq(unsigned int irq, irq_handler_t handler,
673 struct irq_desc *desc; 673 struct irq_desc *desc;
674 int retval; 674 int retval;
675 675
676 /*
677 * handle_IRQ_event() always ignores IRQF_DISABLED except for
678 * the _first_ irqaction (sigh). That can cause oopsing, but
679 * the behavior is classified as "will not fix" so we need to
680 * start nudging drivers away from using that idiom.
681 */
682 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
683 == (IRQF_SHARED|IRQF_DISABLED))
684 pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
685 "guaranteed on shared IRQs\n",
686 irq, devname);
687
676#ifdef CONFIG_LOCKDEP 688#ifdef CONFIG_LOCKDEP
677 /* 689 /*
678 * Lockdep wants atomic interrupt handlers: 690 * Lockdep wants atomic interrupt handlers:
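
The new request_irq() warning fires only when a caller asks for IRQF_SHARED and IRQF_DISABLED at the same time. A standalone sketch of that mask test (the flag values below are illustrative stand-ins, not taken from the kernel headers):

    #include <stdio.h>

    #define IRQF_DISABLED  0x00000020      /* illustrative values only */
    #define IRQF_SHARED    0x00000080

    static void check_irqflags(unsigned long irqflags, int irq, const char *devname)
    {
            /* Warn only when both bits are set: DISABLED is honoured just
             * for the first action on a shared line. */
            if ((irqflags & (IRQF_SHARED | IRQF_DISABLED)) ==
                (IRQF_SHARED | IRQF_DISABLED))
                    printf("IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
                           irq, devname);
    }

    int main(void)
    {
            check_irqflags(IRQF_SHARED | IRQF_DISABLED, 17, "demo");   /* warns */
            check_irqflags(IRQF_DISABLED, 17, "demo");                 /* silent */
            return 0;
    }
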
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 74b1878b8bb8..06b0c3568f0b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -137,16 +137,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
137#ifdef CONFIG_LOCK_STAT 137#ifdef CONFIG_LOCK_STAT
138static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 138static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
139 139
140static int lock_contention_point(struct lock_class *class, unsigned long ip) 140static int lock_point(unsigned long points[], unsigned long ip)
141{ 141{
142 int i; 142 int i;
143 143
144 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 144 for (i = 0; i < LOCKSTAT_POINTS; i++) {
145 if (class->contention_point[i] == 0) { 145 if (points[i] == 0) {
146 class->contention_point[i] = ip; 146 points[i] = ip;
147 break; 147 break;
148 } 148 }
149 if (class->contention_point[i] == ip) 149 if (points[i] == ip)
150 break; 150 break;
151 } 151 }
152 152
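
lock_point() generalizes the old per-class contention_point lookup so the same slot-allocation logic can serve both the contention and the new contending IP tables. A userspace sketch of that logic (the LOCKSTAT_POINTS value is illustrative):

    #include <stdio.h>

    #define LOCKSTAT_POINTS 4       /* small fixed-size table, as in lockdep */

    /* Return the slot recording 'ip', claiming the first free slot on
     * first sight; returns LOCKSTAT_POINTS when the table is full. */
    static int lock_point(unsigned long points[], unsigned long ip)
    {
            int i;

            for (i = 0; i < LOCKSTAT_POINTS; i++) {
                    if (points[i] == 0) {
                            points[i] = ip;
                            break;
                    }
                    if (points[i] == ip)
                            break;
            }
            return i;
    }

    int main(void)
    {
            unsigned long contention_point[LOCKSTAT_POINTS] = { 0 };

            printf("%d\n", lock_point(contention_point, 0x1000));  /* 0: new slot */
            printf("%d\n", lock_point(contention_point, 0x2000));  /* 1: new slot */
            printf("%d\n", lock_point(contention_point, 0x1000));  /* 0: reused  */
            return 0;
    }
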
@@ -186,6 +186,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
186 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 186 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
187 stats.contention_point[i] += pcs->contention_point[i]; 187 stats.contention_point[i] += pcs->contention_point[i];
188 188
189 for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
190 stats.contending_point[i] += pcs->contending_point[i];
191
189 lock_time_add(&pcs->read_waittime, &stats.read_waittime); 192 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
190 lock_time_add(&pcs->write_waittime, &stats.write_waittime); 193 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
191 194
@@ -210,6 +213,7 @@ void clear_lock_stats(struct lock_class *class)
210 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 213 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
211 } 214 }
212 memset(class->contention_point, 0, sizeof(class->contention_point)); 215 memset(class->contention_point, 0, sizeof(class->contention_point));
216 memset(class->contending_point, 0, sizeof(class->contending_point));
213} 217}
214 218
215static struct lock_class_stats *get_lock_stats(struct lock_class *class) 219static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -288,14 +292,12 @@ void lockdep_off(void)
288{ 292{
289 current->lockdep_recursion++; 293 current->lockdep_recursion++;
290} 294}
291
292EXPORT_SYMBOL(lockdep_off); 295EXPORT_SYMBOL(lockdep_off);
293 296
294void lockdep_on(void) 297void lockdep_on(void)
295{ 298{
296 current->lockdep_recursion--; 299 current->lockdep_recursion--;
297} 300}
298
299EXPORT_SYMBOL(lockdep_on); 301EXPORT_SYMBOL(lockdep_on);
300 302
301/* 303/*
@@ -577,7 +579,8 @@ static void print_lock_class_header(struct lock_class *class, int depth)
577/* 579/*
578 * printk all lock dependencies starting at <entry>: 580 * printk all lock dependencies starting at <entry>:
579 */ 581 */
580static void print_lock_dependencies(struct lock_class *class, int depth) 582static void __used
583print_lock_dependencies(struct lock_class *class, int depth)
581{ 584{
582 struct lock_list *entry; 585 struct lock_list *entry;
583 586
@@ -2509,7 +2512,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2509 if (subclass) 2512 if (subclass)
2510 register_lock_class(lock, subclass, 1); 2513 register_lock_class(lock, subclass, 1);
2511} 2514}
2512
2513EXPORT_SYMBOL_GPL(lockdep_init_map); 2515EXPORT_SYMBOL_GPL(lockdep_init_map);
2514 2516
2515/* 2517/*
@@ -2690,8 +2692,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2690} 2692}
2691 2693
2692static int 2694static int
2693__lock_set_subclass(struct lockdep_map *lock, 2695__lock_set_class(struct lockdep_map *lock, const char *name,
2694 unsigned int subclass, unsigned long ip) 2696 struct lock_class_key *key, unsigned int subclass,
2697 unsigned long ip)
2695{ 2698{
2696 struct task_struct *curr = current; 2699 struct task_struct *curr = current;
2697 struct held_lock *hlock, *prev_hlock; 2700 struct held_lock *hlock, *prev_hlock;
@@ -2718,6 +2721,7 @@ __lock_set_subclass(struct lockdep_map *lock,
2718 return print_unlock_inbalance_bug(curr, lock, ip); 2721 return print_unlock_inbalance_bug(curr, lock, ip);
2719 2722
2720found_it: 2723found_it:
2724 lockdep_init_map(lock, name, key, 0);
2721 class = register_lock_class(lock, subclass, 0); 2725 class = register_lock_class(lock, subclass, 0);
2722 hlock->class_idx = class - lock_classes + 1; 2726 hlock->class_idx = class - lock_classes + 1;
2723 2727
@@ -2902,9 +2906,9 @@ static void check_flags(unsigned long flags)
2902#endif 2906#endif
2903} 2907}
2904 2908
2905void 2909void lock_set_class(struct lockdep_map *lock, const char *name,
2906lock_set_subclass(struct lockdep_map *lock, 2910 struct lock_class_key *key, unsigned int subclass,
2907 unsigned int subclass, unsigned long ip) 2911 unsigned long ip)
2908{ 2912{
2909 unsigned long flags; 2913 unsigned long flags;
2910 2914
@@ -2914,13 +2918,12 @@ lock_set_subclass(struct lockdep_map *lock,
2914 raw_local_irq_save(flags); 2918 raw_local_irq_save(flags);
2915 current->lockdep_recursion = 1; 2919 current->lockdep_recursion = 1;
2916 check_flags(flags); 2920 check_flags(flags);
2917 if (__lock_set_subclass(lock, subclass, ip)) 2921 if (__lock_set_class(lock, name, key, subclass, ip))
2918 check_chain_key(current); 2922 check_chain_key(current);
2919 current->lockdep_recursion = 0; 2923 current->lockdep_recursion = 0;
2920 raw_local_irq_restore(flags); 2924 raw_local_irq_restore(flags);
2921} 2925}
2922 2926EXPORT_SYMBOL_GPL(lock_set_class);
2923EXPORT_SYMBOL_GPL(lock_set_subclass);
2924 2927
2925/* 2928/*
2926 * We are not always called with irqs disabled - do that here, 2929 * We are not always called with irqs disabled - do that here,
@@ -2944,7 +2947,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2944 current->lockdep_recursion = 0; 2947 current->lockdep_recursion = 0;
2945 raw_local_irq_restore(flags); 2948 raw_local_irq_restore(flags);
2946} 2949}
2947
2948EXPORT_SYMBOL_GPL(lock_acquire); 2950EXPORT_SYMBOL_GPL(lock_acquire);
2949 2951
2950void lock_release(struct lockdep_map *lock, int nested, 2952void lock_release(struct lockdep_map *lock, int nested,
@@ -2962,7 +2964,6 @@ void lock_release(struct lockdep_map *lock, int nested,
2962 current->lockdep_recursion = 0; 2964 current->lockdep_recursion = 0;
2963 raw_local_irq_restore(flags); 2965 raw_local_irq_restore(flags);
2964} 2966}
2965
2966EXPORT_SYMBOL_GPL(lock_release); 2967EXPORT_SYMBOL_GPL(lock_release);
2967 2968
2968#ifdef CONFIG_LOCK_STAT 2969#ifdef CONFIG_LOCK_STAT
@@ -3000,7 +3001,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3000 struct held_lock *hlock, *prev_hlock; 3001 struct held_lock *hlock, *prev_hlock;
3001 struct lock_class_stats *stats; 3002 struct lock_class_stats *stats;
3002 unsigned int depth; 3003 unsigned int depth;
3003 int i, point; 3004 int i, contention_point, contending_point;
3004 3005
3005 depth = curr->lockdep_depth; 3006 depth = curr->lockdep_depth;
3006 if (DEBUG_LOCKS_WARN_ON(!depth)) 3007 if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3024,18 +3025,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3024found_it: 3025found_it:
3025 hlock->waittime_stamp = sched_clock(); 3026 hlock->waittime_stamp = sched_clock();
3026 3027
3027 point = lock_contention_point(hlock_class(hlock), ip); 3028 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3029 contending_point = lock_point(hlock_class(hlock)->contending_point,
3030 lock->ip);
3028 3031
3029 stats = get_lock_stats(hlock_class(hlock)); 3032 stats = get_lock_stats(hlock_class(hlock));
3030 if (point < ARRAY_SIZE(stats->contention_point)) 3033 if (contention_point < LOCKSTAT_POINTS)
3031 stats->contention_point[point]++; 3034 stats->contention_point[contention_point]++;
3035 if (contending_point < LOCKSTAT_POINTS)
3036 stats->contending_point[contending_point]++;
3032 if (lock->cpu != smp_processor_id()) 3037 if (lock->cpu != smp_processor_id())
3033 stats->bounces[bounce_contended + !!hlock->read]++; 3038 stats->bounces[bounce_contended + !!hlock->read]++;
3034 put_lock_stats(stats); 3039 put_lock_stats(stats);
3035} 3040}
3036 3041
3037static void 3042static void
3038__lock_acquired(struct lockdep_map *lock) 3043__lock_acquired(struct lockdep_map *lock, unsigned long ip)
3039{ 3044{
3040 struct task_struct *curr = current; 3045 struct task_struct *curr = current;
3041 struct held_lock *hlock, *prev_hlock; 3046 struct held_lock *hlock, *prev_hlock;
@@ -3084,6 +3089,7 @@ found_it:
3084 put_lock_stats(stats); 3089 put_lock_stats(stats);
3085 3090
3086 lock->cpu = cpu; 3091 lock->cpu = cpu;
3092 lock->ip = ip;
3087} 3093}
3088 3094
3089void lock_contended(struct lockdep_map *lock, unsigned long ip) 3095void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3105,7 +3111,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3105} 3111}
3106EXPORT_SYMBOL_GPL(lock_contended); 3112EXPORT_SYMBOL_GPL(lock_contended);
3107 3113
3108void lock_acquired(struct lockdep_map *lock) 3114void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3109{ 3115{
3110 unsigned long flags; 3116 unsigned long flags;
3111 3117
@@ -3118,7 +3124,7 @@ void lock_acquired(struct lockdep_map *lock)
3118 raw_local_irq_save(flags); 3124 raw_local_irq_save(flags);
3119 check_flags(flags); 3125 check_flags(flags);
3120 current->lockdep_recursion = 1; 3126 current->lockdep_recursion = 1;
3121 __lock_acquired(lock); 3127 __lock_acquired(lock, ip);
3122 current->lockdep_recursion = 0; 3128 current->lockdep_recursion = 0;
3123 raw_local_irq_restore(flags); 3129 raw_local_irq_restore(flags);
3124} 3130}
@@ -3442,7 +3448,6 @@ retry:
3442 if (unlock) 3448 if (unlock)
3443 read_unlock(&tasklist_lock); 3449 read_unlock(&tasklist_lock);
3444} 3450}
3445
3446EXPORT_SYMBOL_GPL(debug_show_all_locks); 3451EXPORT_SYMBOL_GPL(debug_show_all_locks);
3447 3452
3448/* 3453/*
@@ -3463,7 +3468,6 @@ void debug_show_held_locks(struct task_struct *task)
3463{ 3468{
3464 __debug_show_held_locks(task); 3469 __debug_show_held_locks(task);
3465} 3470}
3466
3467EXPORT_SYMBOL_GPL(debug_show_held_locks); 3471EXPORT_SYMBOL_GPL(debug_show_held_locks);
3468 3472
3469void lockdep_sys_exit(void) 3473void lockdep_sys_exit(void)
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 20dbcbf9c7dd..13716b813896 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
470 470
471static void snprint_time(char *buf, size_t bufsiz, s64 nr) 471static void snprint_time(char *buf, size_t bufsiz, s64 nr)
472{ 472{
473 unsigned long rem; 473 s64 div;
474 s32 rem;
474 475
475 nr += 5; /* for display rounding */ 476 nr += 5; /* for display rounding */
476 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 477 div = div_s64_rem(nr, 1000, &rem);
477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); 478 snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
478} 479}
479 480
480static void seq_time(struct seq_file *m, s64 time) 481static void seq_time(struct seq_file *m, s64 time)
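
snprint_time() switches from do_div(), which only handles unsigned dividends (hence the old "XXX: do_div_signed" note), to div_s64_rem(). A userspace sketch of the signed divide-with-remainder feeding the "%lld.%02d" formatting, with plain C division standing in for the kernel helper:

    #include <stdio.h>
    #include <stdint.h>

    /* Format a signed time value as "xxx.yy", mirroring snprint_time()'s
     * rounding and scaling. */
    static void snprint_time(char *buf, size_t bufsiz, int64_t nr)
    {
            int64_t div;
            int32_t rem;

            nr += 5;                 /* display rounding */
            div = nr / 1000;         /* div_s64_rem(nr, 1000, &rem) in the kernel */
            rem = (int32_t)(nr % 1000);
            snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem / 10);
    }

    int main(void)
    {
            char buf[32];

            snprint_time(buf, sizeof(buf), 12345);   /* prints "12.35" */
            puts(buf);
            return 0;
    }
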
@@ -556,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
556 if (stats->read_holdtime.nr) 557 if (stats->read_holdtime.nr)
557 namelen += 2; 558 namelen += 2;
558 559
559 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 560 for (i = 0; i < LOCKSTAT_POINTS; i++) {
560 char sym[KSYM_SYMBOL_LEN]; 561 char sym[KSYM_SYMBOL_LEN];
561 char ip[32]; 562 char ip[32];
562 563
@@ -573,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
573 stats->contention_point[i], 574 stats->contention_point[i],
574 ip, sym); 575 ip, sym);
575 } 576 }
577 for (i = 0; i < LOCKSTAT_POINTS; i++) {
578 char sym[KSYM_SYMBOL_LEN];
579 char ip[32];
580
581 if (class->contending_point[i] == 0)
582 break;
583
584 if (!i)
585 seq_line(m, '-', 40-namelen, namelen);
586
587 sprint_symbol(sym, class->contending_point[i]);
588 snprintf(ip, sizeof(ip), "[<%p>]",
589 (void *)class->contending_point[i]);
590 seq_printf(m, "%40s %14lu %29s %s\n", name,
591 stats->contending_point[i],
592 ip, sym);
593 }
576 if (i) { 594 if (i) {
577 seq_puts(m, "\n"); 595 seq_puts(m, "\n");
578 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 596 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -582,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
582 600
583static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
584{ 602{
585 seq_printf(m, "lock_stat version 0.2\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
586 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
587 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
588 "%14s %14s\n", 606 "%14s %14s\n",
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779dc65d4..4f45d4b658ef 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
59 * We also put the fastpath first in the kernel image, to make sure the 59 * We also put the fastpath first in the kernel image, to make sure the
60 * branch is predicted by the CPU as default-untaken. 60 * branch is predicted by the CPU as default-untaken.
61 */ 61 */
62static void noinline __sched 62static __used noinline void __sched
63__mutex_lock_slowpath(atomic_t *lock_count); 63__mutex_lock_slowpath(atomic_t *lock_count);
64 64
65/*** 65/***
@@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock)
96EXPORT_SYMBOL(mutex_lock); 96EXPORT_SYMBOL(mutex_lock);
97#endif 97#endif
98 98
99static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 99static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
100 100
101/*** 101/***
102 * mutex_unlock - release the mutex 102 * mutex_unlock - release the mutex
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
184 } 184 }
185 185
186done: 186done:
187 lock_acquired(&lock->dep_map); 187 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 188 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 189 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
190 debug_mutex_set_owner(lock, task_thread_info(task)); 190 debug_mutex_set_owner(lock, task_thread_info(task));
@@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
268/* 268/*
269 * Release the lock, slowpath: 269 * Release the lock, slowpath:
270 */ 270 */
271static noinline void 271static __used noinline void
272__mutex_unlock_slowpath(atomic_t *lock_count) 272__mutex_unlock_slowpath(atomic_t *lock_count)
273{ 273{
274 __mutex_unlock_common_slowpath(lock_count, 1); 274 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
313} 313}
314EXPORT_SYMBOL(mutex_lock_killable); 314EXPORT_SYMBOL(mutex_lock_killable);
315 315
316static noinline void __sched 316static __used noinline void __sched
317__mutex_lock_slowpath(atomic_t *lock_count) 317__mutex_lock_slowpath(atomic_t *lock_count)
318{ 318{
319 struct mutex *lock = container_of(lock_count, struct mutex, count); 319 struct mutex *lock = container_of(lock_count, struct mutex, count);
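
The mutex slowpath helpers gain __used because their only callers are the architecture fastpath stubs (often inline assembly), which the compiler cannot see; without the attribute, newer gccs may discard the static functions as dead code. A minimal sketch of the same annotation in plain gcc C (the function and the direct call are illustrative):

    #include <stdio.h>

    /* Without __attribute__((used)) -- spelled __used in the kernel -- gcc at
     * -O2 may drop this static function entirely if its only reference lives
     * in hand-written assembly the compiler never parses. */
    static void __attribute__((used)) slowpath_helper(void)
    {
            puts("slow path taken");
    }

    int main(void)
    {
            /* Called directly here only so the example is self-contained; in
             * the mutex code the call comes from the fastpath asm stub. */
            slowpath_helper();
            return 0;
    }
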
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4282c0a40a57..61d5aa5eced3 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -82,6 +82,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference(nb->next);
85
86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
88 WARN(1, "Invalid notifier called!");
89 nb = next_nb;
90 continue;
91 }
92#endif
85 ret = nb->notifier_call(nb, val, v); 93 ret = nb->notifier_call(nb, val, v);
86 94
87 if (nr_calls) 95 if (nr_calls)
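
With CONFIG_DEBUG_NOTIFIERS, notifier_call_chain() now skips and warns about callbacks whose function pointer does not point into kernel text, catching stale or corrupted notifier blocks. A generic userspace sketch of skipping invalid entries while walking a callback chain (a NULL check stands in for func_ptr_is_kernel_text()):

    #include <stdio.h>

    struct notifier_block {
            int (*notifier_call)(struct notifier_block *nb, unsigned long val);
            struct notifier_block *next;
    };

    static int say_hello(struct notifier_block *nb, unsigned long val)
    {
            printf("hello, val=%lu\n", val);
            return 0;
    }

    static void notifier_call_chain(struct notifier_block *nb, unsigned long val)
    {
            while (nb) {
                    struct notifier_block *next = nb->next;

                    if (!nb->notifier_call) {       /* stand-in validity check */
                            fprintf(stderr, "Invalid notifier skipped!\n");
                            nb = next;
                            continue;
                    }
                    nb->notifier_call(nb, val);
                    nb = next;
            }
    }

    int main(void)
    {
            struct notifier_block bad  = { .notifier_call = NULL,      .next = NULL };
            struct notifier_block good = { .notifier_call = say_hello, .next = &bad };

            notifier_call_chain(&good, 7);
            return 0;
    }
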
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d5088355bfe..13f06349a786 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,6 +21,7 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/dmi.h>
24 25
25int panic_on_oops; 26int panic_on_oops;
26static unsigned long tainted_mask; 27static unsigned long tainted_mask;
@@ -321,36 +322,27 @@ void oops_exit(void)
321} 322}
322 323
323#ifdef WANT_WARN_ON_SLOWPATH 324#ifdef WANT_WARN_ON_SLOWPATH
324void warn_on_slowpath(const char *file, int line)
325{
326 char function[KSYM_SYMBOL_LEN];
327 unsigned long caller = (unsigned long) __builtin_return_address(0);
328 sprint_symbol(function, caller);
329
330 printk(KERN_WARNING "------------[ cut here ]------------\n");
331 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
332 line, function);
333 print_modules();
334 dump_stack();
335 print_oops_end_marker();
336 add_taint(TAINT_WARN);
337}
338EXPORT_SYMBOL(warn_on_slowpath);
339
340
341void warn_slowpath(const char *file, int line, const char *fmt, ...) 325void warn_slowpath(const char *file, int line, const char *fmt, ...)
342{ 326{
343 va_list args; 327 va_list args;
344 char function[KSYM_SYMBOL_LEN]; 328 char function[KSYM_SYMBOL_LEN];
345 unsigned long caller = (unsigned long)__builtin_return_address(0); 329 unsigned long caller = (unsigned long)__builtin_return_address(0);
330 const char *board;
331
346 sprint_symbol(function, caller); 332 sprint_symbol(function, caller);
347 333
348 printk(KERN_WARNING "------------[ cut here ]------------\n"); 334 printk(KERN_WARNING "------------[ cut here ]------------\n");
349 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, 335 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
350 line, function); 336 line, function);
351 va_start(args, fmt); 337 board = dmi_get_system_info(DMI_PRODUCT_NAME);
352 vprintk(fmt, args); 338 if (board)
353 va_end(args); 339 printk(KERN_WARNING "Hardware name: %s\n", board);
340
341 if (fmt) {
342 va_start(args, fmt);
343 vprintk(fmt, args);
344 va_end(args);
345 }
354 346
355 print_modules(); 347 print_modules();
356 dump_stack(); 348 dump_stack();
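
warn_on_slowpath() is folded into warn_slowpath(), which now tolerates a NULL format (for plain WARN_ON) and additionally prints the DMI board name. A userspace sketch of the optional-format pattern, with the hardware-name lookup omitted:

    #include <stdarg.h>
    #include <stdio.h>

    /* One warning routine serving both WARN_ON() (fmt == NULL) and
     * WARN(cond, fmt, ...) callers, as in the consolidated warn_slowpath(). */
    static void warn_slowpath(const char *file, int line, const char *fmt, ...)
    {
            va_list args;

            printf("------------[ cut here ]------------\n");
            printf("WARNING: at %s:%d\n", file, line);

            if (fmt) {
                    va_start(args, fmt);
                    vprintf(fmt, args);
                    va_end(args);
            }
    }

    int main(void)
    {
            warn_slowpath(__FILE__, __LINE__, NULL);                    /* WARN_ON style */
            warn_slowpath(__FILE__, __LINE__, "bad value %d\n", 42);    /* WARN style */
            return 0;
    }
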
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 4e5288a831de..157de3a47832 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -58,21 +58,21 @@ void thread_group_cputime(
58 struct task_struct *tsk, 58 struct task_struct *tsk,
59 struct task_cputime *times) 59 struct task_cputime *times)
60{ 60{
61 struct signal_struct *sig; 61 struct task_cputime *totals, *tot;
62 int i; 62 int i;
63 struct task_cputime *tot;
64 63
65 sig = tsk->signal; 64 totals = tsk->signal->cputime.totals;
66 if (unlikely(!sig) || !sig->cputime.totals) { 65 if (!totals) {
67 times->utime = tsk->utime; 66 times->utime = tsk->utime;
68 times->stime = tsk->stime; 67 times->stime = tsk->stime;
69 times->sum_exec_runtime = tsk->se.sum_exec_runtime; 68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
70 return; 69 return;
71 } 70 }
71
72 times->stime = times->utime = cputime_zero; 72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0; 73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) { 74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(tsk->signal->cputime.totals, i); 75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime); 76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime); 77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime; 78 times->sum_exec_runtime += tot->sum_exec_runtime;
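
thread_group_cputime() now reads the per-CPU totals pointer once and falls back to the task's own counters when group accounting is off. A userspace sketch of summing per-CPU totals with that single-read fallback (the struct and CPU count are simplified):

    #include <stdio.h>

    #define NR_CPUS 4

    struct task_cputime {
            unsigned long utime, stime;
    };

    /* Sum per-CPU totals; if accounting is disabled (totals == NULL), fall
     * back to the single task's counters, as the kernel code does. */
    static void thread_group_cputime(const struct task_cputime *totals,
                                     const struct task_cputime *task,
                                     struct task_cputime *times)
    {
            int i;

            if (!totals) {
                    *times = *task;
                    return;
            }
            times->utime = times->stime = 0;
            for (i = 0; i < NR_CPUS; i++) {
                    times->utime += totals[i].utime;
                    times->stime += totals[i].stime;
            }
    }

    int main(void)
    {
            struct task_cputime per_cpu[NR_CPUS] = {
                    { 1, 2 }, { 3, 4 }, { 5, 6 }, { 7, 8 }
            };
            struct task_cputime self = { 10, 20 }, out;

            thread_group_cputime(per_cpu, &self, &out);
            printf("utime=%lu stime=%lu\n", out.utime, out.stime);  /* 16 / 20 */

            thread_group_cputime(NULL, &self, &out);                /* fallback path */
            printf("utime=%lu stime=%lu\n", out.utime, out.stime);  /* 10 / 20 */
            return 0;
    }
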
diff --git a/kernel/printk.c b/kernel/printk.c
index f492f1583d77..e651ab05655f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -662,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
662 if (recursion_bug) { 662 if (recursion_bug) {
663 recursion_bug = 0; 663 recursion_bug = 0;
664 strcpy(printk_buf, recursion_bug_msg); 664 strcpy(printk_buf, recursion_bug_msg);
665 printed_len = sizeof(recursion_bug_msg); 665 printed_len = strlen(recursion_bug_msg);
666 } 666 }
667 /* Emit the output into the temporary buffer */ 667 /* Emit the output into the temporary buffer */
668 printed_len += vscnprintf(printk_buf + printed_len, 668 printed_len += vscnprintf(printk_buf + printed_len,
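
The printk fix swaps sizeof() for strlen(): recursion_bug_msg is a string constant, and sizeof of the char array counts the terminating NUL, so the old code over-reported printed_len and corrupted the later length accounting. A two-line userspace illustration (the message text is abbreviated):

    #include <stdio.h>
    #include <string.h>

    static const char recursion_bug_msg[] = "BUG: recent printk recursion!\n";

    int main(void)
    {
            /* sizeof counts the trailing NUL, strlen does not; using sizeof
             * as a string length is off by one. */
            printf("sizeof = %zu, strlen = %zu\n",
                   sizeof(recursion_bug_msg), strlen(recursion_bug_msg));
            return 0;
    }
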
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 37f72e551542..e503a002f330 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -191,7 +191,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
191 191
192 /* OK, time to rat on our buddy... */ 192 /* OK, time to rat on our buddy... */
193 193
194 printk(KERN_ERR "RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
@@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{ 204{
205 unsigned long flags; 205 unsigned long flags;
206 206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", 207 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies, 208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start); 209 jiffies - rcp->gp_start);
210 dump_stack(); 210 dump_stack();
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 59236e8b9daa..04982659875a 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -551,6 +551,16 @@ void rcu_irq_exit(void)
551 } 551 }
552} 552}
553 553
554void rcu_nmi_enter(void)
555{
556 rcu_irq_enter();
557}
558
559void rcu_nmi_exit(void)
560{
561 rcu_irq_exit();
562}
563
554static void dyntick_save_progress_counter(int cpu) 564static void dyntick_save_progress_counter(int cpu)
555{ 565{
556 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); 566 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 35c2d3360ecf..7c2665cac172 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
149 sp->done_length += cp->done_length; 149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add; 150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove; 151 sp->done_remove += cp->done_remove;
152 atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); 152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks; 153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_set(&sp->rcu_try_flip_1, 154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 atomic_read(&cp->rcu_try_flip_1)); 155 &sp->rcu_try_flip_1);
156 atomic_set(&sp->rcu_try_flip_e1, 156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 atomic_read(&cp->rcu_try_flip_e1)); 157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; 158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; 159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; 160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
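
rcupreempt_trace_sum() accumulates per-CPU counters into a summary, so atomic_set(), which overwrites the running total with the last CPU's value, is replaced by atomic_add(). A C11 userspace sketch of the difference:

    #include <stdatomic.h>
    #include <stdio.h>

    int main(void)
    {
            int per_cpu[3] = { 4, 7, 2 };       /* counters from three CPUs */
            atomic_int sum_wrong = 0, sum_right = 0;
            int i;

            for (i = 0; i < 3; i++) {
                    atomic_store(&sum_wrong, per_cpu[i]);        /* old: keeps last value only */
                    atomic_fetch_add(&sum_right, per_cpu[i]);    /* new: accumulates */
            }
            printf("set: %d, add: %d\n",
                   atomic_load(&sum_wrong), atomic_load(&sum_right));   /* 2 vs 13 */
            return 0;
    }
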
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 85cb90588a55..b31065522104 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,6 +39,7 @@
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/reboot.h>
42#include <linux/freezer.h> 43#include <linux/freezer.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/delay.h> 45#include <linux/delay.h>
@@ -108,7 +109,6 @@ struct rcu_torture {
108 int rtort_mbtest; 109 int rtort_mbtest;
109}; 110};
110 111
111static int fullstop = 0; /* stop generating callbacks at test end. */
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current = NULL;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version = 0;
@@ -136,6 +136,30 @@ static int stutter_pause_test = 0;
136#endif 136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138 138
139#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */
140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */
141static int fullstop; /* stop generating callbacks at test end. */
142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */
143 /* spawning of kthreads. */
144
145/*
146 * Detect and respond to a signal-based shutdown.
147 */
148static int
149rcutorture_shutdown_notify(struct notifier_block *unused1,
150 unsigned long unused2, void *unused3)
151{
152 if (fullstop)
153 return NOTIFY_DONE;
154 if (signal_pending(current)) {
155 mutex_lock(&fullstop_mutex);
156 if (!ACCESS_ONCE(fullstop))
157 fullstop = FULLSTOP_SIGNALED;
158 mutex_unlock(&fullstop_mutex);
159 }
160 return NOTIFY_DONE;
161}
162
139/* 163/*
140 * Allocate an element from the rcu_tortures pool. 164 * Allocate an element from the rcu_tortures pool.
141 */ 165 */
@@ -199,11 +223,12 @@ rcu_random(struct rcu_random_state *rrsp)
199static void 223static void
200rcu_stutter_wait(void) 224rcu_stutter_wait(void)
201{ 225{
202 while (stutter_pause_test || !rcutorture_runnable) 226 while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) {
203 if (rcutorture_runnable) 227 if (rcutorture_runnable)
204 schedule_timeout_interruptible(1); 228 schedule_timeout_interruptible(1);
205 else 229 else
206 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 230 schedule_timeout_interruptible(round_jiffies_relative(HZ));
231 }
207} 232}
208 233
209/* 234/*
@@ -599,7 +624,7 @@ rcu_torture_writer(void *arg)
599 rcu_stutter_wait(); 624 rcu_stutter_wait();
600 } while (!kthread_should_stop() && !fullstop); 625 } while (!kthread_should_stop() && !fullstop);
601 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 626 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
602 while (!kthread_should_stop()) 627 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
603 schedule_timeout_uninterruptible(1); 628 schedule_timeout_uninterruptible(1);
604 return 0; 629 return 0;
605} 630}
@@ -624,7 +649,7 @@ rcu_torture_fakewriter(void *arg)
624 } while (!kthread_should_stop() && !fullstop); 649 } while (!kthread_should_stop() && !fullstop);
625 650
626 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 651 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
627 while (!kthread_should_stop()) 652 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
628 schedule_timeout_uninterruptible(1); 653 schedule_timeout_uninterruptible(1);
629 return 0; 654 return 0;
630} 655}
@@ -734,7 +759,7 @@ rcu_torture_reader(void *arg)
734 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 759 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
735 if (irqreader && cur_ops->irqcapable) 760 if (irqreader && cur_ops->irqcapable)
736 del_timer_sync(&t); 761 del_timer_sync(&t);
737 while (!kthread_should_stop()) 762 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
738 schedule_timeout_uninterruptible(1); 763 schedule_timeout_uninterruptible(1);
739 return 0; 764 return 0;
740} 765}
@@ -831,7 +856,7 @@ rcu_torture_stats(void *arg)
831 do { 856 do {
832 schedule_timeout_interruptible(stat_interval * HZ); 857 schedule_timeout_interruptible(stat_interval * HZ);
833 rcu_torture_stats_print(); 858 rcu_torture_stats_print();
834 } while (!kthread_should_stop()); 859 } while (!kthread_should_stop() && !fullstop);
835 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 860 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
836 return 0; 861 return 0;
837} 862}
@@ -899,7 +924,7 @@ rcu_torture_shuffle(void *arg)
899 do { 924 do {
900 schedule_timeout_interruptible(shuffle_interval * HZ); 925 schedule_timeout_interruptible(shuffle_interval * HZ);
901 rcu_torture_shuffle_tasks(); 926 rcu_torture_shuffle_tasks();
902 } while (!kthread_should_stop()); 927 } while (!kthread_should_stop() && !fullstop);
903 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); 928 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
904 return 0; 929 return 0;
905} 930}
@@ -914,10 +939,10 @@ rcu_torture_stutter(void *arg)
914 do { 939 do {
915 schedule_timeout_interruptible(stutter * HZ); 940 schedule_timeout_interruptible(stutter * HZ);
916 stutter_pause_test = 1; 941 stutter_pause_test = 1;
917 if (!kthread_should_stop()) 942 if (!kthread_should_stop() && !fullstop)
918 schedule_timeout_interruptible(stutter * HZ); 943 schedule_timeout_interruptible(stutter * HZ);
919 stutter_pause_test = 0; 944 stutter_pause_test = 0;
920 } while (!kthread_should_stop()); 945 } while (!kthread_should_stop() && !fullstop);
921 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); 946 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
922 return 0; 947 return 0;
923} 948}
@@ -934,12 +959,27 @@ rcu_torture_print_module_parms(char *tag)
934 stutter, irqreader); 959 stutter, irqreader);
935} 960}
936 961
962static struct notifier_block rcutorture_nb = {
963 .notifier_call = rcutorture_shutdown_notify,
964};
965
937static void 966static void
938rcu_torture_cleanup(void) 967rcu_torture_cleanup(void)
939{ 968{
940 int i; 969 int i;
941 970
942 fullstop = 1; 971 mutex_lock(&fullstop_mutex);
972 if (!fullstop) {
973 /* If being signaled, let it happen, then exit. */
974 mutex_unlock(&fullstop_mutex);
975 schedule_timeout_interruptible(10 * HZ);
976 if (cur_ops->cb_barrier != NULL)
977 cur_ops->cb_barrier();
978 return;
979 }
980 fullstop = FULLSTOP_CLEANUP;
981 mutex_unlock(&fullstop_mutex);
982 unregister_reboot_notifier(&rcutorture_nb);
943 if (stutter_task) { 983 if (stutter_task) {
944 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 984 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
945 kthread_stop(stutter_task); 985 kthread_stop(stutter_task);
@@ -1015,6 +1055,8 @@ rcu_torture_init(void)
1015 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1055 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1016 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1056 &srcu_ops, &sched_ops, &sched_ops_sync, };
1017 1057
1058 mutex_lock(&fullstop_mutex);
1059
1018 /* Process args and tell the world that the torturer is on the job. */ 1060 /* Process args and tell the world that the torturer is on the job. */
1019 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1061 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
1020 cur_ops = torture_ops[i]; 1062 cur_ops = torture_ops[i];
@@ -1024,6 +1066,7 @@ rcu_torture_init(void)
1024 if (i == ARRAY_SIZE(torture_ops)) { 1066 if (i == ARRAY_SIZE(torture_ops)) {
1025 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1067 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1026 torture_type); 1068 torture_type);
1069 mutex_unlock(&fullstop_mutex);
1027 return (-EINVAL); 1070 return (-EINVAL);
1028 } 1071 }
1029 if (cur_ops->init) 1072 if (cur_ops->init)
@@ -1146,9 +1189,12 @@ rcu_torture_init(void)
1146 goto unwind; 1189 goto unwind;
1147 } 1190 }
1148 } 1191 }
1192 register_reboot_notifier(&rcutorture_nb);
1193 mutex_unlock(&fullstop_mutex);
1149 return 0; 1194 return 0;
1150 1195
1151unwind: 1196unwind:
1197 mutex_unlock(&fullstop_mutex);
1152 rcu_torture_cleanup(); 1198 rcu_torture_cleanup();
1153 return firsterr; 1199 return firsterr;
1154} 1200}
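
The rcutorture changes introduce a fullstop flag guarded by fullstop_mutex so the reboot/signal path and normal module cleanup agree on who stops the kthreads, and stop them exactly once. A pthreads sketch of the same coordination (the names are the kernel's; the threading model is not):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    #define FULLSTOP_SIGNALED 1     /* bail due to signal/reboot */
    #define FULLSTOP_CLEANUP  2     /* orderly module cleanup */

    static int fullstop;
    static pthread_mutex_t fullstop_mutex = PTHREAD_MUTEX_INITIALIZER;

    static void *worker(void *unused)
    {
            for (;;) {
                    pthread_mutex_lock(&fullstop_mutex);
                    if (fullstop) {
                            pthread_mutex_unlock(&fullstop_mutex);
                            break;
                    }
                    pthread_mutex_unlock(&fullstop_mutex);
                    usleep(1000);   /* one unit of torture work */
            }
            return NULL;
    }

    int main(void)
    {
            pthread_t tid;

            pthread_create(&tid, NULL, worker, NULL);

            /* Cleanup path: claim the transition under the mutex, exactly once. */
            pthread_mutex_lock(&fullstop_mutex);
            if (!fullstop)
                    fullstop = FULLSTOP_CLEANUP;
            pthread_mutex_unlock(&fullstop_mutex);

            pthread_join(tid, NULL);
            puts("workers stopped");
            return 0;
    }
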
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 000000000000..a342b032112c
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1535 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
23 *
24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 *
27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU
29 */
30#include <linux/types.h>
31#include <linux/kernel.h>
32#include <linux/init.h>
33#include <linux/spinlock.h>
34#include <linux/smp.h>
35#include <linux/rcupdate.h>
36#include <linux/interrupt.h>
37#include <linux/sched.h>
38#include <asm/atomic.h>
39#include <linux/bitops.h>
40#include <linux/module.h>
41#include <linux/completion.h>
42#include <linux/moduleparam.h>
43#include <linux/percpu.h>
44#include <linux/notifier.h>
45#include <linux/cpu.h>
46#include <linux/mutex.h>
47#include <linux/time.h>
48
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map =
52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif
55
56/* Data structures. */
57
58#define RCU_STATE_INITIALIZER(name) { \
59 .level = { &name.node[0] }, \
60 .levelcnt = { \
61 NUM_RCU_LVL_0, /* root of hierarchy. */ \
62 NUM_RCU_LVL_1, \
63 NUM_RCU_LVL_2, \
64 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
65 }, \
66 .signaled = RCU_SIGNAL_INIT, \
67 .gpnum = -300, \
68 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
71 .n_force_qs = 0, \
72 .n_force_qs_ngp = 0, \
73}
74
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data);
77
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80
81#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
83#endif /* #ifdef CONFIG_NO_HZ */
84
85static int blimit = 10; /* Maximum callbacks per softirq. */
86static int qhimark = 10000; /* If this many pending, ignore blimit. */
87static int qlowmark = 100; /* Once only this many pending, use blimit. */
88
89static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
90
91/*
92 * Return the number of RCU batches processed thus far for debug & stats.
93 */
94long rcu_batches_completed(void)
95{
96 return rcu_state.completed;
97}
98EXPORT_SYMBOL_GPL(rcu_batches_completed);
99
100/*
101 * Return the number of RCU BH batches processed thus far for debug & stats.
102 */
103long rcu_batches_completed_bh(void)
104{
105 return rcu_bh_state.completed;
106}
107EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
108
109/*
110 * Does the CPU have callbacks ready to be invoked?
111 */
112static int
113cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
114{
115 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
116}
117
118/*
 119 * Does the current CPU require an as-yet-unscheduled grace period?
120 */
121static int
122cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
123{
124 /* ACCESS_ONCE() because we are accessing outside of lock. */
125 return *rdp->nxttail[RCU_DONE_TAIL] &&
126 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
127}
128
129/*
130 * Return the root node of the specified rcu_state structure.
131 */
132static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
133{
134 return &rsp->node[0];
135}
136
137#ifdef CONFIG_SMP
138
139/*
140 * If the specified CPU is offline, tell the caller that it is in
141 * a quiescent state. Otherwise, whack it with a reschedule IPI.
142 * Grace periods can end up waiting on an offline CPU when that
143 * CPU is in the process of coming online -- it will be added to the
144 * rcu_node bitmasks before it actually makes it online. The same thing
 145 * can happen while a CPU is in the process of going offline. Because this
146 * race is quite rare, we check for it after detecting that the grace
147 * period has been delayed rather than checking each and every CPU
148 * each and every time we start a new grace period.
149 */
150static int rcu_implicit_offline_qs(struct rcu_data *rdp)
151{
152 /*
153 * If the CPU is offline, it is in a quiescent state. We can
154 * trust its state not to change because interrupts are disabled.
155 */
156 if (cpu_is_offline(rdp->cpu)) {
157 rdp->offline_fqs++;
158 return 1;
159 }
160
161 /* The CPU is online, so send it a reschedule IPI. */
162 if (rdp->cpu != smp_processor_id())
163 smp_send_reschedule(rdp->cpu);
164 else
165 set_need_resched();
166 rdp->resched_ipi++;
167 return 0;
168}
169
170#endif /* #ifdef CONFIG_SMP */
171
172#ifdef CONFIG_NO_HZ
173static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
174
175/**
176 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
177 *
178 * Enter nohz mode, in other words, -leave- the mode in which RCU
179 * read-side critical sections can occur. (Though RCU read-side
180 * critical sections can occur in irq handlers in nohz mode, a possibility
181 * handled by rcu_irq_enter() and rcu_irq_exit()).
182 */
183void rcu_enter_nohz(void)
184{
185 unsigned long flags;
186 struct rcu_dynticks *rdtp;
187
188 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
189 local_irq_save(flags);
190 rdtp = &__get_cpu_var(rcu_dynticks);
191 rdtp->dynticks++;
192 rdtp->dynticks_nesting--;
193 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
194 local_irq_restore(flags);
195}
196
197/*
198 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
199 *
200 * Exit nohz mode, in other words, -enter- the mode in which RCU
201 * read-side critical sections normally occur.
202 */
203void rcu_exit_nohz(void)
204{
205 unsigned long flags;
206 struct rcu_dynticks *rdtp;
207
208 local_irq_save(flags);
209 rdtp = &__get_cpu_var(rcu_dynticks);
210 rdtp->dynticks++;
211 rdtp->dynticks_nesting++;
212 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
213 local_irq_restore(flags);
214 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
215}
216
217/**
218 * rcu_nmi_enter - inform RCU of entry to NMI context
219 *
220 * If the CPU was idle with dynamic ticks active, and there is no
221 * irq handler running, this updates rdtp->dynticks_nmi to let the
222 * RCU grace-period handling know that the CPU is active.
223 */
224void rcu_nmi_enter(void)
225{
226 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
227
228 if (rdtp->dynticks & 0x1)
229 return;
230 rdtp->dynticks_nmi++;
231 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
232 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
233}
234
235/**
236 * rcu_nmi_exit - inform RCU of exit from NMI context
237 *
238 * If the CPU was idle with dynamic ticks active, and there is no
239 * irq handler running, this updates rdtp->dynticks_nmi to let the
240 * RCU grace-period handling know that the CPU is no longer active.
241 */
242void rcu_nmi_exit(void)
243{
244 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
245
246 if (rdtp->dynticks & 0x1)
247 return;
248 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
249 rdtp->dynticks_nmi++;
250 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
251}
252
253/**
254 * rcu_irq_enter - inform RCU of entry to hard irq context
255 *
256 * If the CPU was idle with dynamic ticks active, this updates the
257 * rdtp->dynticks to let the RCU handling know that the CPU is active.
258 */
259void rcu_irq_enter(void)
260{
261 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
262
263 if (rdtp->dynticks_nesting++)
264 return;
265 rdtp->dynticks++;
266 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
267 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
268}
269
270/**
271 * rcu_irq_exit - inform RCU of exit from hard irq context
272 *
273 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
 274 * to let the RCU handling be aware that the CPU is going back to idle
275 * with no ticks.
276 */
277void rcu_irq_exit(void)
278{
279 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
280
281 if (--rdtp->dynticks_nesting)
282 return;
283 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
284 rdtp->dynticks++;
285 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
286
287 /* If the interrupt queued a callback, get out of dyntick mode. */
288 if (__get_cpu_var(rcu_data).nxtlist ||
289 __get_cpu_var(rcu_bh_data).nxtlist)
290 set_need_resched();
291}
292
293/*
294 * Record the specified "completed" value, which is later used to validate
295 * dynticks counter manipulations. Specify "rsp->completed - 1" to
296 * unconditionally invalidate any future dynticks manipulations (which is
297 * useful at the beginning of a grace period).
298 */
299static void dyntick_record_completed(struct rcu_state *rsp, long comp)
300{
301 rsp->dynticks_completed = comp;
302}
303
304#ifdef CONFIG_SMP
305
306/*
307 * Recall the previously recorded value of the completion for dynticks.
308 */
309static long dyntick_recall_completed(struct rcu_state *rsp)
310{
311 return rsp->dynticks_completed;
312}
313
314/*
315 * Snapshot the specified CPU's dynticks counter so that we can later
316 * credit them with an implicit quiescent state. Return 1 if this CPU
317 * is already in a quiescent state courtesy of dynticks idle mode.
318 */
319static int dyntick_save_progress_counter(struct rcu_data *rdp)
320{
321 int ret;
322 int snap;
323 int snap_nmi;
324
325 snap = rdp->dynticks->dynticks;
326 snap_nmi = rdp->dynticks->dynticks_nmi;
327 smp_mb(); /* Order sampling of snap with end of grace period. */
328 rdp->dynticks_snap = snap;
329 rdp->dynticks_nmi_snap = snap_nmi;
330 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
331 if (ret)
332 rdp->dynticks_fqs++;
333 return ret;
334}
335
336/*
337 * Return true if the specified CPU has passed through a quiescent
 338 * state by virtue of being in or having passed through a dynticks
339 * idle state since the last call to dyntick_save_progress_counter()
340 * for this same CPU.
341 */
342static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
343{
344 long curr;
345 long curr_nmi;
346 long snap;
347 long snap_nmi;
348
349 curr = rdp->dynticks->dynticks;
350 snap = rdp->dynticks_snap;
351 curr_nmi = rdp->dynticks->dynticks_nmi;
352 snap_nmi = rdp->dynticks_nmi_snap;
353 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
354
355 /*
356 * If the CPU passed through or entered a dynticks idle phase with
357 * no active irq/NMI handlers, then we can safely pretend that the CPU
358 * already acknowledged the request to pass through a quiescent
359 * state. Either way, that CPU cannot possibly be in an RCU
360 * read-side critical section that started before the beginning
361 * of the current RCU grace period.
362 */
363 if ((curr != snap || (curr & 0x1) == 0) &&
364 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
365 rdp->dynticks_fqs++;
366 return 1;
367 }
368
369 /* Go check for the CPU being offline. */
370 return rcu_implicit_offline_qs(rdp);
371}
372
373#endif /* #ifdef CONFIG_SMP */
374
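
dyntick_save_progress_counter() and rcu_implicit_dynticks_qs() above rely on the dynticks counter being even while the CPU is idle and odd while it is active: if the snapshot was even, or the counter has moved at all since the snapshot, the CPU has been through a quiescent state. A single-threaded userspace sketch of that comparison, for illustration only:

    #include <stdio.h>

    /* Even counter value: CPU idle (quiescent); odd: inside a busy period. */
    static int counter_implies_qs(long snap, long curr)
    {
            return curr != snap || (curr & 0x1) == 0;
    }

    int main(void)
    {
            long dynticks = 0;              /* starts idle: even */
            long snap;

            snap = dynticks;                /* grace period takes a snapshot */
            printf("%d\n", counter_implies_qs(snap, dynticks)); /* 1: even => idle */

            dynticks++;                     /* CPU becomes active: odd */
            snap = dynticks;
            printf("%d\n", counter_implies_qs(snap, dynticks)); /* 0: busy, unchanged */

            dynticks++;                     /* goes idle again: counter moved */
            printf("%d\n", counter_implies_qs(snap, dynticks)); /* 1: passed through idle */
            return 0;
    }
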
375#else /* #ifdef CONFIG_NO_HZ */
376
377static void dyntick_record_completed(struct rcu_state *rsp, long comp)
378{
379}
380
381#ifdef CONFIG_SMP
382
383/*
384 * If there are no dynticks, then the only way that a CPU can passively
385 * be in a quiescent state is to be offline. Unlike dynticks idle, which
386 * is a point in time during the prior (already finished) grace period,
387 * an offline CPU is always in a quiescent state, and thus can be
388 * unconditionally applied. So just return the current value of completed.
389 */
390static long dyntick_recall_completed(struct rcu_state *rsp)
391{
392 return rsp->completed;
393}
394
395static int dyntick_save_progress_counter(struct rcu_data *rdp)
396{
397 return 0;
398}
399
400static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
401{
402 return rcu_implicit_offline_qs(rdp);
403}
404
405#endif /* #ifdef CONFIG_SMP */
406
407#endif /* #else #ifdef CONFIG_NO_HZ */
408
409#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
410
411static void record_gp_stall_check_time(struct rcu_state *rsp)
412{
413 rsp->gp_start = jiffies;
414 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
415}
416
417static void print_other_cpu_stall(struct rcu_state *rsp)
418{
419 int cpu;
420 long delta;
421 unsigned long flags;
422 struct rcu_node *rnp = rcu_get_root(rsp);
423 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
424 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
425
426 /* Only let one CPU complain about others per time interval. */
427
428 spin_lock_irqsave(&rnp->lock, flags);
429 delta = jiffies - rsp->jiffies_stall;
430 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
431 spin_unlock_irqrestore(&rnp->lock, flags);
432 return;
433 }
434 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
435 spin_unlock_irqrestore(&rnp->lock, flags);
436
437 /* OK, time to rat on our buddy... */
438
439 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
440 for (; rnp_cur < rnp_end; rnp_cur++) {
441 if (rnp_cur->qsmask == 0)
442 continue;
443 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
444 if (rnp_cur->qsmask & (1UL << cpu))
445 printk(" %d", rnp_cur->grplo + cpu);
446 }
447 printk(" (detected by %d, t=%ld jiffies)\n",
448 smp_processor_id(), (long)(jiffies - rsp->gp_start));
449 force_quiescent_state(rsp, 0); /* Kick them all. */
450}
451
452static void print_cpu_stall(struct rcu_state *rsp)
453{
454 unsigned long flags;
455 struct rcu_node *rnp = rcu_get_root(rsp);
456
457 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
458 smp_processor_id(), jiffies - rsp->gp_start);
459 dump_stack();
460 spin_lock_irqsave(&rnp->lock, flags);
461 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
462 rsp->jiffies_stall =
463 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
464 spin_unlock_irqrestore(&rnp->lock, flags);
465 set_need_resched(); /* kick ourselves to get things going. */
466}
467
468static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
469{
470 long delta;
471 struct rcu_node *rnp;
472
473 delta = jiffies - rsp->jiffies_stall;
474 rnp = rdp->mynode;
475 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
476
477 /* We haven't checked in, so go dump stack. */
478 print_cpu_stall(rsp);
479
480 } else if (rsp->gpnum != rsp->completed &&
481 delta >= RCU_STALL_RAT_DELAY) {
482
483 /* They had two time units to dump stack, so complain. */
484 print_other_cpu_stall(rsp);
485 }
486}
487
488#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
489
490static void record_gp_stall_check_time(struct rcu_state *rsp)
491{
492}
493
494static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
495{
496}
497
498#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
499
500/*
501 * Update CPU-local rcu_data state to record the newly noticed grace period.
502 * This is used both when we started the grace period and when we notice
503 * that someone else started the grace period.
504 */
505static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
506{
507 rdp->qs_pending = 1;
508 rdp->passed_quiesc = 0;
509 rdp->gpnum = rsp->gpnum;
510 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
511 RCU_JIFFIES_TILL_FORCE_QS;
512}
513
514/*
 515 * Did someone else start a new RCU grace period since we last
516 * checked? Update local state appropriately if so. Must be called
517 * on the CPU corresponding to rdp.
518 */
519static int
520check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
521{
522 unsigned long flags;
523 int ret = 0;
524
525 local_irq_save(flags);
526 if (rdp->gpnum != rsp->gpnum) {
527 note_new_gpnum(rsp, rdp);
528 ret = 1;
529 }
530 local_irq_restore(flags);
531 return ret;
532}
533
534/*
535 * Start a new RCU grace period if warranted, re-initializing the hierarchy
536 * in preparation for detecting the next grace period. The caller must hold
537 * the root node's ->lock, which is released before return. Hard irqs must
538 * be disabled.
539 */
540static void
541rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
542 __releases(rcu_get_root(rsp)->lock)
543{
544 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
545 struct rcu_node *rnp = rcu_get_root(rsp);
546 struct rcu_node *rnp_cur;
547 struct rcu_node *rnp_end;
548
549 if (!cpu_needs_another_gp(rsp, rdp)) {
550 spin_unlock_irqrestore(&rnp->lock, flags);
551 return;
552 }
553
554 /* Advance to a new grace period and initialize state. */
555 rsp->gpnum++;
556 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
557 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
558 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
559 RCU_JIFFIES_TILL_FORCE_QS;
560 record_gp_stall_check_time(rsp);
561 dyntick_record_completed(rsp, rsp->completed - 1);
562 note_new_gpnum(rsp, rdp);
563
564 /*
565 * Because we are first, we know that all our callbacks will
566 * be covered by this upcoming grace period, even the ones
567 * that were registered arbitrarily recently.
568 */
569 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
570 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
571
572 /* Special-case the common single-level case. */
573 if (NUM_RCU_NODES == 1) {
574 rnp->qsmask = rnp->qsmaskinit;
575 spin_unlock_irqrestore(&rnp->lock, flags);
576 return;
577 }
578
579 spin_unlock(&rnp->lock); /* leave irqs disabled. */
580
581
582 /* Exclude any concurrent CPU-hotplug operations. */
583 spin_lock(&rsp->onofflock); /* irqs already disabled. */
584
585 /*
586 * Set the quiescent-state-needed bits in all the non-leaf RCU
587 * nodes for all currently online CPUs. This operation relies
588 * on the layout of the hierarchy within the rsp->node[] array.
589 * Note that other CPUs will access only the leaves of the
590 * hierarchy, which still indicate that no grace period is in
591 * progress. In addition, we have excluded CPU-hotplug operations.
592 *
593 * We therefore do not need to hold any locks. Any required
594 * memory barriers will be supplied by the locks guarding the
595 * leaf rcu_nodes in the hierarchy.
596 */
597
598 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
599 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
600 rnp_cur->qsmask = rnp_cur->qsmaskinit;
601
602 /*
603 * Now set up the leaf nodes. Here we must be careful. First,
604 * we need to hold the lock in order to exclude other CPUs, which
605 * might be contending for the leaf nodes' locks. Second, as
606 * soon as we initialize a given leaf node, its CPUs might run
607 * up the rest of the hierarchy. We must therefore acquire locks
608 * for each node that we touch during this stage. (But we still
609 * are excluding CPU-hotplug operations.)
610 *
611 * Note that the grace period cannot complete until we finish
612 * the initialization process, as there will be at least one
613 * qsmask bit set in the root node until that time, namely the
614 * one corresponding to this CPU.
615 */
616 rnp_end = &rsp->node[NUM_RCU_NODES];
617 rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
618 for (; rnp_cur < rnp_end; rnp_cur++) {
619 spin_lock(&rnp_cur->lock); /* irqs already disabled. */
620 rnp_cur->qsmask = rnp_cur->qsmaskinit;
621 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */
622 }
623
624 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
625 spin_unlock_irqrestore(&rsp->onofflock, flags);
626}
627
628/*
629 * Advance this CPU's callbacks, but only if the current grace period
630 * has ended. This may be called only from the CPU to whom the rdp
631 * belongs.
632 */
633static void
634rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
635{
636 long completed_snap;
637 unsigned long flags;
638
639 local_irq_save(flags);
640 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
641
642 /* Did another grace period end? */
643 if (rdp->completed != completed_snap) {
644
645 /* Advance callbacks. No harm if list empty. */
646 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
647 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
648 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
649
650 /* Remember that we saw this grace-period completion. */
651 rdp->completed = completed_snap;
652 }
653 local_irq_restore(flags);
654}
655
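The tail-pointer copying above (and the matching moves in rcu_start_gp()) is easier to see in isolation. The following user-space sketch, which is illustrative only and not part of the patch, models the four-segment callback list kept in ->nxtlist/->nxttail[]: advancing callbacks when a grace period ends is nothing more than sliding tail pointers forward. All identifiers (struct cb, struct seg_list, the seg_* helpers) are invented for the example, and the model is simplified to one slide per grace-period end.

#include <stdio.h>

#define DONE_TAIL       0
#define WAIT_TAIL       1
#define NEXT_READY_TAIL 2
#define NEXT_TAIL       3
#define NSEGS           4

struct cb {
    struct cb *next;
    int id;
};

struct seg_list {
    struct cb *list;          /* single linked list of callbacks */
    struct cb **tail[NSEGS];  /* one tail pointer per segment */
};

static void seg_init(struct seg_list *s)
{
    int i;

    s->list = NULL;
    for (i = 0; i < NSEGS; i++)
        s->tail[i] = &s->list;
}

static void seg_enqueue(struct seg_list *s, struct cb *c)
{
    c->next = NULL;
    *s->tail[NEXT_TAIL] = c;          /* append to the NEXT segment */
    s->tail[NEXT_TAIL] = &c->next;
}

/* A grace period ended: slide each segment boundary forward, oldest first. */
static void seg_advance(struct seg_list *s)
{
    s->tail[DONE_TAIL] = s->tail[WAIT_TAIL];
    s->tail[WAIT_TAIL] = s->tail[NEXT_READY_TAIL];
    s->tail[NEXT_READY_TAIL] = s->tail[NEXT_TAIL];
}

/* Count callbacks that have reached the DONE segment and may be invoked. */
static int seg_count_done(struct seg_list *s)
{
    struct cb **p = &s->list;
    int n = 0;

    while (p != s->tail[DONE_TAIL] && *p != NULL) {
        n++;
        p = &(*p)->next;
    }
    return n;
}

int main(void)
{
    struct seg_list s;
    struct cb a = { .id = 1 }, b = { .id = 2 };
    int gp;

    seg_init(&s);
    seg_enqueue(&s, &a);
    seg_enqueue(&s, &b);
    for (gp = 1; gp <= 3; gp++) {
        seg_advance(&s);
        printf("after %d grace period(s): %d ready to invoke\n",
               gp, seg_count_done(&s));   /* 0, then 0, then 2 */
    }
    return 0;
}
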
656/*
657 * Similar to cpu_quiet(), for which it is a helper function. Allows
658 * a group of CPUs to be quieted at one go, though all the CPUs in the
659 * group must be represented by the same leaf rcu_node structure.
660 * That structure's lock must be held upon entry, and it is released
661 * before return.
662 */
663static void
664cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
665 unsigned long flags)
666 __releases(rnp->lock)
667{
668 /* Walk up the rcu_node hierarchy. */
669 for (;;) {
670 if (!(rnp->qsmask & mask)) {
671
672 /* Our bit has already been cleared, so done. */
673 spin_unlock_irqrestore(&rnp->lock, flags);
674 return;
675 }
676 rnp->qsmask &= ~mask;
677 if (rnp->qsmask != 0) {
678
679 /* Other bits still set at this level, so done. */
680 spin_unlock_irqrestore(&rnp->lock, flags);
681 return;
682 }
683 mask = rnp->grpmask;
684 if (rnp->parent == NULL) {
685
686 /* No more levels. Exit loop holding root lock. */
687
688 break;
689 }
690 spin_unlock_irqrestore(&rnp->lock, flags);
691 rnp = rnp->parent;
692 spin_lock_irqsave(&rnp->lock, flags);
693 }
694
695 /*
696 * Get here if we are the last CPU to pass through a quiescent
697 * state for this grace period. Clean up and let rcu_start_gp()
698 * start up the next grace period if one is needed. Note that
699 * we still hold rnp->lock, as required by rcu_start_gp(), which
700 * will release it.
701 */
702 rsp->completed = rsp->gpnum;
703 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
704 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
705}
706
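A stripped-down, user-space model of the walk above may help: clear our bit in the leaf; if siblings are still pending, stop; otherwise clear this node's bit in its parent and repeat until the root empties. This is an illustrative sketch only, with all locking omitted and invented names (struct node_demo, report_qs); the tree shape is an assumed two-level example.

#include <stdio.h>

struct node_demo {
    unsigned long qsmask;      /* CPUs/children still owing a quiescent state */
    unsigned long grpmask;     /* our bit in the parent's qsmask */
    struct node_demo *parent;
};

static void report_qs(struct node_demo *np, unsigned long mask)
{
    for (;;) {
        if (!(np->qsmask & mask))
            return;                 /* already reported */
        np->qsmask &= ~mask;
        if (np->qsmask != 0)
            return;                 /* siblings still pending at this level */
        if (np->parent == NULL) {
            printf("grace period may now end\n");
            return;
        }
        mask = np->grpmask;         /* propagate one level up */
        np = np->parent;
    }
}

int main(void)
{
    struct node_demo root  = { .qsmask = 0x3 };
    struct node_demo leaf0 = { .qsmask = 0x3, .grpmask = 0x1, .parent = &root };
    struct node_demo leaf1 = { .qsmask = 0x1, .grpmask = 0x2, .parent = &root };

    report_qs(&leaf0, 0x1);   /* CPU 0: leaf0 still has CPU 1 pending */
    report_qs(&leaf0, 0x2);   /* CPU 1: leaf0 empties, clears bit 0x1 in root */
    report_qs(&leaf1, 0x1);   /* CPU 2: leaf1 empties, root empties, GP may end */
    return 0;
}
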
707/*
708 * Record a quiescent state for the specified CPU, which must either be
709 * the current CPU or an offline CPU. The lastcomp argument is used to
710 * make sure we are still in the grace period of interest. We don't want
711 * to end the current grace period based on quiescent states detected in
712 * an earlier grace period!
713 */
714static void
715cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
716{
717 unsigned long flags;
718 unsigned long mask;
719 struct rcu_node *rnp;
720
721 rnp = rdp->mynode;
722 spin_lock_irqsave(&rnp->lock, flags);
723 if (lastcomp != ACCESS_ONCE(rsp->completed)) {
724
725 /*
726 * Someone beat us to it for this grace period, so leave.
727 * The race with GP start is resolved by the fact that we
728 * hold the leaf rcu_node lock, so that the per-CPU bits
729 * cannot yet be initialized -- so we would simply find our
730 * CPU's bit already cleared in cpu_quiet_msk() if this race
731 * occurred.
732 */
733 rdp->passed_quiesc = 0; /* try again later! */
734 spin_unlock_irqrestore(&rnp->lock, flags);
735 return;
736 }
737 mask = rdp->grpmask;
738 if ((rnp->qsmask & mask) == 0) {
739 spin_unlock_irqrestore(&rnp->lock, flags);
740 } else {
741 rdp->qs_pending = 0;
742
743 /*
744 * This GP can't end until cpu checks in, so all of our
745 * callbacks can be processed during the next GP.
746 */
747 rdp = rsp->rda[smp_processor_id()];
748 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
749
750 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
751 }
752}
753
754/*
755 * Check to see if there is a new grace period of which this CPU
756 * is not yet aware, and if so, set up local rcu_data state for it.
757 * Otherwise, see if this CPU has just passed through its first
758 * quiescent state for this grace period, and record that fact if so.
759 */
760static void
761rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
762{
763 /* If there is now a new grace period, record and return. */
764 if (check_for_new_grace_period(rsp, rdp))
765 return;
766
767 /*
768 * Does this CPU still need to do its part for current grace period?
769 * If no, return and let the other CPUs do their part as well.
770 */
771 if (!rdp->qs_pending)
772 return;
773
774 /*
775 * Was there a quiescent state since the beginning of the grace
776 * period? If no, then exit and wait for the next call.
777 */
778 if (!rdp->passed_quiesc)
779 return;
780
781 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
782 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
783}
784
785#ifdef CONFIG_HOTPLUG_CPU
786
787/*
788 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
789 * and move all callbacks from the outgoing CPU to the current one.
790 */
791static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
792{
793 int i;
794 unsigned long flags;
795 long lastcomp;
796 unsigned long mask;
797 struct rcu_data *rdp = rsp->rda[cpu];
798 struct rcu_data *rdp_me;
799 struct rcu_node *rnp;
800
801 /* Exclude any attempts to start a new grace period. */
802 spin_lock_irqsave(&rsp->onofflock, flags);
803
804 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
805 rnp = rdp->mynode;
806 mask = rdp->grpmask; /* rnp->grplo is constant. */
807 do {
808 spin_lock(&rnp->lock); /* irqs already disabled. */
809 rnp->qsmaskinit &= ~mask;
810 if (rnp->qsmaskinit != 0) {
811 spin_unlock(&rnp->lock); /* irqs already disabled. */
812 break;
813 }
814 mask = rnp->grpmask;
815 spin_unlock(&rnp->lock); /* irqs already disabled. */
816 rnp = rnp->parent;
817 } while (rnp != NULL);
818 lastcomp = rsp->completed;
819
820 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
821
822 /* Being offline is a quiescent state, so go record it. */
823 cpu_quiet(cpu, rsp, rdp, lastcomp);
824
825 /*
826 * Move callbacks from the outgoing CPU to the running CPU.
827 * Note that the outgoing CPU is now quiescent, so it is now
828 * (uncharacteristically) safe to access its rcu_data structure.
829 * Note also that we must carefully retain the order of the
830 * outgoing CPU's callbacks in order for rcu_barrier() to work
831 * correctly. Finally, note that we start all the callbacks
832 * afresh, even those that have passed through a grace period
833 * and are therefore ready to invoke. The theory is that hotplug
834 * events are rare, and that if they are frequent enough to
835 * indefinitely delay callbacks, you have far worse things to
836 * be worrying about.
837 */
838 rdp_me = rsp->rda[smp_processor_id()];
839 if (rdp->nxtlist != NULL) {
840 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
841 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
842 rdp->nxtlist = NULL;
843 for (i = 0; i < RCU_NEXT_SIZE; i++)
844 rdp->nxttail[i] = &rdp->nxtlist;
845 rdp_me->qlen += rdp->qlen;
846 rdp->qlen = 0;
847 }
848 local_irq_restore(flags);
849}
850
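The order-preserving splice above is the part that matters for rcu_barrier(); a minimal user-space sketch of the same pointer surgery follows. It is illustrative only, with invented names (demo_cb, demo_cblist, demo_splice), and models just a head pointer plus a single tail pointer rather than the full nxttail[] array.

#include <stddef.h>
#include <stdio.h>

struct demo_cb {
    struct demo_cb *next;
    int id;
};

struct demo_cblist {
    struct demo_cb *head;
    struct demo_cb **tail;   /* points at the final ->next (or at head if empty) */
};

static void demo_splice(struct demo_cblist *dst, struct demo_cblist *src)
{
    if (src->head == NULL)
        return;
    *dst->tail = src->head;  /* append src's callbacks in their existing order */
    dst->tail = src->tail;
    src->head = NULL;
    src->tail = &src->head;
}

int main(void)
{
    struct demo_cb a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
    struct demo_cblist mine  = { .head = &a, .tail = &a.next };
    struct demo_cblist dying = { .head = &b, .tail = &c.next };
    struct demo_cb *p;

    b.next = &c;             /* the dying CPU held b then c */

    demo_splice(&mine, &dying);
    for (p = mine.head; p != NULL; p = p->next)
        printf("%d ", p->id);   /* prints: 1 2 3 */
    printf("\n");
    return 0;
}
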
851/*
852 * Remove the specified CPU from the RCU hierarchy and move any pending
853 * callbacks that it might have to the current CPU. This code assumes
854 * that at least one CPU in the system will remain running at all times.
855 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
856 */
857static void rcu_offline_cpu(int cpu)
858{
859 __rcu_offline_cpu(cpu, &rcu_state);
860 __rcu_offline_cpu(cpu, &rcu_bh_state);
861}
862
863#else /* #ifdef CONFIG_HOTPLUG_CPU */
864
865static void rcu_offline_cpu(int cpu)
866{
867}
868
869#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
870
871/*
872 * Invoke any RCU callbacks that have made it to the end of their grace
873 * period. Throttle as specified by rdp->blimit.
874 */
875static void rcu_do_batch(struct rcu_data *rdp)
876{
877 unsigned long flags;
878 struct rcu_head *next, *list, **tail;
879 int count;
880
881 /* If no callbacks are ready, just return. */
882 if (!cpu_has_callbacks_ready_to_invoke(rdp))
883 return;
884
885 /*
886 * Extract the list of ready callbacks, disabling interrupts to
887 * prevent races with call_rcu() from interrupt handlers.
888 */
889 local_irq_save(flags);
890 list = rdp->nxtlist;
891 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
892 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
893 tail = rdp->nxttail[RCU_DONE_TAIL];
894 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
895 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
896 rdp->nxttail[count] = &rdp->nxtlist;
897 local_irq_restore(flags);
898
899 /* Invoke callbacks. */
900 count = 0;
901 while (list) {
902 next = list->next;
903 prefetch(next);
904 list->func(list);
905 list = next;
906 if (++count >= rdp->blimit)
907 break;
908 }
909
910 local_irq_save(flags);
911
912 /* Update count, and requeue any remaining callbacks. */
913 rdp->qlen -= count;
914 if (list != NULL) {
915 *tail = rdp->nxtlist;
916 rdp->nxtlist = list;
917 for (count = 0; count < RCU_NEXT_SIZE; count++)
918 if (&rdp->nxtlist == rdp->nxttail[count])
919 rdp->nxttail[count] = tail;
920 else
921 break;
922 }
923
924 /* Reinstate batch limit if we have worked down the excess. */
925 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
926 rdp->blimit = blimit;
927
928 local_irq_restore(flags);
929
930 /* Re-raise the RCU softirq if there are callbacks remaining. */
931 if (cpu_has_callbacks_ready_to_invoke(rdp))
932 raise_softirq(RCU_SOFTIRQ);
933}
934
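The blimit handling at the end of rcu_do_batch() pairs with the qhimark test in __call_rcu() further below. The user-space sketch that follows isolates that throttle policy: open the batch limit when the queue floods past qhimark, restore it once the queue drains below qlowmark. The demo_* names and the numeric values are invented for illustration; in the kernel blimit, qhimark and qlowmark are module parameters.

#include <limits.h>
#include <stdio.h>

static long blimit_demo   = 10;    /* callbacks invoked per softirq pass */
static long qhimark_demo  = 10000; /* open the throttle above this queue length */
static long qlowmark_demo = 100;   /* restore the limit below this queue length */

/* Called when a callback is queued. */
static long on_enqueue(long qlen, long cur_limit)
{
    return (qlen > qhimark_demo) ? LONG_MAX : cur_limit;
}

/* Called after a batch has been invoked and the queue length updated. */
static long on_batch_done(long qlen, long cur_limit)
{
    return (cur_limit == LONG_MAX && qlen <= qlowmark_demo) ?
        blimit_demo : cur_limit;
}

int main(void)
{
    long limit = blimit_demo;

    limit = on_enqueue(20000, limit);     /* flood: limit becomes LONG_MAX */
    printf("limit after flood: %ld\n", limit);
    limit = on_batch_done(50, limit);     /* drained: limit back to 10 */
    printf("limit after drain: %ld\n", limit);
    return 0;
}
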
935/*
936 * Check to see if this CPU is in a non-context-switch quiescent state
937 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
938 * Also schedule the RCU softirq handler.
939 *
940 * This function must be called with hardirqs disabled. It is normally
941 * invoked from the scheduling-clock interrupt. If rcu_pending returns
942 * false, there is no point in invoking rcu_check_callbacks().
943 */
944void rcu_check_callbacks(int cpu, int user)
945{
946 if (user ||
947 (idle_cpu(cpu) && !in_softirq() &&
948 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
949
950 /*
951 * Get here if this CPU took its interrupt from user
952 * mode or from the idle loop, and if this is not a
953 * nested interrupt. In this case, the CPU is in
954 * a quiescent state, so count it.
955 *
956 * No memory barrier is required here because both
957 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
958 * only CPU-local variables that other CPUs neither
959 * access nor modify, at least not while the corresponding
960 * CPU is online.
961 */
962
963 rcu_qsctr_inc(cpu);
964 rcu_bh_qsctr_inc(cpu);
965
966 } else if (!in_softirq()) {
967
968 /*
969 * Get here if this CPU did not take its interrupt from
970 * softirq, in other words, if it is not interrupting
971 * a rcu_bh read-side critical section. This is a _bh
972 * critical section, so count it.
973 */
974
975 rcu_bh_qsctr_inc(cpu);
976 }
977 raise_softirq(RCU_SOFTIRQ);
978}
979
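Reduced to a pure function, the per-tick decision above looks like the sketch below. It is illustrative only: tick_qs() and its boolean arguments are invented, and the real code derives the inputs from idle_cpu(), in_softirq() and hardirq_count() as shown in rcu_check_callbacks().

#include <stdio.h>

#define QS_NONE 0
#define QS_BH   1   /* quiescent state for rcu_bh only */
#define QS_FULL 2   /* quiescent state for both rcu and rcu_bh */

static int tick_qs(int from_user, int from_idle_not_nested, int in_softirq)
{
    if (from_user || from_idle_not_nested)
        return QS_FULL;
    if (!in_softirq)
        return QS_BH;
    return QS_NONE;
}

int main(void)
{
    printf("%d %d %d\n",
           tick_qs(1, 0, 0),   /* tick arrived in user mode      -> 2 */
           tick_qs(0, 0, 0),   /* kernel code, not in softirq    -> 1 */
           tick_qs(0, 0, 1));  /* interrupted a softirq handler  -> 0 */
    return 0;
}
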
980#ifdef CONFIG_SMP
981
982/*
983 * Scan the leaf rcu_node structures, processing dyntick state for any that
984 * have not yet encountered a quiescent state, using the function specified.
985 * Returns 1 if the current grace period ends while scanning (possibly
986 * because we made it end).
987 */
988static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
989 int (*f)(struct rcu_data *))
990{
991 unsigned long bit;
992 int cpu;
993 unsigned long flags;
994 unsigned long mask;
995 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
996 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
997
998 for (; rnp_cur < rnp_end; rnp_cur++) {
999 mask = 0;
1000 spin_lock_irqsave(&rnp_cur->lock, flags);
1001 if (rsp->completed != lastcomp) {
1002 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1003 return 1;
1004 }
1005 if (rnp_cur->qsmask == 0) {
1006 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1007 continue;
1008 }
1009 cpu = rnp_cur->grplo;
1010 bit = 1;
1011 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
1012 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1013 mask |= bit;
1014 }
1015 if (mask != 0 && rsp->completed == lastcomp) {
1016
1017 /* cpu_quiet_msk() releases rnp_cur->lock. */
1018 cpu_quiet_msk(mask, rsp, rnp_cur, flags);
1019 continue;
1020 }
1021 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1022 }
1023 return 0;
1024}
1025
1026/*
1027 * Force quiescent states on reluctant CPUs, and also detect which
1028 * CPUs are in dyntick-idle mode.
1029 */
1030static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1031{
1032 unsigned long flags;
1033 long lastcomp;
1034 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
1035 struct rcu_node *rnp = rcu_get_root(rsp);
1036 u8 signaled;
1037
1038 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
1039 return; /* No grace period in progress, nothing to force. */
1040 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1041 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1042 return; /* Someone else is already on the job. */
1043 }
1044 if (relaxed &&
1045 (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
1046 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
1047 goto unlock_ret; /* no emergency and done recently. */
1048 rsp->n_force_qs++;
1049 spin_lock(&rnp->lock);
1050 lastcomp = rsp->completed;
1051 signaled = rsp->signaled;
1052 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1053 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
1054 RCU_JIFFIES_TILL_FORCE_QS;
1055 if (lastcomp == rsp->gpnum) {
1056 rsp->n_force_qs_ngp++;
1057 spin_unlock(&rnp->lock);
1058 goto unlock_ret; /* no GP in progress, time updated. */
1059 }
1060 spin_unlock(&rnp->lock);
1061 switch (signaled) {
1062 case RCU_GP_INIT:
1063
1064 break; /* grace period still initializing, ignore. */
1065
1066 case RCU_SAVE_DYNTICK:
1067
1068 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1069 break; /* So gcc recognizes the dead code. */
1070
1071 /* Record dyntick-idle state. */
1072 if (rcu_process_dyntick(rsp, lastcomp,
1073 dyntick_save_progress_counter))
1074 goto unlock_ret;
1075
1076 /* Update state, record completion counter. */
1077 spin_lock(&rnp->lock);
1078 if (lastcomp == rsp->completed) {
1079 rsp->signaled = RCU_FORCE_QS;
1080 dyntick_record_completed(rsp, lastcomp);
1081 }
1082 spin_unlock(&rnp->lock);
1083 break;
1084
1085 case RCU_FORCE_QS:
1086
1087 /* Check dyntick-idle state, send IPI to laggards. */
1088 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
1089 rcu_implicit_dynticks_qs))
1090 goto unlock_ret;
1091
1092 /* Leave state in case more forcing is required. */
1093
1094 break;
1095 }
1096unlock_ret:
1097 spin_unlock_irqrestore(&rsp->fqslock, flags);
1098}
1099
1100#else /* #ifdef CONFIG_SMP */
1101
1102static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1103{
1104 set_need_resched();
1105}
1106
1107#endif /* #else #ifdef CONFIG_SMP */
1108
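The switch in the SMP force_quiescent_state() above is driven by rsp->signaled, which rcu_start_gp() sets to RCU_GP_INIT and later to RCU_SIGNAL_INIT. The sketch below is a loose, user-space model of that progression only: the DEMO_* names and fqs_step() are invented, locking and the actual dyntick scans are omitted, and the assumption that the post-init state is the save-dyntick step (the CONFIG_NO_HZ case) is mine rather than something shown in this hunk.

#include <stdio.h>

enum fqs_state {
    DEMO_GP_IDLE,       /* stand-in for "no grace period in progress" */
    DEMO_GP_INIT,       /* grace period still being initialized */
    DEMO_SAVE_DYNTICK,  /* first pass: snapshot dyntick counters */
    DEMO_FORCE_QS,      /* later passes: nag CPUs that have not reported */
};

static enum fqs_state fqs_step(enum fqs_state s)
{
    switch (s) {
    case DEMO_GP_INIT:
        return s;                  /* too early, try again on a later pass */
    case DEMO_SAVE_DYNTICK:
        /* record dyntick-idle state for all leaf nodes ... */
        return DEMO_FORCE_QS;
    case DEMO_FORCE_QS:
        /* recheck dynticks, send resched IPIs to laggards ... */
        return DEMO_FORCE_QS;      /* stay here until the GP ends */
    default:
        return s;
    }
}

int main(void)
{
    enum fqs_state s = DEMO_SAVE_DYNTICK;

    s = fqs_step(s);
    printf("state after one pass: %d (DEMO_FORCE_QS)\n", s);
    return 0;
}
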
1109/*
1110 * This does the RCU processing work from softirq context for the
1111 * specified rcu_state and rcu_data structures. This may be called
1113 * only from the CPU to which the rdp belongs.
1113 */
1114static void
1115__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1116{
1117 unsigned long flags;
1118
1119 /*
1120 * If an RCU GP has gone long enough, go check for dyntick
1121 * idle CPUs and, if needed, send resched IPIs.
1122 */
1123 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1124 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1125 force_quiescent_state(rsp, 1);
1126
1127 /*
1128 * Advance callbacks in response to end of earlier grace
1129 * period that some other CPU ended.
1130 */
1131 rcu_process_gp_end(rsp, rdp);
1132
1133 /* Update RCU state based on any recent quiescent states. */
1134 rcu_check_quiescent_state(rsp, rdp);
1135
1136 /* Does this CPU require a not-yet-started grace period? */
1137 if (cpu_needs_another_gp(rsp, rdp)) {
1138 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1139 rcu_start_gp(rsp, flags); /* releases above lock */
1140 }
1141
1142 /* If there are callbacks ready, invoke them. */
1143 rcu_do_batch(rdp);
1144}
1145
1146/*
1147 * Do softirq processing for the current CPU.
1148 */
1149static void rcu_process_callbacks(struct softirq_action *unused)
1150{
1151 /*
1152 * Memory references from any prior RCU read-side critical sections
1153 * executed by the interrupted code must be seen before any RCU
1154 * grace-period manipulations below.
1155 */
1156 smp_mb(); /* See above block comment. */
1157
1158 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
1159 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1160
1161 /*
1162 * Memory references from any later RCU read-side critical sections
1163 * executed by the interrupted code must be seen after any RCU
1164 * grace-period manipulations above.
1165 */
1166 smp_mb(); /* See above block comment. */
1167}
1168
1169static void
1170__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1171 struct rcu_state *rsp)
1172{
1173 unsigned long flags;
1174 struct rcu_data *rdp;
1175
1176 head->func = func;
1177 head->next = NULL;
1178
1179 smp_mb(); /* Ensure RCU update seen before callback registry. */
1180
1181 /*
1182 * Opportunistically note grace-period endings and beginnings.
1183 * Note that we might see a beginning right after we see an
1184 * end, but never vice versa, since this CPU has to pass through
1185 * a quiescent state betweentimes.
1186 */
1187 local_irq_save(flags);
1188 rdp = rsp->rda[smp_processor_id()];
1189 rcu_process_gp_end(rsp, rdp);
1190 check_for_new_grace_period(rsp, rdp);
1191
1192 /* Add the callback to our list. */
1193 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1194 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1195
1196 /* Start a new grace period if one not already started. */
1197 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
1198 unsigned long nestflag;
1199 struct rcu_node *rnp_root = rcu_get_root(rsp);
1200
1201 spin_lock_irqsave(&rnp_root->lock, nestflag);
1202 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1203 }
1204
1205 /* Force the grace period if too many callbacks or too long waiting. */
1206 if (unlikely(++rdp->qlen > qhimark)) {
1207 rdp->blimit = LONG_MAX;
1208 force_quiescent_state(rsp, 0);
1209 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1210 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1211 force_quiescent_state(rsp, 1);
1212 local_irq_restore(flags);
1213}
1214
1215/*
1216 * Queue an RCU callback for invocation after a grace period.
1217 */
1218void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1219{
1220 __call_rcu(head, func, &rcu_state);
1221}
1222EXPORT_SYMBOL_GPL(call_rcu);
1223
1224/*
1225 * Queue an RCU callback for invocation after a quicker grace period.
1226 */
1227void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1228{
1229 __call_rcu(head, func, &rcu_bh_state);
1230}
1231EXPORT_SYMBOL_GPL(call_rcu_bh);
1232
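For reference, this is how client code typically consumes call_rcu(): embed an rcu_head in the protected object and free the object from the callback once readers are guaranteed to be gone. The sketch uses the call_rcu() signature shown above; struct demo_obj and the demo_* functions are invented, and the update-side step of unpublishing the object under the appropriate lock is only indicated by a comment.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_obj {
    int value;
    struct rcu_head rcu;
};

static void demo_obj_release(struct rcu_head *head)
{
    struct demo_obj *obj = container_of(head, struct demo_obj, rcu);

    kfree(obj);    /* no reader can still hold a reference here */
}

static void demo_obj_retire(struct demo_obj *obj)
{
    /* Unpublish obj from its RCU-protected structure first, then: */
    call_rcu(&obj->rcu, demo_obj_release);
}
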
1233/*
1234 * Check to see if there is any immediate RCU-related work to be done
1235 * by the current CPU, for the specified type of RCU, returning 1 if so.
1236 * The checks are in order of increasing expense: checks that can be
1237 * carried out against CPU-local state are performed first. However,
1238 * we must check for CPU stalls first, else we might not get a chance.
1239 */
1240static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1241{
1242 rdp->n_rcu_pending++;
1243
1244 /* Check for CPU stalls, if enabled. */
1245 check_cpu_stall(rsp, rdp);
1246
1247 /* Is the RCU core waiting for a quiescent state from this CPU? */
1248 if (rdp->qs_pending)
1249 return 1;
1250
1251 /* Does this CPU have callbacks ready to invoke? */
1252 if (cpu_has_callbacks_ready_to_invoke(rdp))
1253 return 1;
1254
1255 /* Has RCU gone idle with this CPU needing another grace period? */
1256 if (cpu_needs_another_gp(rsp, rdp))
1257 return 1;
1258
1259 /* Has another RCU grace period completed? */
1260 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
1261 return 1;
1262
1263 /* Has a new RCU grace period started? */
1264 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
1265 return 1;
1266
1267 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1268 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1269 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1270 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
1271 return 1;
1272
1273 /* nothing to do */
1274 return 0;
1275}
1276
1277/*
1278 * Check to see if there is any immediate RCU-related work to be done
1279 * by the current CPU, returning 1 if so. This function is part of the
1280 * RCU implementation; it is -not- an exported member of the RCU API.
1281 */
1282int rcu_pending(int cpu)
1283{
1284 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
1285 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
1286}
1287
1288/*
1289 * Check to see if any future RCU-related work will need to be done
1290 * by the current CPU, even if none need be done immediately, returning
1291 * 1 if so. This function is part of the RCU implementation; it is -not-
1292 * an exported member of the RCU API.
1293 */
1294int rcu_needs_cpu(int cpu)
1295{
1296 /* RCU callbacks either ready or pending? */
1297 return per_cpu(rcu_data, cpu).nxtlist ||
1298 per_cpu(rcu_bh_data, cpu).nxtlist;
1299}
1300
1301/*
1302 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth"
1303 * approach so that we don't have to worry about how long the CPU has
1304 * been gone, or whether it ever was online previously. We do trust the
1305 * ->mynode field, as it is constant for a given struct rcu_data and
1306 * initialized during early boot.
1307 *
1308 * Note that only one online or offline event can be happening at a given
1309 * time. Note also that we can accept some slop in the rsp->completed
1310 * access due to the fact that this CPU cannot possibly have any RCU
1311 * callbacks in flight yet.
1312 */
1313static void
1314rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1315{
1316 unsigned long flags;
1317 int i;
1318 long lastcomp;
1319 unsigned long mask;
1320 struct rcu_data *rdp = rsp->rda[cpu];
1321 struct rcu_node *rnp = rcu_get_root(rsp);
1322
1323 /* Set up local state, ensuring consistent view of global state. */
1324 spin_lock_irqsave(&rnp->lock, flags);
1325 lastcomp = rsp->completed;
1326 rdp->completed = lastcomp;
1327 rdp->gpnum = lastcomp;
1328 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1329 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1330 rdp->beenonline = 1; /* We have now been online. */
1331 rdp->passed_quiesc_completed = lastcomp - 1;
1332 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1333 rdp->nxtlist = NULL;
1334 for (i = 0; i < RCU_NEXT_SIZE; i++)
1335 rdp->nxttail[i] = &rdp->nxtlist;
1336 rdp->qlen = 0;
1337 rdp->blimit = blimit;
1338#ifdef CONFIG_NO_HZ
1339 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1340#endif /* #ifdef CONFIG_NO_HZ */
1341 rdp->cpu = cpu;
1342 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1343
1344 /*
1345 * A new grace period might start here. If so, we won't be part
1346 * of it, but that is OK, as we are currently in a quiescent state.
1347 */
1348
1349 /* Exclude any attempts to start a new GP on large systems. */
1350 spin_lock(&rsp->onofflock); /* irqs already disabled. */
1351
1352 /* Add CPU to rcu_node bitmasks. */
1353 rnp = rdp->mynode;
1354 mask = rdp->grpmask;
1355 do {
1356 /* Exclude any attempts to start a new GP on small systems. */
1357 spin_lock(&rnp->lock); /* irqs already disabled. */
1358 rnp->qsmaskinit |= mask;
1359 mask = rnp->grpmask;
1360 spin_unlock(&rnp->lock); /* irqs already disabled. */
1361 rnp = rnp->parent;
1362 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1363
1364 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1365
1366 /*
1367 * A new grace period might start here. If so, we will be part of
1368 * it, and its gpnum will be greater than ours, so we will
1369 * participate. It is also possible for the gpnum to have been
1370 * incremented before this function was called, and the bitmasks
1371 * to not be filled out until now, in which case we will also
1372 * participate due to our gpnum being behind.
1373 */
1374
1375 /* Since it is coming online, the CPU is in a quiescent state. */
1376 cpu_quiet(cpu, rsp, rdp, lastcomp);
1377 local_irq_restore(flags);
1378}
1379
1380static void __cpuinit rcu_online_cpu(int cpu)
1381{
1382#ifdef CONFIG_NO_HZ
1383 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1384
1385 rdtp->dynticks_nesting = 1;
1386 rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */
1387 rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
1388#endif /* #ifdef CONFIG_NO_HZ */
1389 rcu_init_percpu_data(cpu, &rcu_state);
1390 rcu_init_percpu_data(cpu, &rcu_bh_state);
1391 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1392}
1393
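The "need consecutive #s even for hotplug" comment above refers to the even/odd dynticks convention: the counter is kept odd while the CPU is active and even while it sits in dynticks-idle, so a later snapshot comparison can infer an idle sojourn, and hence an implicit quiescent state, without disturbing the idle CPU. The user-space sketch below is a simplified model of that idea with invented demo_* names; it is not the kernel's exact accounting, which also tracks NMI and irq nesting.

#include <stdio.h>

struct demo_dynticks {
    long dynticks;   /* odd: CPU active, even: CPU in dynticks-idle */
};

static void demo_enter_idle(struct demo_dynticks *d) { d->dynticks++; /* odd -> even */ }
static void demo_exit_idle(struct demo_dynticks *d)  { d->dynticks++; /* even -> odd */ }

/* Did the CPU pass through (or remain in) dynticks-idle since the snapshot? */
static int demo_idle_since(long snap, long cur)
{
    return (cur != snap) || ((cur & 1) == 0);
}

int main(void)
{
    struct demo_dynticks d = { .dynticks = 1 };  /* CPU online and active */
    long snap = d.dynticks;

    demo_enter_idle(&d);
    demo_exit_idle(&d);
    printf("idle since snapshot: %d\n", demo_idle_since(snap, d.dynticks));
    return 0;
}
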
1394/*
1395 * Handle CPU online/offline notification events.
1396 */
1397static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1398 unsigned long action, void *hcpu)
1399{
1400 long cpu = (long)hcpu;
1401
1402 switch (action) {
1403 case CPU_UP_PREPARE:
1404 case CPU_UP_PREPARE_FROZEN:
1405 rcu_online_cpu(cpu);
1406 break;
1407 case CPU_DEAD:
1408 case CPU_DEAD_FROZEN:
1409 case CPU_UP_CANCELED:
1410 case CPU_UP_CANCELED_FROZEN:
1411 rcu_offline_cpu(cpu);
1412 break;
1413 default:
1414 break;
1415 }
1416 return NOTIFY_OK;
1417}
1418
1419/*
1420 * Compute the per-level fanout, either using the exact fanout specified
1421 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1422 */
1423#ifdef CONFIG_RCU_FANOUT_EXACT
1424static void __init rcu_init_levelspread(struct rcu_state *rsp)
1425{
1426 int i;
1427
1428 for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
1429 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1430}
1431#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1432static void __init rcu_init_levelspread(struct rcu_state *rsp)
1433{
1434 int ccur;
1435 int cprv;
1436 int i;
1437
1438 cprv = NR_CPUS;
1439 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1440 ccur = rsp->levelcnt[i];
1441 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
1442 cprv = ccur;
1443 }
1444}
1445#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
1446
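A worked example of the balanced computation above: for a hypothetical 72-CPU machine with a two-level tree (one root, two leaf nodes), the loop yields a spread of 36 CPUs per leaf and 2 children under the root. The user-space sketch below simply re-runs that arithmetic; the demo names and the assumed geometry are invented for illustration.

#include <stdio.h>

#define DEMO_LVLS 2

int main(void)
{
    int levelcnt[DEMO_LVLS] = { 1, 2 };   /* nodes per level, root first */
    int levelspread[DEMO_LVLS];
    int cprv = 72;                        /* stand-in for NR_CPUS */
    int ccur, i;

    for (i = DEMO_LVLS - 1; i >= 0; i--) {
        ccur = levelcnt[i];
        levelspread[i] = (cprv + ccur - 1) / ccur;  /* ceiling division */
        cprv = ccur;
    }

    /* Prints: level 0 spread 2, level 1 spread 36 */
    for (i = 0; i < DEMO_LVLS; i++)
        printf("level %d spread %d\n", i, levelspread[i]);
    return 0;
}
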
1447/*
1448 * Helper function for rcu_init() that initializes one rcu_state structure.
1449 */
1450static void __init rcu_init_one(struct rcu_state *rsp)
1451{
1452 int cpustride = 1;
1453 int i;
1454 int j;
1455 struct rcu_node *rnp;
1456
1457 /* Initialize the level-tracking arrays. */
1458
1459 for (i = 1; i < NUM_RCU_LVLS; i++)
1460 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
1461 rcu_init_levelspread(rsp);
1462
1463 /* Initialize the elements themselves, starting from the leaves. */
1464
1465 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1466 cpustride *= rsp->levelspread[i];
1467 rnp = rsp->level[i];
1468 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1469 spin_lock_init(&rnp->lock);
1470 rnp->qsmask = 0;
1471 rnp->qsmaskinit = 0;
1472 rnp->grplo = j * cpustride;
1473 rnp->grphi = (j + 1) * cpustride - 1;
1474 if (rnp->grphi >= NR_CPUS)
1475 rnp->grphi = NR_CPUS - 1;
1476 if (i == 0) {
1477 rnp->grpnum = 0;
1478 rnp->grpmask = 0;
1479 rnp->parent = NULL;
1480 } else {
1481 rnp->grpnum = j % rsp->levelspread[i - 1];
1482 rnp->grpmask = 1UL << rnp->grpnum;
1483 rnp->parent = rsp->level[i - 1] +
1484 j / rsp->levelspread[i - 1];
1485 }
1486 rnp->level = i;
1487 }
1488 }
1489}
1490
1491/*
1492 * Helper macro for __rcu_init(). To be used nowhere else!
1493 * Assigns leaf node pointers into each CPU's rcu_data structure.
1494 */
1495#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
1496do { \
1497 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1498 j = 0; \
1499 for_each_possible_cpu(i) { \
1500 if (i > rnp[j].grphi) \
1501 j++; \
1502 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1503 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1504 } \
1505} while (0)
1506
1507static struct notifier_block __cpuinitdata rcu_nb = {
1508 .notifier_call = rcu_cpu_notify,
1509};
1510
1511void __init __rcu_init(void)
1512{
1513 int i; /* All used by RCU_DATA_PTR_INIT(). */
1514 int j;
1515 struct rcu_node *rnp;
1516
1517 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
1518#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1519 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1520#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1521 rcu_init_one(&rcu_state);
1522 RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
1523 rcu_init_one(&rcu_bh_state);
1524 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
1525
1526 for_each_online_cpu(i)
1527 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1528 /* Register notifier for non-boot CPUs */
1529 register_cpu_notifier(&rcu_nb);
1530 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1531}
1532
1533module_param(blimit, int, 0);
1534module_param(qhimark, int, 0);
1535module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 000000000000..d6db3e837826
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,271 @@
1/*
2 * Read-Copy Update tracing for hierarchical implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/debugfs.h>
44#include <linux/seq_file.h>
45
46static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
47{
48 if (!rdp->beenonline)
49 return;
50 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
51 rdp->cpu,
52 cpu_is_offline(rdp->cpu) ? '!' : ' ',
53 rdp->completed, rdp->gpnum,
54 rdp->passed_quiesc, rdp->passed_quiesc_completed,
55 rdp->qs_pending,
56 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
57 (int)(rdp->n_rcu_pending & 0xffff));
58#ifdef CONFIG_NO_HZ
59 seq_printf(m, " dt=%d/%d dn=%d df=%lu",
60 rdp->dynticks->dynticks,
61 rdp->dynticks->dynticks_nesting,
62 rdp->dynticks->dynticks_nmi,
63 rdp->dynticks_fqs);
64#endif /* #ifdef CONFIG_NO_HZ */
65 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
66 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
67}
68
69#define PRINT_RCU_DATA(name, func, m) \
70 do { \
71 int _p_r_d_i; \
72 \
73 for_each_possible_cpu(_p_r_d_i) \
74 func(m, &per_cpu(name, _p_r_d_i)); \
75 } while (0)
76
77static int show_rcudata(struct seq_file *m, void *unused)
78{
79 seq_puts(m, "rcu:\n");
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0;
84}
85
86static int rcudata_open(struct inode *inode, struct file *file)
87{
88 return single_open(file, show_rcudata, NULL);
89}
90
91static struct file_operations rcudata_fops = {
92 .owner = THIS_MODULE,
93 .open = rcudata_open,
94 .read = seq_read,
95 .llseek = seq_lseek,
96 .release = single_release,
97};
98
99static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
100{
101 if (!rdp->beenonline)
102 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
104 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
106 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending,
109 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
110 rdp->n_rcu_pending);
111#ifdef CONFIG_NO_HZ
112 seq_printf(m, ",%d,%d,%d,%lu",
113 rdp->dynticks->dynticks,
114 rdp->dynticks->dynticks_nesting,
115 rdp->dynticks->dynticks_nmi,
116 rdp->dynticks_fqs);
117#endif /* #ifdef CONFIG_NO_HZ */
118 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
119 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
120}
121
122static int show_rcudata_csv(struct seq_file *m, void *unused)
123{
124 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"qp\",\"rpfq\",\"rp\",");
125#ifdef CONFIG_NO_HZ
126 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
127#endif /* #ifdef CONFIG_NO_HZ */
128 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
129 seq_puts(m, "\"rcu:\"\n");
130 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
131 seq_puts(m, "\"rcu_bh:\"\n");
132 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
133 return 0;
134}
135
136static int rcudata_csv_open(struct inode *inode, struct file *file)
137{
138 return single_open(file, show_rcudata_csv, NULL);
139}
140
141static struct file_operations rcudata_csv_fops = {
142 .owner = THIS_MODULE,
143 .open = rcudata_csv_open,
144 .read = seq_read,
145 .llseek = seq_lseek,
146 .release = single_release,
147};
148
149static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
150{
151 int level = 0;
152 struct rcu_node *rnp;
153
154 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
155 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
156 rsp->completed, rsp->gpnum, rsp->signaled,
157 (long)(rsp->jiffies_force_qs - jiffies),
158 (int)(jiffies & 0xffff),
159 rsp->n_force_qs, rsp->n_force_qs_ngp,
160 rsp->n_force_qs - rsp->n_force_qs_ngp,
161 rsp->n_force_qs_lh);
162 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
163 if (rnp->level != level) {
164 seq_puts(m, "\n");
165 level = rnp->level;
166 }
167 seq_printf(m, "%lx/%lx %d:%d ^%d ",
168 rnp->qsmask, rnp->qsmaskinit,
169 rnp->grplo, rnp->grphi, rnp->grpnum);
170 }
171 seq_puts(m, "\n");
172}
173
174static int show_rcuhier(struct seq_file *m, void *unused)
175{
176 seq_puts(m, "rcu:\n");
177 print_one_rcu_state(m, &rcu_state);
178 seq_puts(m, "rcu_bh:\n");
179 print_one_rcu_state(m, &rcu_bh_state);
180 return 0;
181}
182
183static int rcuhier_open(struct inode *inode, struct file *file)
184{
185 return single_open(file, show_rcuhier, NULL);
186}
187
188static struct file_operations rcuhier_fops = {
189 .owner = THIS_MODULE,
190 .open = rcuhier_open,
191 .read = seq_read,
192 .llseek = seq_lseek,
193 .release = single_release,
194};
195
196static int show_rcugp(struct seq_file *m, void *unused)
197{
198 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n",
199 rcu_state.completed, rcu_state.gpnum);
200 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
201 rcu_bh_state.completed, rcu_bh_state.gpnum);
202 return 0;
203}
204
205static int rcugp_open(struct inode *inode, struct file *file)
206{
207 return single_open(file, show_rcugp, NULL);
208}
209
210static struct file_operations rcugp_fops = {
211 .owner = THIS_MODULE,
212 .open = rcugp_open,
213 .read = seq_read,
214 .llseek = seq_lseek,
215 .release = single_release,
216};
217
218static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
219static int __init rcuclassic_trace_init(void)
220{
221 rcudir = debugfs_create_dir("rcu", NULL);
222 if (!rcudir)
223 goto out;
224
225 datadir = debugfs_create_file("rcudata", 0444, rcudir,
226 NULL, &rcudata_fops);
227 if (!datadir)
228 goto free_out;
229
230 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
231 NULL, &rcudata_csv_fops);
232 if (!datadir_csv)
233 goto free_out;
234
235 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
236 if (!gpdir)
237 goto free_out;
238
239 hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
240 NULL, &rcuhier_fops);
241 if (!hierdir)
242 goto free_out;
243 return 0;
244free_out:
245 if (datadir)
246 debugfs_remove(datadir);
247 if (datadir_csv)
248 debugfs_remove(datadir_csv);
249 if (gpdir)
250 debugfs_remove(gpdir);
251 debugfs_remove(rcudir);
252out:
253 return 1;
254}
255
256static void __exit rcuclassic_trace_cleanup(void)
257{
258 debugfs_remove(datadir);
259 debugfs_remove(datadir_csv);
260 debugfs_remove(gpdir);
261 debugfs_remove(hierdir);
262 debugfs_remove(rcudir);
263}
264
265
266module_init(rcuclassic_trace_init);
267module_exit(rcuclassic_trace_cleanup);
268
269MODULE_AUTHOR("Paul E. McKenney");
270MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
271MODULE_LICENSE("GPL");
diff --git a/kernel/resource.c b/kernel/resource.c
index 4337063663ef..e633106b12f6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -853,6 +853,15 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && 853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) 854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
855 continue; 855 continue;
856 /*
857 * if a resource is "BUSY", it's not a hardware resource
858 * but a driver mapping of such a resource; we don't want
859 * to warn for those; some drivers legitimately map only
860 * partial hardware resources. (example: vesafb)
861 */
862 if (p->flags & IORESOURCE_BUSY)
863 continue;
864
856 printk(KERN_WARNING "resource map sanity check conflict: " 865 printk(KERN_WARNING "resource map sanity check conflict: "
857 "0x%llx 0x%llx 0x%llx 0x%llx %s\n", 866 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
858 (unsigned long long)addr, 867 (unsigned long long)addr,
diff --git a/kernel/sched.c b/kernel/sched.c
index 748ff924a290..22aa9cab3fe5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4192,7 +4192,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4192 4192
4193 if (p == rq->idle) { 4193 if (p == rq->idle) {
4194 p->stime = cputime_add(p->stime, steal); 4194 p->stime = cputime_add(p->stime, steal);
4195 account_group_system_time(p, steal);
4196 if (atomic_read(&rq->nr_iowait) > 0) 4195 if (atomic_read(&rq->nr_iowait) > 0)
4197 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4196 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4198 else 4197 else
@@ -4328,7 +4327,7 @@ void __kprobes sub_preempt_count(int val)
4328 /* 4327 /*
4329 * Underflow? 4328 * Underflow?
4330 */ 4329 */
4331 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4330 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
4332 return; 4331 return;
4333 /* 4332 /*
4334 * Is the spinlock portion underflowing? 4333 * Is the spinlock portion underflowing?
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d69..466e75ce271a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -102,20 +102,6 @@ void local_bh_disable(void)
102 102
103EXPORT_SYMBOL(local_bh_disable); 103EXPORT_SYMBOL(local_bh_disable);
104 104
105void __local_bh_enable(void)
106{
107 WARN_ON_ONCE(in_irq());
108
109 /*
110 * softirqs should never be enabled by __local_bh_enable(),
111 * it always nests inside local_bh_enable() sections:
112 */
113 WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
114
115 sub_preempt_count(SOFTIRQ_OFFSET);
116}
117EXPORT_SYMBOL_GPL(__local_bh_enable);
118
119/* 105/*
120 * Special-case - softirqs can safely be enabled in 106 * Special-case - softirqs can safely be enabled in
121 * cond_resched_softirq(), or by __do_softirq(), 107 * cond_resched_softirq(), or by __do_softirq(),
@@ -269,6 +255,7 @@ void irq_enter(void)
269{ 255{
270 int cpu = smp_processor_id(); 256 int cpu = smp_processor_id();
271 257
258 rcu_irq_enter();
272 if (idle_cpu(cpu) && !in_interrupt()) { 259 if (idle_cpu(cpu) && !in_interrupt()) {
273 __irq_enter(); 260 __irq_enter();
274 tick_check_idle(cpu); 261 tick_check_idle(cpu);
@@ -295,9 +282,9 @@ void irq_exit(void)
295 282
296#ifdef CONFIG_NO_HZ 283#ifdef CONFIG_NO_HZ
297 /* Make sure that timer wheel updates are propagated */ 284 /* Make sure that timer wheel updates are propagated */
298 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
299 tick_nohz_stop_sched_tick(0);
300 rcu_irq_exit(); 285 rcu_irq_exit();
286 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
287 tick_nohz_stop_sched_tick(0);
301#endif 288#endif
302 preempt_enable_no_resched(); 289 preempt_enable_no_resched();
303} 290}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index dc0b3be6b7d5..1ab790c67b17 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
164/* 164/*
165 * Zero means infinite timeout - no checking done: 165 * Zero means infinite timeout - no checking done:
166 */ 166 */
167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
168 168
169unsigned long __read_mostly sysctl_hung_task_warnings = 10; 169unsigned long __read_mostly sysctl_hung_task_warnings = 10;
170 170
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 94b527ef1d1e..eb212f8f8bc8 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
11#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
@@ -24,3 +25,13 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
24} 25}
25EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
26 27
28/*
29 * Architectures that do not implement save_stack_trace_tsk get this
30 * weak alias and a once-per-bootup warning (whenever this facility
31 * is utilized - for example by procfs):
32 */
33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37}
diff --git a/kernel/sys.c b/kernel/sys.c
index ebe65c2c9873..d356d79e84ac 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -907,8 +907,8 @@ void do_sys_times(struct tms *tms)
907 struct task_cputime cputime; 907 struct task_cputime cputime;
908 cputime_t cutime, cstime; 908 cputime_t cutime, cstime;
909 909
910 spin_lock_irq(&current->sighand->siglock);
911 thread_group_cputime(current, &cputime); 910 thread_group_cputime(current, &cputime);
911 spin_lock_irq(&current->sighand->siglock);
912 cutime = current->signal->cutime; 912 cutime = current->signal->cutime;
913 cstime = current->signal->cstime; 913 cstime = current->signal->cstime;
914 spin_unlock_irq(&current->sighand->siglock); 914 spin_unlock_irq(&current->sighand->siglock);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b0f239e443bc..eae594cb6ea9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -252,6 +252,14 @@ config DEBUG_OBJECTS_TIMERS
252 timer routines to track the life time of timer objects and 252 timer routines to track the life time of timer objects and
253 validate the timer operations. 253 validate the timer operations.
254 254
255config DEBUG_OBJECTS_ENABLE_DEFAULT
256 int "debug_objects bootup default value (0-1)"
257 range 0 1
258 default "1"
259 depends on DEBUG_OBJECTS
260 help
261 Debug objects boot parameter default value
262
255config DEBUG_SLAB 263config DEBUG_SLAB
256 bool "Debug slab memory allocations" 264 bool "Debug slab memory allocations"
257 depends on DEBUG_KERNEL && SLAB 265 depends on DEBUG_KERNEL && SLAB
@@ -545,6 +553,16 @@ config DEBUG_SG
545 553
546 If unsure, say N. 554 If unsure, say N.
547 555
556config DEBUG_NOTIFIERS
557 bool "Debug notifier call chains"
558 depends on DEBUG_KERNEL
559 help
560 Enable this to turn on sanity checking for notifier call chains.
561 This is most useful for kernel developers to make sure that
562 modules properly unregister themselves from notifier chains.
563 This is a relatively cheap check but if you care about maximum
564 performance, say N.
565
548config FRAME_POINTER 566config FRAME_POINTER
549 bool "Compile the kernel with frame pointers" 567 bool "Compile the kernel with frame pointers"
550 depends on DEBUG_KERNEL && \ 568 depends on DEBUG_KERNEL && \
@@ -619,6 +637,19 @@ config RCU_CPU_STALL_DETECTOR
619 637
620 Say N if you are unsure. 638 Say N if you are unsure.
621 639
640config RCU_CPU_STALL_DETECTOR
641 bool "Check for stalled CPUs delaying RCU grace periods"
642 depends on CLASSIC_RCU || TREE_RCU
643 default n
644 help
645 This option causes RCU to printk information on which
646 CPUs are delaying the current grace period, but only when
647 the grace period extends for excessive time periods.
648
649 Say Y if you want RCU to perform such checks.
650
651 Say N if you are unsure.
652
622config KPROBES_SANITY_TEST 653config KPROBES_SANITY_TEST
623 bool "Kprobes sanity tests" 654 bool "Kprobes sanity tests"
624 depends on DEBUG_KERNEL 655 depends on DEBUG_KERNEL
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index e3ab374e1334..5d99be1fd988 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -45,7 +45,9 @@ static struct kmem_cache *obj_cache;
45static int debug_objects_maxchain __read_mostly; 45static int debug_objects_maxchain __read_mostly;
46static int debug_objects_fixups __read_mostly; 46static int debug_objects_fixups __read_mostly;
47static int debug_objects_warnings __read_mostly; 47static int debug_objects_warnings __read_mostly;
48static int debug_objects_enabled __read_mostly; 48static int debug_objects_enabled __read_mostly
49 = CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT;
50
49static struct debug_obj_descr *descr_test __read_mostly; 51static struct debug_obj_descr *descr_test __read_mostly;
50 52
51static int __init enable_object_debug(char *str) 53static int __init enable_object_debug(char *str)
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 5f6c629a924d..fa2dc4e5f9ba 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -21,9 +21,12 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/spinlock.h> 23#include <linux/spinlock.h>
24#include <linux/swiotlb.h>
24#include <linux/string.h> 25#include <linux/string.h>
26#include <linux/swiotlb.h>
25#include <linux/types.h> 27#include <linux/types.h>
26#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/highmem.h>
27 30
28#include <asm/io.h> 31#include <asm/io.h>
29#include <asm/dma.h> 32#include <asm/dma.h>
@@ -36,22 +39,6 @@
36#define OFFSET(val,align) ((unsigned long) \ 39#define OFFSET(val,align) ((unsigned long) \
37 ( (val) & ( (align) - 1))) 40 ( (val) & ( (align) - 1)))
38 41
39#define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
40#define SG_ENT_PHYS_ADDRESS(sg) virt_to_bus(SG_ENT_VIRT_ADDRESS(sg))
41
42/*
43 * Maximum allowable number of contiguous slabs to map,
44 * must be a power of 2. What is the appropriate value ?
45 * The complexity of {map,unmap}_single is linearly dependent on this value.
46 */
47#define IO_TLB_SEGSIZE 128
48
49/*
50 * log of the size of each IO TLB slab. The number of slabs is command line
51 * controllable.
52 */
53#define IO_TLB_SHIFT 11
54
55#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) 42#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
56 43
57/* 44/*
@@ -102,7 +89,10 @@ static unsigned int io_tlb_index;
102 * We need to save away the original address corresponding to a mapped entry 89 * We need to save away the original address corresponding to a mapped entry
103 * for the sync operations. 90 * for the sync operations.
104 */ 91 */
105static unsigned char **io_tlb_orig_addr; 92static struct swiotlb_phys_addr {
93 struct page *page;
94 unsigned int offset;
95} *io_tlb_orig_addr;
106 96
107/* 97/*
108 * Protect the above data structures in the map and unmap calls 98 * Protect the above data structures in the map and unmap calls
@@ -126,6 +116,72 @@ setup_io_tlb_npages(char *str)
126__setup("swiotlb=", setup_io_tlb_npages); 116__setup("swiotlb=", setup_io_tlb_npages);
127/* make io_tlb_overflow tunable too? */ 117/* make io_tlb_overflow tunable too? */
128 118
119void * __weak swiotlb_alloc_boot(size_t size, unsigned long nslabs)
120{
121 return alloc_bootmem_low_pages(size);
122}
123
124void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs)
125{
126 return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
127}
128
129dma_addr_t __weak swiotlb_phys_to_bus(phys_addr_t paddr)
130{
131 return paddr;
132}
133
134phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr)
135{
136 return baddr;
137}
138
139static dma_addr_t swiotlb_virt_to_bus(volatile void *address)
140{
141 return swiotlb_phys_to_bus(virt_to_phys(address));
142}
143
144static void *swiotlb_bus_to_virt(dma_addr_t address)
145{
146 return phys_to_virt(swiotlb_bus_to_phys(address));
147}
148
149int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
150{
151 return 0;
152}
153
154static dma_addr_t swiotlb_sg_to_bus(struct scatterlist *sg)
155{
156 return swiotlb_phys_to_bus(page_to_phys(sg_page(sg)) + sg->offset);
157}
158
159static void swiotlb_print_info(unsigned long bytes)
160{
161 phys_addr_t pstart, pend;
162 dma_addr_t bstart, bend;
163
164 pstart = virt_to_phys(io_tlb_start);
165 pend = virt_to_phys(io_tlb_end);
166
167 bstart = swiotlb_phys_to_bus(pstart);
168 bend = swiotlb_phys_to_bus(pend);
169
170 printk(KERN_INFO "Placing %luMB software IO TLB between %p - %p\n",
171 bytes >> 20, io_tlb_start, io_tlb_end);
172 if (pstart != bstart || pend != bend)
173 printk(KERN_INFO "software IO TLB at phys %#llx - %#llx"
174 " bus %#llx - %#llx\n",
175 (unsigned long long)pstart,
176 (unsigned long long)pend,
177 (unsigned long long)bstart,
178 (unsigned long long)bend);
179 else
180 printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n",
181 (unsigned long long)pstart,
182 (unsigned long long)pend);
183}
184
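As a sketch of how the new __weak hooks might be consumed, an architecture whose devices see RAM at a constant bus offset could supply strong definitions like the ones below; they override the weak defaults at link time. This is illustrative only: DEMO_BUS_OFFSET is an invented value, and the real overrides this series enables implement their own translation rules.

#include <linux/swiotlb.h>
#include <linux/types.h>

#define DEMO_BUS_OFFSET 0x80000000ULL

dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr)
{
    return paddr + DEMO_BUS_OFFSET;   /* CPU physical -> device bus address */
}

phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
{
    return baddr - DEMO_BUS_OFFSET;   /* device bus address -> CPU physical */
}
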
129/* 185/*
130 * Statically reserve bounce buffer space and initialize bounce buffer data 186 * Statically reserve bounce buffer space and initialize bounce buffer data
131 * structures for the software IO TLB used to implement the DMA API. 187 * structures for the software IO TLB used to implement the DMA API.
@@ -145,7 +201,7 @@ swiotlb_init_with_default_size(size_t default_size)
145 /* 201 /*
146 * Get IO TLB memory from the low pages 202 * Get IO TLB memory from the low pages
147 */ 203 */
148 io_tlb_start = alloc_bootmem_low_pages(bytes); 204 io_tlb_start = swiotlb_alloc_boot(bytes, io_tlb_nslabs);
149 if (!io_tlb_start) 205 if (!io_tlb_start)
150 panic("Cannot allocate SWIOTLB buffer"); 206 panic("Cannot allocate SWIOTLB buffer");
151 io_tlb_end = io_tlb_start + bytes; 207 io_tlb_end = io_tlb_start + bytes;
@@ -159,7 +215,7 @@ swiotlb_init_with_default_size(size_t default_size)
159 for (i = 0; i < io_tlb_nslabs; i++) 215 for (i = 0; i < io_tlb_nslabs; i++)
160 io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); 216 io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
161 io_tlb_index = 0; 217 io_tlb_index = 0;
162 io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(char *)); 218 io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(struct swiotlb_phys_addr));
163 219
164 /* 220 /*
165 * Get the overflow emergency buffer 221 * Get the overflow emergency buffer
@@ -168,8 +224,7 @@ swiotlb_init_with_default_size(size_t default_size)
168 if (!io_tlb_overflow_buffer) 224 if (!io_tlb_overflow_buffer)
169 panic("Cannot allocate SWIOTLB overflow buffer!\n"); 225 panic("Cannot allocate SWIOTLB overflow buffer!\n");
170 226
171 printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n", 227 swiotlb_print_info(bytes);
172 virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end));
173} 228}
174 229
175void __init 230void __init
@@ -202,8 +257,7 @@ swiotlb_late_init_with_default_size(size_t default_size)
202 bytes = io_tlb_nslabs << IO_TLB_SHIFT; 257 bytes = io_tlb_nslabs << IO_TLB_SHIFT;
203 258
204 while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { 259 while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
205 io_tlb_start = (char *)__get_free_pages(GFP_DMA | __GFP_NOWARN, 260 io_tlb_start = swiotlb_alloc(order, io_tlb_nslabs);
206 order);
207 if (io_tlb_start) 261 if (io_tlb_start)
208 break; 262 break;
209 order--; 263 order--;
@@ -235,12 +289,12 @@ swiotlb_late_init_with_default_size(size_t default_size)
235 io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); 289 io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
236 io_tlb_index = 0; 290 io_tlb_index = 0;
237 291
238 io_tlb_orig_addr = (unsigned char **)__get_free_pages(GFP_KERNEL, 292 io_tlb_orig_addr = (struct swiotlb_phys_addr *)__get_free_pages(GFP_KERNEL,
239 get_order(io_tlb_nslabs * sizeof(char *))); 293 get_order(io_tlb_nslabs * sizeof(struct swiotlb_phys_addr)));
240 if (!io_tlb_orig_addr) 294 if (!io_tlb_orig_addr)
241 goto cleanup3; 295 goto cleanup3;
242 296
243 memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(char *)); 297 memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(struct swiotlb_phys_addr));
244 298
245 /* 299 /*
246 * Get the overflow emergency buffer 300 * Get the overflow emergency buffer
@@ -250,9 +304,7 @@ swiotlb_late_init_with_default_size(size_t default_size)
250 if (!io_tlb_overflow_buffer) 304 if (!io_tlb_overflow_buffer)
251 goto cleanup4; 305 goto cleanup4;
252 306
253 printk(KERN_INFO "Placing %luMB software IO TLB between 0x%lx - " 307 swiotlb_print_info(bytes);
254 "0x%lx\n", bytes >> 20,
255 virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end));
256 308
257 return 0; 309 return 0;
258 310
@@ -279,16 +331,69 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size)
279 return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); 331 return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
280} 332}
281 333
334static inline int range_needs_mapping(void *ptr, size_t size)
335{
336 return swiotlb_force || swiotlb_arch_range_needs_mapping(ptr, size);
337}
338
282static int is_swiotlb_buffer(char *addr) 339static int is_swiotlb_buffer(char *addr)
283{ 340{
284 return addr >= io_tlb_start && addr < io_tlb_end; 341 return addr >= io_tlb_start && addr < io_tlb_end;
285} 342}
286 343
344static struct swiotlb_phys_addr swiotlb_bus_to_phys_addr(char *dma_addr)
345{
346 int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
347 struct swiotlb_phys_addr buffer = io_tlb_orig_addr[index];
348 buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1);
349 buffer.page += buffer.offset >> PAGE_SHIFT;
350 buffer.offset &= PAGE_SIZE - 1;
351 return buffer;
352}
353
354static void
355__sync_single(struct swiotlb_phys_addr buffer, char *dma_addr, size_t size, int dir)
356{
357 if (PageHighMem(buffer.page)) {
358 size_t len, bytes;
359 char *dev, *host, *kmp;
360
361 len = size;
362 while (len != 0) {
363 unsigned long flags;
364
365 bytes = len;
366 if ((bytes + buffer.offset) > PAGE_SIZE)
367 bytes = PAGE_SIZE - buffer.offset;
368 local_irq_save(flags); /* protects KM_BOUNCE_READ */
369 kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ);
370 dev = dma_addr + size - len;
371 host = kmp + buffer.offset;
372 if (dir == DMA_FROM_DEVICE)
373 memcpy(host, dev, bytes);
374 else
375 memcpy(dev, host, bytes);
376 kunmap_atomic(kmp, KM_BOUNCE_READ);
377 local_irq_restore(flags);
378 len -= bytes;
379 buffer.page++;
380 buffer.offset = 0;
381 }
382 } else {
383 void *v = page_address(buffer.page) + buffer.offset;
384
385 if (dir == DMA_TO_DEVICE)
386 memcpy(dma_addr, v, size);
387 else
388 memcpy(v, dma_addr, size);
389 }
390}
391
287/* 392/*
288 * Allocates bounce buffer and returns its kernel virtual address. 393 * Allocates bounce buffer and returns its kernel virtual address.
289 */ 394 */
290static void * 395static void *
291map_single(struct device *hwdev, char *buffer, size_t size, int dir) 396map_single(struct device *hwdev, struct swiotlb_phys_addr buffer, size_t size, int dir)
292{ 397{
293 unsigned long flags; 398 unsigned long flags;
294 char *dma_addr; 399 char *dma_addr;
@@ -298,11 +403,16 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
298 unsigned long mask; 403 unsigned long mask;
299 unsigned long offset_slots; 404 unsigned long offset_slots;
300 unsigned long max_slots; 405 unsigned long max_slots;
406 struct swiotlb_phys_addr slot_buf;
301 407
302 mask = dma_get_seg_boundary(hwdev); 408 mask = dma_get_seg_boundary(hwdev);
303 start_dma_addr = virt_to_bus(io_tlb_start) & mask; 409 start_dma_addr = swiotlb_virt_to_bus(io_tlb_start) & mask;
304 410
305 offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; 411 offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
412
413 /*
414 * Carefully handle integer overflow which can occur when mask == ~0UL.
415 */
306 max_slots = mask + 1 416 max_slots = mask + 1
307 ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT 417 ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
308 : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); 418 : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
@@ -378,10 +488,15 @@ found:
378 * This is needed when we sync the memory. Then we sync the buffer if 488 * This is needed when we sync the memory. Then we sync the buffer if
379 * needed. 489 * needed.
380 */ 490 */
381 for (i = 0; i < nslots; i++) 491 slot_buf = buffer;
382 io_tlb_orig_addr[index+i] = buffer + (i << IO_TLB_SHIFT); 492 for (i = 0; i < nslots; i++) {
493 slot_buf.page += slot_buf.offset >> PAGE_SHIFT;
494 slot_buf.offset &= PAGE_SIZE - 1;
495 io_tlb_orig_addr[index+i] = slot_buf;
496 slot_buf.offset += 1 << IO_TLB_SHIFT;
497 }
383 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) 498 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
384 memcpy(dma_addr, buffer, size); 499 __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
385 500
386 return dma_addr; 501 return dma_addr;
387} 502}
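
[map_single() now records one (page, offset) pair per bounce slab instead of a char * per slab. The slot_buf loop normalizes the offset before storing each entry and then advances it by one slab (1 << IO_TLB_SHIFT), so later lookups always see an offset below PAGE_SIZE. A userspace sketch of that normalization, with made-up starting values.]

#include <stdio.h>

#define IO_TLB_SHIFT	11
#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long page = 0;		/* stand-in for slot_buf.page */
	unsigned long offset = 3000;	/* original buffer starts 3000 bytes into its page */
	int nslots = 4, i;

	for (i = 0; i < nslots; i++) {
		page   += offset >> PAGE_SHIFT;	/* normalize: keep offset < PAGE_SIZE */
		offset &= PAGE_SIZE - 1;
		printf("slab %d -> page +%lu, offset %lu\n", i, page, offset);
		offset += 1 << IO_TLB_SHIFT;	/* next slab starts 2 KiB further in */
	}
	return 0;
}
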
@@ -395,17 +510,17 @@ unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
395 unsigned long flags; 510 unsigned long flags;
396 int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; 511 int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
397 int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; 512 int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
398 char *buffer = io_tlb_orig_addr[index]; 513 struct swiotlb_phys_addr buffer = swiotlb_bus_to_phys_addr(dma_addr);
399 514
400 /* 515 /*
401 * First, sync the memory before unmapping the entry 516 * First, sync the memory before unmapping the entry
402 */ 517 */
403 if (buffer && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) 518 if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
404 /* 519 /*
405 * bounce... copy the data back into the original buffer * and 520 * bounce... copy the data back into the original buffer * and
405 * bounce... copy the data back into the original buffer and 520 * bounce... copy the data back into the original buffer and
406 * delete the bounce buffer. 521 * delete the bounce buffer.
407 */ 522 */
408 memcpy(buffer, dma_addr, size); 523 __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
409 524
410 /* 525 /*
411 * Return the buffer to the free list by setting the corresponding 526 * Return the buffer to the free list by setting the corresponding
@@ -437,21 +552,18 @@ static void
437sync_single(struct device *hwdev, char *dma_addr, size_t size, 552sync_single(struct device *hwdev, char *dma_addr, size_t size,
438 int dir, int target) 553 int dir, int target)
439{ 554{
440 int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; 555 struct swiotlb_phys_addr buffer = swiotlb_bus_to_phys_addr(dma_addr);
441 char *buffer = io_tlb_orig_addr[index];
442
443 buffer += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
444 556
445 switch (target) { 557 switch (target) {
446 case SYNC_FOR_CPU: 558 case SYNC_FOR_CPU:
447 if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) 559 if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
448 memcpy(buffer, dma_addr, size); 560 __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
449 else 561 else
450 BUG_ON(dir != DMA_TO_DEVICE); 562 BUG_ON(dir != DMA_TO_DEVICE);
451 break; 563 break;
452 case SYNC_FOR_DEVICE: 564 case SYNC_FOR_DEVICE:
453 if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) 565 if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
454 memcpy(dma_addr, buffer, size); 566 __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
455 else 567 else
456 BUG_ON(dir != DMA_FROM_DEVICE); 568 BUG_ON(dir != DMA_FROM_DEVICE);
457 break; 569 break;
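
[The sync paths now call __sync_single() instead of memcpy(), but the direction logic is unchanged: SYNC_FOR_CPU on a DMA_FROM_DEVICE/BIDIRECTIONAL mapping copies the bounce buffer back to the original pages, while SYNC_FOR_DEVICE on a DMA_TO_DEVICE/BIDIRECTIONAL mapping pushes fresh CPU data out to the bounce buffer. A tiny userspace restatement of that mapping, with shortened names.]

#include <stdio.h>

enum dir    { TO_DEVICE, FROM_DEVICE, BIDIRECTIONAL };
enum target { SYNC_FOR_CPU, SYNC_FOR_DEVICE };

/* Which way does the bounce copy go?  Mirrors sync_single()'s switch. */
static const char *bounce_copy(enum target t, enum dir d)
{
	if (t == SYNC_FOR_CPU && (d == FROM_DEVICE || d == BIDIRECTIONAL))
		return "bounce buffer -> original pages";
	if (t == SYNC_FOR_DEVICE && (d == TO_DEVICE || d == BIDIRECTIONAL))
		return "original pages -> bounce buffer";
	return "no copy for this target/direction combination";
}

int main(void)
{
	printf("sync_for_cpu,    from_device: %s\n", bounce_copy(SYNC_FOR_CPU, FROM_DEVICE));
	printf("sync_for_device, to_device:   %s\n", bounce_copy(SYNC_FOR_DEVICE, TO_DEVICE));
	printf("sync_for_cpu,    to_device:   %s\n", bounce_copy(SYNC_FOR_CPU, TO_DEVICE));
	return 0;
}
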
@@ -473,7 +585,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
473 dma_mask = hwdev->coherent_dma_mask; 585 dma_mask = hwdev->coherent_dma_mask;
474 586
475 ret = (void *)__get_free_pages(flags, order); 587 ret = (void *)__get_free_pages(flags, order);
476 if (ret && !is_buffer_dma_capable(dma_mask, virt_to_bus(ret), size)) { 588 if (ret && !is_buffer_dma_capable(dma_mask, swiotlb_virt_to_bus(ret), size)) {
477 /* 589 /*
478 * The allocated memory isn't reachable by the device. 590 * The allocated memory isn't reachable by the device.
479 * Fall back on swiotlb_map_single(). 591 * Fall back on swiotlb_map_single().
@@ -488,13 +600,16 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
488 * swiotlb_map_single(), which will grab memory from 600 * swiotlb_map_single(), which will grab memory from
489 * the lowest available address range. 601 * the lowest available address range.
490 */ 602 */
491 ret = map_single(hwdev, NULL, size, DMA_FROM_DEVICE); 603 struct swiotlb_phys_addr buffer;
604 buffer.page = virt_to_page(NULL);
605 buffer.offset = 0;
606 ret = map_single(hwdev, buffer, size, DMA_FROM_DEVICE);
492 if (!ret) 607 if (!ret)
493 return NULL; 608 return NULL;
494 } 609 }
495 610
496 memset(ret, 0, size); 611 memset(ret, 0, size);
497 dev_addr = virt_to_bus(ret); 612 dev_addr = swiotlb_virt_to_bus(ret);
498 613
499 /* Confirm address can be DMA'd by device */ 614 /* Confirm address can be DMA'd by device */
500 if (!is_buffer_dma_capable(dma_mask, dev_addr, size)) { 615 if (!is_buffer_dma_capable(dma_mask, dev_addr, size)) {
@@ -554,8 +669,9 @@ dma_addr_t
554swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size, 669swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
555 int dir, struct dma_attrs *attrs) 670 int dir, struct dma_attrs *attrs)
556{ 671{
557 dma_addr_t dev_addr = virt_to_bus(ptr); 672 dma_addr_t dev_addr = swiotlb_virt_to_bus(ptr);
558 void *map; 673 void *map;
674 struct swiotlb_phys_addr buffer;
559 675
560 BUG_ON(dir == DMA_NONE); 676 BUG_ON(dir == DMA_NONE);
561 /* 677 /*
@@ -563,19 +679,22 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
563 * we can safely return the device addr and not worry about bounce 679 * we can safely return the device addr and not worry about bounce
564 * buffering it. 680 * buffering it.
565 */ 681 */
566 if (!address_needs_mapping(hwdev, dev_addr, size) && !swiotlb_force) 682 if (!address_needs_mapping(hwdev, dev_addr, size) &&
683 !range_needs_mapping(ptr, size))
567 return dev_addr; 684 return dev_addr;
568 685
569 /* 686 /*
570 * Oh well, have to allocate and map a bounce buffer. 687 * Oh well, have to allocate and map a bounce buffer.
571 */ 688 */
572 map = map_single(hwdev, ptr, size, dir); 689 buffer.page = virt_to_page(ptr);
690 buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
691 map = map_single(hwdev, buffer, size, dir);
573 if (!map) { 692 if (!map) {
574 swiotlb_full(hwdev, size, dir, 1); 693 swiotlb_full(hwdev, size, dir, 1);
575 map = io_tlb_overflow_buffer; 694 map = io_tlb_overflow_buffer;
576 } 695 }
577 696
578 dev_addr = virt_to_bus(map); 697 dev_addr = swiotlb_virt_to_bus(map);
579 698
580 /* 699 /*
581 * Ensure that the address returned is DMA'ble 700 * Ensure that the address returned is DMA'ble
@@ -605,7 +724,7 @@ void
605swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, 724swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
606 size_t size, int dir, struct dma_attrs *attrs) 725 size_t size, int dir, struct dma_attrs *attrs)
607{ 726{
608 char *dma_addr = bus_to_virt(dev_addr); 727 char *dma_addr = swiotlb_bus_to_virt(dev_addr);
609 728
610 BUG_ON(dir == DMA_NONE); 729 BUG_ON(dir == DMA_NONE);
611 if (is_swiotlb_buffer(dma_addr)) 730 if (is_swiotlb_buffer(dma_addr))
@@ -635,7 +754,7 @@ static void
635swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, 754swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
636 size_t size, int dir, int target) 755 size_t size, int dir, int target)
637{ 756{
638 char *dma_addr = bus_to_virt(dev_addr); 757 char *dma_addr = swiotlb_bus_to_virt(dev_addr);
639 758
640 BUG_ON(dir == DMA_NONE); 759 BUG_ON(dir == DMA_NONE);
641 if (is_swiotlb_buffer(dma_addr)) 760 if (is_swiotlb_buffer(dma_addr))
@@ -666,7 +785,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
666 unsigned long offset, size_t size, 785 unsigned long offset, size_t size,
667 int dir, int target) 786 int dir, int target)
668{ 787{
669 char *dma_addr = bus_to_virt(dev_addr) + offset; 788 char *dma_addr = swiotlb_bus_to_virt(dev_addr) + offset;
670 789
671 BUG_ON(dir == DMA_NONE); 790 BUG_ON(dir == DMA_NONE);
672 if (is_swiotlb_buffer(dma_addr)) 791 if (is_swiotlb_buffer(dma_addr))
@@ -714,18 +833,20 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
714 int dir, struct dma_attrs *attrs) 833 int dir, struct dma_attrs *attrs)
715{ 834{
716 struct scatterlist *sg; 835 struct scatterlist *sg;
717 void *addr; 836 struct swiotlb_phys_addr buffer;
718 dma_addr_t dev_addr; 837 dma_addr_t dev_addr;
719 int i; 838 int i;
720 839
721 BUG_ON(dir == DMA_NONE); 840 BUG_ON(dir == DMA_NONE);
722 841
723 for_each_sg(sgl, sg, nelems, i) { 842 for_each_sg(sgl, sg, nelems, i) {
724 addr = SG_ENT_VIRT_ADDRESS(sg); 843 dev_addr = swiotlb_sg_to_bus(sg);
725 dev_addr = virt_to_bus(addr); 844 if (range_needs_mapping(sg_virt(sg), sg->length) ||
726 if (swiotlb_force ||
727 address_needs_mapping(hwdev, dev_addr, sg->length)) { 845 address_needs_mapping(hwdev, dev_addr, sg->length)) {
728 void *map = map_single(hwdev, addr, sg->length, dir); 846 void *map;
847 buffer.page = sg_page(sg);
848 buffer.offset = sg->offset;
849 map = map_single(hwdev, buffer, sg->length, dir);
729 if (!map) { 850 if (!map) {
730 /* Don't panic here, we expect map_sg users 851 /* Don't panic here, we expect map_sg users
731 to do proper error handling. */ 852 to do proper error handling. */
@@ -735,7 +856,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
735 sgl[0].dma_length = 0; 856 sgl[0].dma_length = 0;
736 return 0; 857 return 0;
737 } 858 }
738 sg->dma_address = virt_to_bus(map); 859 sg->dma_address = swiotlb_virt_to_bus(map);
739 } else 860 } else
740 sg->dma_address = dev_addr; 861 sg->dma_address = dev_addr;
741 sg->dma_length = sg->length; 862 sg->dma_length = sg->length;
@@ -765,11 +886,11 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
765 BUG_ON(dir == DMA_NONE); 886 BUG_ON(dir == DMA_NONE);
766 887
767 for_each_sg(sgl, sg, nelems, i) { 888 for_each_sg(sgl, sg, nelems, i) {
768 if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) 889 if (sg->dma_address != swiotlb_sg_to_bus(sg))
769 unmap_single(hwdev, bus_to_virt(sg->dma_address), 890 unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
770 sg->dma_length, dir); 891 sg->dma_length, dir);
771 else if (dir == DMA_FROM_DEVICE) 892 else if (dir == DMA_FROM_DEVICE)
772 dma_mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length); 893 dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
773 } 894 }
774} 895}
775EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); 896EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
@@ -798,11 +919,11 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
798 BUG_ON(dir == DMA_NONE); 919 BUG_ON(dir == DMA_NONE);
799 920
800 for_each_sg(sgl, sg, nelems, i) { 921 for_each_sg(sgl, sg, nelems, i) {
801 if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) 922 if (sg->dma_address != swiotlb_sg_to_bus(sg))
802 sync_single(hwdev, bus_to_virt(sg->dma_address), 923 sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
803 sg->dma_length, dir, target); 924 sg->dma_length, dir, target);
804 else if (dir == DMA_FROM_DEVICE) 925 else if (dir == DMA_FROM_DEVICE)
805 dma_mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length); 926 dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
806 } 927 }
807} 928}
808 929
@@ -823,7 +944,7 @@ swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
823int 944int
824swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) 945swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
825{ 946{
826 return (dma_addr == virt_to_bus(io_tlb_overflow_buffer)); 947 return (dma_addr == swiotlb_virt_to_bus(io_tlb_overflow_buffer));
827} 948}
828 949
829/* 950/*
@@ -835,7 +956,7 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
835int 956int
836swiotlb_dma_supported(struct device *hwdev, u64 mask) 957swiotlb_dma_supported(struct device *hwdev, u64 mask)
837{ 958{
838 return virt_to_bus(io_tlb_end - 1) <= mask; 959 return swiotlb_virt_to_bus(io_tlb_end - 1) <= mask;
839} 960}
840 961
841EXPORT_SYMBOL(swiotlb_map_single); 962EXPORT_SYMBOL(swiotlb_map_single);
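
[Throughout the file, direct virt_to_bus()/bus_to_virt() calls are replaced by swiotlb_virt_to_bus()/swiotlb_bus_to_virt() wrappers defined elsewhere in this series, giving architectures whose bus addresses are not simply physical addresses a hook to override the conversion. Below is a userspace sketch of the weak-default pattern such hooks typically use with GCC/Clang on ELF; the function name and identity conversion are invented for illustration, not the kernel's actual helpers.]

#include <stdio.h>
#include <stdint.h>

/*
 * Weak default: identity conversion.  Another translation unit (an
 * "architecture") can provide a strong definition to override it.
 */
__attribute__((weak)) uint64_t phys_to_bus(uint64_t paddr)
{
	return paddr;
}

int main(void)
{
	printf("bus address for 0x1000: 0x%llx\n",
	       (unsigned long long)phys_to_bus(0x1000));
	return 0;
}
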
diff --git a/mm/memory.c b/mm/memory.c
index f01b7eed6e16..0a2010a9518c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3075,3 +3075,18 @@ void print_vma_addr(char *prefix, unsigned long ip)
3075 } 3075 }
3076 up_read(&current->mm->mmap_sem); 3076 up_read(&current->mm->mmap_sem);
3077} 3077}
3078
3079#ifdef CONFIG_PROVE_LOCKING
3080void might_fault(void)
3081{
3082 might_sleep();
3083 /*
3084 * it would be nicer only to annotate paths which are not under
3085 * pagefault_disable, however that requires a larger audit and
3086 * providing helpers like get_user_atomic.
3087 */
3088 if (!in_atomic() && current->mm)
3089 might_lock_read(&current->mm->mmap_sem);
3090}
3091EXPORT_SYMBOL(might_fault);
3092#endif
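
[The new might_fault() tells lockdep that a user-space access may end up taking mmap_sem for reading, even on calls where no fault actually occurs, by annotating the possible acquisition up front via might_lock_read() whenever the caller is not in atomic context. Below is a rough userspace analogue of that "might take this lock" idea, using a pthread rwlock and a debug macro that briefly acquires and releases it so ordering problems surface on every run; this illustrates the annotation concept only and is not the kernel's lockdep machinery.]

#include <pthread.h>
#include <stdio.h>

/* Debug-build stand-in for might_lock_read(): actually take and drop the
 * lock so any deadlock/ordering problem shows up deterministically. */
#ifndef NDEBUG
#define might_lock_read(l)				\
	do {						\
		pthread_rwlock_rdlock(l);		\
		pthread_rwlock_unlock(l);		\
	} while (0)
#else
#define might_lock_read(l)	do { } while (0)
#endif

static pthread_rwlock_t mmap_lock = PTHREAD_RWLOCK_INITIALIZER;

/* A copy_to_user()-style helper: it *might* fault and take mmap_lock. */
static void copy_to_user_sketch(void)
{
	might_lock_read(&mmap_lock);	/* annotate the possible acquisition */
	/* ... fast path that usually never touches the lock ... */
}

int main(void)
{
	copy_to_user_sketch();
	printf("annotation exercised without an actual fault\n");
	return 0;
}
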