author    Linus Torvalds <torvalds@linux-foundation.org>    2009-06-16 16:09:51 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2009-06-16 16:09:51 -0400
commit    b3fec0fe35a4ff048484f1408385a27695d4273b (patch)
tree      088c23f098421ea681d9976a83aad73d15be1027
parent    e1f5b94fd0c93c3e27ede88b7ab652d086dc960f (diff)
parent    722f2a6c87f34ee0fd0130a8cf45f81e0705594a (diff)
Merge branch 'for-linus2' of git://git.kernel.org/pub/scm/linux/kernel/git/vegard/kmemcheck
* 'for-linus2' of git://git.kernel.org/pub/scm/linux/kernel/git/vegard/kmemcheck: (39 commits)
  signal: fix __send_signal() false positive kmemcheck warning
  fs: fix do_mount_root() false positive kmemcheck warning
  fs: introduce __getname_gfp()
  trace: annotate bitfields in struct ring_buffer_event
  net: annotate struct sock bitfield
  c2port: annotate bitfield for kmemcheck
  net: annotate inet_timewait_sock bitfields
  ieee1394/csr1212: fix false positive kmemcheck report
  ieee1394: annotate bitfield
  net: annotate bitfields in struct inet_sock
  net: use kmemcheck bitfields API for skbuff
  kmemcheck: introduce bitfield API
  kmemcheck: add opcode self-testing at boot
  x86: unify pte_hidden
  x86: make _PAGE_HIDDEN conditional
  kmemcheck: make kconfig accessible for other architectures
  kmemcheck: enable in the x86 Kconfig
  kmemcheck: add hooks for the page allocator
  kmemcheck: add hooks for page- and sg-dma-mappings
  kmemcheck: don't track page tables
  ...
-rw-r--r--  Documentation/kmemcheck.txt  773
-rw-r--r--  MAINTAINERS  8
-rw-r--r--  arch/x86/Kconfig  1
-rw-r--r--  arch/x86/Makefile  5
-rw-r--r--  arch/x86/include/asm/dma-mapping.h  7
-rw-r--r--  arch/x86/include/asm/kmemcheck.h  42
-rw-r--r--  arch/x86/include/asm/pgtable.h  5
-rw-r--r--  arch/x86/include/asm/pgtable_types.h  9
-rw-r--r--  arch/x86/include/asm/string_32.h  8
-rw-r--r--  arch/x86/include/asm/string_64.h  8
-rw-r--r--  arch/x86/include/asm/thread_info.h  4
-rw-r--r--  arch/x86/include/asm/xor.h  5
-rw-r--r--  arch/x86/kernel/cpu/intel.c  23
-rw-r--r--  arch/x86/kernel/process.c  2
-rw-r--r--  arch/x86/kernel/stacktrace.c  7
-rw-r--r--  arch/x86/kernel/traps.c  5
-rw-r--r--  arch/x86/mm/Makefile  2
-rw-r--r--  arch/x86/mm/fault.c  18
-rw-r--r--  arch/x86/mm/init.c  2
-rw-r--r--  arch/x86/mm/init_32.c  2
-rw-r--r--  arch/x86/mm/init_64.c  4
-rw-r--r--  arch/x86/mm/kmemcheck/Makefile  1
-rw-r--r--  arch/x86/mm/kmemcheck/error.c  228
-rw-r--r--  arch/x86/mm/kmemcheck/error.h  15
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c  640
-rw-r--r--  arch/x86/mm/kmemcheck/opcode.c  106
-rw-r--r--  arch/x86/mm/kmemcheck/opcode.h  9
-rw-r--r--  arch/x86/mm/kmemcheck/pte.c  22
-rw-r--r--  arch/x86/mm/kmemcheck/pte.h  10
-rw-r--r--  arch/x86/mm/kmemcheck/selftest.c  69
-rw-r--r--  arch/x86/mm/kmemcheck/selftest.h  6
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.c  162
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.h  16
-rw-r--r--  arch/x86/mm/pageattr.c  2
-rw-r--r--  arch/x86/mm/pgtable.c  12
-rw-r--r--  crypto/xor.c  7
-rw-r--r--  drivers/ieee1394/csr1212.c  2
-rw-r--r--  drivers/ieee1394/nodemgr.c  5
-rw-r--r--  drivers/misc/c2port/core.c  2
-rw-r--r--  include/linux/c2port.h  3
-rw-r--r--  include/linux/fs.h  5
-rw-r--r--  include/linux/gfp.h  14
-rw-r--r--  include/linux/interrupt.h  14
-rw-r--r--  include/linux/kmemcheck.h  153
-rw-r--r--  include/linux/mm_types.h  8
-rw-r--r--  include/linux/ring_buffer.h  4
-rw-r--r--  include/linux/skbuff.h  7
-rw-r--r--  include/linux/slab.h  7
-rw-r--r--  include/linux/slab_def.h  81
-rw-r--r--  include/linux/stacktrace.h  3
-rw-r--r--  include/net/inet_sock.h  14
-rw-r--r--  include/net/inet_timewait_sock.h  5
-rw-r--r--  include/net/sock.h  2
-rw-r--r--  init/do_mounts.c  3
-rw-r--r--  init/main.c  1
-rw-r--r--  kernel/fork.c  14
-rw-r--r--  kernel/signal.c  11
-rw-r--r--  kernel/softirq.c  11
-rw-r--r--  kernel/sysctl.c  12
-rw-r--r--  kernel/trace/ring_buffer.c  3
-rw-r--r--  lib/Kconfig.debug  6
-rw-r--r--  lib/Kconfig.kmemcheck  91
-rw-r--r--  mm/Kconfig.debug  1
-rw-r--r--  mm/Makefile  1
-rw-r--r--  mm/kmemcheck.c  122
-rw-r--r--  mm/page_alloc.c  18
-rw-r--r--  mm/slab.c  108
-rw-r--r--  mm/slub.c  38
-rw-r--r--  net/core/skbuff.c  8
-rw-r--r--  net/core/sock.c  2
-rw-r--r--  net/ipv4/inet_timewait_sock.c  3
71 files changed, 2899 insertions, 128 deletions
diff --git a/Documentation/kmemcheck.txt b/Documentation/kmemcheck.txt
new file mode 100644
index 000000000000..363044609dad
--- /dev/null
+++ b/Documentation/kmemcheck.txt
@@ -0,0 +1,773 @@
1GETTING STARTED WITH KMEMCHECK
2==============================
3
4Vegard Nossum <vegardno@ifi.uio.no>
5
6
7Contents
8========
90. Introduction
101. Downloading
112. Configuring and compiling
123. How to use
133.1. Booting
143.2. Run-time enable/disable
153.3. Debugging
163.4. Annotating false positives
174. Reporting errors
185. Technical description
19
20
210. Introduction
22===============
23
24kmemcheck is a debugging feature for the Linux Kernel. More specifically, it
25is a dynamic checker that detects and warns about some uses of uninitialized
26memory.
27
28Userspace programmers might be familiar with Valgrind's memcheck. The main
29difference between memcheck and kmemcheck is that memcheck works for userspace
30programs only, and kmemcheck works for the kernel only. The implementations
31are of course vastly different. Because of this, kmemcheck is not as accurate
32as memcheck, but it turns out to be good enough in practice to discover real
33programmer errors that the compiler is not able to find through static
34analysis.
35
36Enabling kmemcheck on a kernel will probably slow it down to the extent that
37the machine will not be usable for normal workloads such as e.g. an
38interactive desktop. kmemcheck will also cause the kernel to use about twice
39as much memory as normal. For this reason, kmemcheck is strictly a debugging
40feature.
41
42
431. Downloading
44==============
45
46kmemcheck can only be downloaded using git. If you want to write patches
47against the current code, you should use the kmemcheck development branch of
 48the tip tree. It is also possible to use the linux-next tree, which
49includes the latest version of kmemcheck.
50
51Assuming that you've already cloned the linux-2.6.git repository, all you
52have to do is add the -tip tree as a remote, like this:
53
54 $ git remote add tip git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git
55
56To actually download the tree, fetch the remote:
57
58 $ git fetch tip
59
60And to check out a new local branch with the kmemcheck code:
61
62 $ git checkout -b kmemcheck tip/kmemcheck
63
64General instructions for the -tip tree can be found here:
65http://people.redhat.com/mingo/tip.git/readme.txt
66
67
682. Configuring and compiling
69============================
70
71kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of
72configuration variables must have specific settings in order for the kmemcheck
73menu to even appear in "menuconfig". These are:
74
75 o CONFIG_CC_OPTIMIZE_FOR_SIZE=n
76
77 This option is located under "General setup" / "Optimize for size".
78
79 Without this, gcc will use certain optimizations that usually lead to
80 false positive warnings from kmemcheck. An example of this is a 16-bit
81 field in a struct, where gcc may load 32 bits, then discard the upper
82 16 bits. kmemcheck sees only the 32-bit load, and may trigger a
 83	warning for the upper 16 bits (if they're uninitialized); see the sketch after this list.
84
85 o CONFIG_SLAB=y or CONFIG_SLUB=y
86
87 This option is located under "General setup" / "Choose SLAB
88 allocator".
89
90 o CONFIG_FUNCTION_TRACER=n
91
92 This option is located under "Kernel hacking" / "Tracers" / "Kernel
93 Function Tracer"
94
95 When function tracing is compiled in, gcc emits a call to another
96 function at the beginning of every function. This means that when the
97 page fault handler is called, the ftrace framework will be called
98 before kmemcheck has had a chance to handle the fault. If ftrace then
99 modifies memory that was tracked by kmemcheck, the result is an
100 endless recursive page fault.
101
102 o CONFIG_DEBUG_PAGEALLOC=n
103
104 This option is located under "Kernel hacking" / "Debug page memory
105 allocations".
106
107In addition, I highly recommend turning on CONFIG_DEBUG_INFO=y. This is also
108located under "Kernel hacking". With this, you will be able to get line number
109information from the kmemcheck warnings, which is extremely valuable in
110debugging a problem. This option is not mandatory, however, because it slows
111down the compilation process and produces a much bigger kernel image.
112
113Now the kmemcheck menu should be visible (under "Kernel hacking" / "kmemcheck:
114trap use of uninitialized memory"). Here follows a description of the
115kmemcheck configuration variables:
116
117 o CONFIG_KMEMCHECK
118
119 This must be enabled in order to use kmemcheck at all...
120
121 o CONFIG_KMEMCHECK_[DISABLED | ENABLED | ONESHOT]_BY_DEFAULT
122
123 This option controls the status of kmemcheck at boot-time. "Enabled"
124 will enable kmemcheck right from the start, "disabled" will boot the
125 kernel as normal (but with the kmemcheck code compiled in, so it can
126 be enabled at run-time after the kernel has booted), and "one-shot" is
127 a special mode which will turn kmemcheck off automatically after
128 detecting the first use of uninitialized memory.
129
130 If you are using kmemcheck to actively debug a problem, then you
131 probably want to choose "enabled" here.
132
133 The one-shot mode is mostly useful in automated test setups because it
134 can prevent floods of warnings and increase the chances of the machine
135 surviving in case something is really wrong. In other cases, the one-
136 shot mode could actually be counter-productive because it would turn
137 itself off at the very first error -- in the case of a false positive
 138	too -- and this would get in the way of debugging the specific
139 problem you were interested in.
140
141 If you would like to use your kernel as normal, but with a chance to
142 enable kmemcheck in case of some problem, it might be a good idea to
143 choose "disabled" here. When kmemcheck is disabled, most of the run-
144 time overhead is not incurred, and the kernel will be almost as fast
145 as normal.
146
147 o CONFIG_KMEMCHECK_QUEUE_SIZE
148
149 Select the maximum number of error reports to store in an internal
150 (fixed-size) buffer. Since errors can occur virtually anywhere and in
151 any context, we need a temporary storage area which is guaranteed not
152 to generate any other page faults when accessed. The queue will be
153 emptied as soon as a tasklet may be scheduled. If the queue is full,
154 new error reports will be lost.
155
156 The default value of 64 is probably fine. If some code produces more
157 than 64 errors within an irqs-off section, then the code is likely to
158 produce many, many more, too, and these additional reports seldom give
159 any more information (the first report is usually the most valuable
160 anyway).
161
162 This number might have to be adjusted if you are not using serial
163 console or similar to capture the kernel log. If you are using the
164 "dmesg" command to save the log, then getting a lot of kmemcheck
165 warnings might overflow the kernel log itself, and the earlier reports
166 will get lost in that way instead. Try setting this to 10 or so on
167 such a setup.
168
169 o CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT
170
171 Select the number of shadow bytes to save along with each entry of the
172 error-report queue. These bytes indicate what parts of an allocation
173 are initialized, uninitialized, etc. and will be displayed when an
174 error is detected to help the debugging of a particular problem.
175
176 The number entered here is actually the logarithm of the number of
177 bytes that will be saved. So if you pick for example 5 here, kmemcheck
178 will save 2^5 = 32 bytes.
179
180 The default value should be fine for debugging most problems. It also
181 fits nicely within 80 columns.
182
183 o CONFIG_KMEMCHECK_PARTIAL_OK
184
185 This option (when enabled) works around certain GCC optimizations that
186 produce 32-bit reads from 16-bit variables where the upper 16 bits are
187 thrown away afterwards.
188
189 The default value (enabled) is recommended. This may of course hide
190 some real errors, but disabling it would probably produce a lot of
191 false positives.
192
193 o CONFIG_KMEMCHECK_BITOPS_OK
194
195 This option silences warnings that would be generated for bit-field
196 accesses where not all the bits are initialized at the same time. This
197 may also hide some real bugs.
198
199 This option is probably obsolete, or it should be replaced with
200 the kmemcheck-/bitfield-annotations for the code in question. The
201 default value is therefore fine.
202
203Now compile the kernel as usual.
204
205
2063. How to use
207=============
208
2093.1. Booting
210============
211
212First some information about the command-line options. There is only one
213option specific to kmemcheck, and this is called "kmemcheck". It can be used
214to override the default mode as chosen by the CONFIG_KMEMCHECK_*_BY_DEFAULT
215option. Its possible settings are:
216
217 o kmemcheck=0 (disabled)
218 o kmemcheck=1 (enabled)
219 o kmemcheck=2 (one-shot mode)
220
221If SLUB debugging has been enabled in the kernel, it may take precedence over
222kmemcheck in such a way that the slab caches which are under SLUB debugging
223will not be tracked by kmemcheck. In order to ensure that this doesn't happen
224(even though it shouldn't by default), use SLUB's boot option "slub_debug",
225like this: slub_debug=-
226
227In fact, this option may also be used for fine-grained control over SLUB vs.
228kmemcheck. For example, if the command line includes "kmemcheck=1
229slub_debug=,dentry", then SLUB debugging will be used only for the "dentry"
230slab cache, and with kmemcheck tracking all the other caches. This is advanced
231usage, however, and is not generally recommended.
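For example, a command line that enables kmemcheck from boot and keeps SLUB
debugging out of the way for all caches could include (illustrative only; add
your usual boot parameters as well):

	kmemcheck=1 slub_debug=-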
232
233
2343.2. Run-time enable/disable
235============================
236
237When the kernel has booted, it is possible to enable or disable kmemcheck at
238run-time. WARNING: This feature is still experimental and may cause false
239positive warnings to appear. Therefore, try not to use this. If you find that
240it doesn't work properly (e.g. you see an unreasonable amount of warnings), I
241will be happy to take bug reports.
242
243Use the file /proc/sys/kernel/kmemcheck for this purpose, e.g.:
244
245 $ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck
246
247The numbers are the same as for the kmemcheck= command-line option.
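Correspondingly, to enable kmemcheck or switch to one-shot mode at run-time
(the values mirror the command-line option above):

	$ echo 1 > /proc/sys/kernel/kmemcheck	# enables kmemcheck
	$ echo 2 > /proc/sys/kernel/kmemcheck	# one-shot mode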
248
249
2503.3. Debugging
251==============
252
253A typical report will look something like this:
254
255WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
25680000000000000000000000000000000000000000088ffff0000000000000000
257 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
258 ^
259
260Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A
261RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
262RSP: 0018:ffff88003cdf7d98 EFLAGS: 00210002
263RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
264RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84
265RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000
266R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e
267R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8
268FS: 0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000
269CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033
270CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0
271DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
272DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
273 [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
274 [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
275 [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
276 [<ffffffff8100c7b5>] int_signal+0x12/0x17
277 [<ffffffffffffffff>] 0xffffffffffffffff
278
279The single most valuable information in this report is the RIP (or EIP on 32-
280bit) value. This will help us pinpoint exactly which instruction caused
281the warning.
282
283If your kernel was compiled with CONFIG_DEBUG_INFO=y, then all we have to do
284is give this address to the addr2line program, like this:
285
286 $ addr2line -e vmlinux -i ffffffff8104ede8
287 arch/x86/include/asm/string_64.h:12
288 include/asm-generic/siginfo.h:287
289 kernel/signal.c:380
290 kernel/signal.c:410
291
292The "-e vmlinux" tells addr2line which file to look in. IMPORTANT: This must
293be the vmlinux of the kernel that produced the warning in the first place! If
294not, the line number information will almost certainly be wrong.
295
296The "-i" tells addr2line to also print the line numbers of inlined functions.
297In this case, the flag was very important, because otherwise, it would only
298have printed the first line, which is just a call to memcpy(), which could be
299called from a thousand places in the kernel, and is therefore not very useful.
300These inlined functions would not show up in the stack trace above, simply
301because the kernel doesn't load the extra debugging information. This
302technique can of course be used with ordinary kernel oopses as well.
303
304In this case, it's the caller of memcpy() that is interesting, and it can be
305found in include/asm-generic/siginfo.h, line 287:
306
307281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
308282 {
309283 if (from->si_code < 0)
310284 memcpy(to, from, sizeof(*to));
311285 else
312286 /* _sigchld is currently the largest know union member */
313287 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld));
314288 }
315
316Since this was a read (kmemcheck usually warns about reads only, though it can
317warn about writes to unallocated or freed memory as well), it was probably the
318"from" argument which contained some uninitialized bytes. Following the chain
319of calls, we move upwards to see where "from" was allocated or initialized,
320kernel/signal.c, line 380:
321
322359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
323360 {
324...
325367 list_for_each_entry(q, &list->list, list) {
326368 if (q->info.si_signo == sig) {
327369 if (first)
328370 goto still_pending;
329371 first = q;
330...
331377 if (first) {
332378 still_pending:
333379 list_del_init(&first->list);
334380 copy_siginfo(info, &first->info);
335381 __sigqueue_free(first);
336...
337392 }
338393 }
339
340Here, it is &first->info that is being passed on to copy_siginfo(). The
341variable "first" was found on a list -- passed in as the second argument to
342collect_signal(). We continue our journey through the stack, to figure out
343where the item on "list" was allocated or initialized. We move to line 410:
344
345395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
346396 siginfo_t *info)
347397 {
348...
349410 collect_signal(sig, pending, info);
350...
351414 }
352
353Now we need to follow the "pending" pointer, since that is being passed on to
354collect_signal() as "list". At this point, we've run out of lines from the
355"addr2line" output. Not to worry, we just paste the next addresses from the
356kmemcheck stack dump, i.e.:
357
358 [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
359 [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
360 [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
361 [<ffffffff8100c7b5>] int_signal+0x12/0x17
362
363 $ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \
364 ffffffff8100b87d ffffffff8100c7b5
365 kernel/signal.c:446
366 kernel/signal.c:1806
367 arch/x86/kernel/signal.c:805
368 arch/x86/kernel/signal.c:871
369 arch/x86/kernel/entry_64.S:694
370
371Remember that since these addresses were found on the stack and not as the
372RIP value, they actually point to the _next_ instruction (they are return
373addresses). This becomes obvious when we look at the code for line 446:
374
375422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
376423 {
377...
378431 signr = __dequeue_signal(&tsk->signal->shared_pending,
379432 mask, info);
380433 /*
381434 * itimer signal ?
382435 *
383436 * itimers are process shared and we restart periodic
384437 * itimers in the signal delivery path to prevent DoS
385438 * attacks in the high resolution timer case. This is
386439 * compliant with the old way of self restarting
387440 * itimers, as the SIGALRM is a legacy signal and only
388441 * queued once. Changing the restart behaviour to
389442 * restart the timer in the signal dequeue path is
390443 * reducing the timer noise on heavy loaded !highres
391444 * systems too.
392445 */
393446 if (unlikely(signr == SIGALRM)) {
394...
395489 }
396
397So instead of looking at 446, we should be looking at 431, which is the line
398that executes just before 446. Here we see that what we are looking for is
399&tsk->signal->shared_pending.
400
401Our next task is now to figure out which function puts items on this
402"shared_pending" list. A crude but efficient tool is git grep:
403
404 $ git grep -n 'shared_pending' kernel/
405 ...
406 kernel/signal.c:828: pending = group ? &t->signal->shared_pending : &t->pending;
407 kernel/signal.c:1339: pending = group ? &t->signal->shared_pending : &t->pending;
408 ...
409
410There were more results, but none of them were related to list operations,
411and these were the only assignments. We inspect the line numbers more closely
412and find that this is indeed where items are being added to the list:
413
414816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
415817 int group)
416818 {
417...
418828 pending = group ? &t->signal->shared_pending : &t->pending;
419...
420851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
421852 (is_si_special(info) ||
422853 info->si_code >= 0)));
423854 if (q) {
424855 list_add_tail(&q->list, &pending->list);
425...
426890 }
427
428and:
429
4301309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
4311310 {
432....
4331339 pending = group ? &t->signal->shared_pending : &t->pending;
4341340 list_add_tail(&q->list, &pending->list);
435....
4361347 }
437
438In the first case, the list element we are looking for, "q", is being returned
439from the function __sigqueue_alloc(), which looks like an allocation function.
440Let's take a look at it:
441
442187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
443188 int override_rlimit)
444189 {
445190 struct sigqueue *q = NULL;
446191 struct user_struct *user;
447192
448193 /*
449194 * We won't get problems with the target's UID changing under us
450195 * because changing it requires RCU be used, and if t != current, the
451196 * caller must be holding the RCU readlock (by way of a spinlock) and
452197 * we use RCU protection here
453198 */
454199 user = get_uid(__task_cred(t)->user);
455200 atomic_inc(&user->sigpending);
456201 if (override_rlimit ||
457202 atomic_read(&user->sigpending) <=
458203 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
459204 q = kmem_cache_alloc(sigqueue_cachep, flags);
460205 if (unlikely(q == NULL)) {
461206 atomic_dec(&user->sigpending);
462207 free_uid(user);
463208 } else {
464209 INIT_LIST_HEAD(&q->list);
465210 q->flags = 0;
466211 q->user = user;
467212 }
468213
469214 return q;
470215 }
471
472We see that this function initializes q->list, q->flags, and q->user. It seems
473that now is the time to look at the definition of "struct sigqueue", e.g.:
474
47514 struct sigqueue {
47615 struct list_head list;
47716 int flags;
47817 siginfo_t info;
47918 struct user_struct *user;
48019 };
481
482And, you might remember, it was a memcpy() on &first->info that caused the
483warning, so this makes perfect sense. It also seems reasonable to assume that
484it is the caller of __sigqueue_alloc() that has the responsibility of filling
485out (initializing) this member.
486
487But just which fields of the struct were uninitialized? Let's look at
488kmemcheck's report again:
489
490WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
49180000000000000000000000000000000000000000088ffff0000000000000000
492 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
493 ^
494
495These first two lines are the memory dump of the memory object itself, and the
496shadow bytemap, respectively. The memory object itself is in this case
497&first->info. Just beware that the start of this dump is NOT the start of the
498object itself! The position of the caret (^) corresponds with the address of
499the read (ffff88003e4a2024).
500
501The shadow bytemap dump legend is as follows:
502
503 i - initialized
504 u - uninitialized
505 a - unallocated (memory has been allocated by the slab layer, but has not
506 yet been handed off to anybody)
507 f - freed (memory has been allocated by the slab layer, but has been freed
508 by the previous owner)
509
510In order to figure out where (relative to the start of the object) the
511uninitialized memory was located, we have to look at the disassembly. For
512that, we'll need the RIP address again:
513
514RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
515
516 $ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8:
517 ffffffff8104edc8: mov %r8,0x8(%r8)
518 ffffffff8104edcc: test %r10d,%r10d
519 ffffffff8104edcf: js ffffffff8104ee88 <__dequeue_signal+0x168>
520 ffffffff8104edd5: mov %rax,%rdx
521 ffffffff8104edd8: mov $0xc,%ecx
522 ffffffff8104eddd: mov %r13,%rdi
523 ffffffff8104ede0: mov $0x30,%eax
524 ffffffff8104ede5: mov %rdx,%rsi
525 ffffffff8104ede8: rep movsl %ds:(%rsi),%es:(%rdi)
526 ffffffff8104edea: test $0x2,%al
527 ffffffff8104edec: je ffffffff8104edf0 <__dequeue_signal+0xd0>
528 ffffffff8104edee: movsw %ds:(%rsi),%es:(%rdi)
529 ffffffff8104edf0: test $0x1,%al
530 ffffffff8104edf2: je ffffffff8104edf5 <__dequeue_signal+0xd5>
531 ffffffff8104edf4: movsb %ds:(%rsi),%es:(%rdi)
532 ffffffff8104edf5: mov %r8,%rdi
533 ffffffff8104edf8: callq ffffffff8104de60 <__sigqueue_free>
534
535As expected, it's the "rep movsl" instruction from the memcpy() that causes
536the warning. We know that REP MOVSL uses the register RCX to count
537the number of remaining iterations. By taking a look at the register dump
538again (from the kmemcheck report), we can figure out how many bytes were left
539to copy:
540
541RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
542
543By looking at the disassembly, we also see that %ecx is being loaded with the
544value $0xc just before (ffffffff8104edd8), so we are very lucky. Keep in mind
545that this is the number of iterations, not bytes. And since this is a "long"
546operation, we need to multiply by 4 to get the number of bytes. So this means
547that the uninitialized value was encountered at 4 * (0xc - 0x9) = 12 bytes
548from the start of the object.
549
550We can now try to figure out which field of the "struct siginfo" was not
551initialized. This is the beginning of the struct:
552
55340 typedef struct siginfo {
55441 int si_signo;
55542 int si_errno;
55643 int si_code;
55744
55845 union {
559..
56092 } _sifields;
56193 } siginfo_t;
562
563On 64-bit, the int is 4 bytes long, so it must be the union member that has
564not been initialized. We can verify this using gdb:
565
566 $ gdb vmlinux
567 ...
568 (gdb) p &((struct siginfo *) 0)->_sifields
569 $1 = (union {...} *) 0x10
570
571Actually, it seems that the union member is located at offset 0x10 -- which
572means that gcc has inserted 4 bytes of padding between the members si_code
573and _sifields. We can now get a fuller picture of the memory dump:
574
575 _----------------------------=> si_code
576 / _--------------------=> (padding)
577 | / _------------=> _sifields(._kill._pid)
578 | | / _----=> _sifields(._kill._uid)
579 | | | /
580-------|-------|-------|-------|
58180000000000000000000000000000000000000000088ffff0000000000000000
582 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
583
584This allows us to realize another important fact: si_code contains the value
5850x80. Remember that x86 is little endian, so the first 4 bytes "80000000" are
586really the number 0x00000080. With a bit of research, we find that this is
587actually the constant SI_KERNEL defined in include/asm-generic/siginfo.h:
588
589144 #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */
590
591This macro is used in exactly one place in the x86 kernel: in send_signal()
592in kernel/signal.c:
593
594816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
595817 int group)
596818 {
597...
598828 pending = group ? &t->signal->shared_pending : &t->pending;
599...
600851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
601852 (is_si_special(info) ||
602853 info->si_code >= 0)));
603854 if (q) {
604855 list_add_tail(&q->list, &pending->list);
605856 switch ((unsigned long) info) {
606...
607865 case (unsigned long) SEND_SIG_PRIV:
608866 q->info.si_signo = sig;
609867 q->info.si_errno = 0;
610868 q->info.si_code = SI_KERNEL;
611869 q->info.si_pid = 0;
612870 q->info.si_uid = 0;
613871 break;
614...
615890 }
616
617Not only does this match the .si_code member, it also matches the place
618we found earlier when looking for where siginfo_t objects are enqueued on the
619"shared_pending" list.
620
621So to sum up: It seems that it is the padding introduced by the compiler
622between two struct fields that is uninitialized, and this gets reported when
623we do a memcpy() on the struct. This means that we have identified a false
624positive warning.
625
626Normally, kmemcheck will not report uninitialized accesses in memcpy() calls
627when both the source and destination addresses are tracked. (Instead, we copy
628the shadow bytemap as well). In this case, the destination address clearly
629was not tracked. We can dig a little deeper into the stack trace from above:
630
631 arch/x86/kernel/signal.c:805
632 arch/x86/kernel/signal.c:871
633 arch/x86/kernel/entry_64.S:694
634
635And we clearly see that the destination siginfo object is located on the
636stack:
637
638782 static void do_signal(struct pt_regs *regs)
639783 {
640784 struct k_sigaction ka;
641785 siginfo_t info;
642...
643804 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
644...
645854 }
646
647And this &info is what eventually gets passed to copy_siginfo() as the
648destination argument.
649
650Now, even though we didn't find an actual error here, the example is still a
651good one, because it shows how one would go about finding out what the report
652was all about.
653
654
6553.4. Annotating false positives
656===============================
657
658There are a few different ways to make annotations in the source code that
659will keep kmemcheck from checking and reporting certain allocations. Here
660they are:
661
662 o __GFP_NOTRACK_FALSE_POSITIVE
663
664 This flag can be passed to kmalloc() or kmem_cache_alloc() (therefore
665 also to other functions that end up calling one of these) to indicate
666 that the allocation should not be tracked because it would lead to
667 a false positive report. This is a "big hammer" way of silencing
668 kmemcheck; after all, even if the false positive pertains to
 669	kmemcheck; after all, even if the false positive pertains to a
 670	particular field in a struct, for example, we will now lose the
671
672 Example:
673
674 /* No warnings will ever trigger on accessing any part of x */
675 x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE);
676
677 o kmemcheck_bitfield_begin(name)/kmemcheck_bitfield_end(name) and
678 kmemcheck_annotate_bitfield(ptr, name)
679
680 The first two of these three macros can be used inside struct
681 definitions to signal, respectively, the beginning and end of a
682 bitfield. Additionally, this will assign the bitfield a name, which
683 is given as an argument to the macros.
684
685 Having used these markers, one can later use
686 kmemcheck_annotate_bitfield() at the point of allocation, to indicate
 687	which parts of the allocation are part of a bitfield.
688
689 Example:
690
691 struct foo {
692 int x;
693
694 kmemcheck_bitfield_begin(flags);
695 int flag_a:1;
696 int flag_b:1;
697 kmemcheck_bitfield_end(flags);
698
699 int y;
700 };
701
 702	struct foo *x = kmalloc(sizeof *x, GFP_KERNEL);
703
704 /* No warnings will trigger on accessing the bitfield of x */
705 kmemcheck_annotate_bitfield(x, flags);
706
707 Note that kmemcheck_annotate_bitfield() can be used even before the
708 return value of kmalloc() is checked -- in other words, passing NULL
709 as the first argument is legal (and will do nothing).
710
711
7124. Reporting errors
713===================
714
715As we have seen, kmemcheck will produce false positive reports. Therefore, it
716is not very wise to blindly post kmemcheck warnings to mailing lists and
717maintainers. Instead, I encourage maintainers and developers to find errors
718in their own code. If you get a warning, you can try to work around it, try
719to figure out if it's a real error or not, or simply ignore it. Most
720developers know their own code and will quickly and efficiently determine the
721root cause of a kmemcheck report. This is therefore also the most efficient
722way to work with kmemcheck.
723
724That said, we (the kmemcheck maintainers) will always be on the lookout for
725false positives that we can annotate and silence. So whatever you find,
726please drop us a note privately! Kernel configs and steps to reproduce (if
727available) are of course a great help too.
728
729Happy hacking!
730
731
7325. Technical description
733========================
734
735kmemcheck works by marking memory pages non-present. This means that whenever
736somebody attempts to access the page, a page fault is generated. The page
737fault handler notices that the page was in fact only hidden, and so it calls
738on the kmemcheck code to make further investigations.
739
740When the investigations are completed, kmemcheck "shows" the page by marking
741it present (as it would be under normal circumstances). This way, the
742interrupted code can continue as usual.
743
744But after the instruction has been executed, we should hide the page again, so
745that we can catch the next access too! Now kmemcheck makes use of a debugging
746feature of the processor, namely single-stepping. When the processor has
747finished the one instruction that generated the memory access, a debug
748exception is raised. From here, we simply hide the page again and continue
749execution, this time with the single-stepping feature turned off.
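In rough pseudocode, the cycle described above looks something like this. This
is a simplified sketch only, not the actual implementation; the hook names are
the ones declared in arch/x86/include/asm/kmemcheck.h and used by the page
fault and debug exception handlers in this patch set:

	/* Page fault (#PF) handler, sketch */
	if (kmemcheck_fault(regs, address, error_code)) {
		/*
		 * The page was only hidden by kmemcheck: consult the shadow
		 * memory, queue an error report if the access touched
		 * uninitialized bytes, "show" the page again, and arrange
		 * for a single-step trap after the instruction completes.
		 */
		return;
	}

	/* Debug exception (#DB) handler, sketch */
	if (condition & DR_STEP && kmemcheck_trap(regs)) {
		/* Hide the page again and turn single-stepping back off. */
		return;
	}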
750
751kmemcheck requires some assistance from the memory allocator in order to work.
752The memory allocator needs to
753
754 1. Tell kmemcheck about newly allocated pages and pages that are about to
755 be freed. This allows kmemcheck to set up and tear down the shadow memory
756 for the pages in question. The shadow memory stores the status of each
757 byte in the allocation proper, e.g. whether it is initialized or
758 uninitialized.
759
760 2. Tell kmemcheck which parts of memory should be marked uninitialized.
761 There are actually a few more states, such as "not yet allocated" and
762 "recently freed".
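The shadow states correspond to the legend used in the reports (see section
3.3). The state names below are the ones used by the error-reporting code in
this patch set; the numeric order shown here is only illustrative, see
arch/x86/mm/kmemcheck/shadow.h for the real definition:

	enum kmemcheck_shadow {
		KMEMCHECK_SHADOW_UNALLOCATED,	/* 'a' in the report */
		KMEMCHECK_SHADOW_UNINITIALIZED,	/* 'u' */
		KMEMCHECK_SHADOW_INITIALIZED,	/* 'i' */
		KMEMCHECK_SHADOW_FREED,		/* 'f' */
	};

The allocator (and a few other places, such as the DMA mapping code in this
patch set) updates the shadow with helpers like
kmemcheck_mark_initialized(ptr, size).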
763
764If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
765memory that can take page faults because of kmemcheck.
766
767If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
768request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags.
769However, this does not prevent the page faults from occurring; it merely marks
770the object in question as being initialized so that no warnings will ever be
771produced for this object.
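Two concrete examples of these annotations appear in the diffs included in
this merge: arch/x86/kernel/process.c opts the "task_xstate" cache out of
tracking entirely, and arch/x86/include/asm/thread_info.h passes __GFP_NOTRACK
for thread-info allocations:

	/* arch/x86/kernel/process.c */
	task_xstate_cachep =
		kmem_cache_create("task_xstate", xstate_size,
				  __alignof__(union thread_xstate),
				  SLAB_PANIC | SLAB_NOTRACK, NULL);

	/* arch/x86/include/asm/thread_info.h */
	#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK)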
772
773Currently, the SLAB and SLUB allocators are supported by kmemcheck.
diff --git a/MAINTAINERS b/MAINTAINERS
index 685784cc023b..af8ef6527f22 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3406,6 +3406,14 @@ F: drivers/serial/kgdboc.c
3406F: include/linux/kgdb.h 3406F: include/linux/kgdb.h
3407F: kernel/kgdb.c 3407F: kernel/kgdb.c
3408 3408
3409KMEMCHECK
3410P: Vegard Nossum
3411M: vegardno@ifi.uio.no
3412P: Pekka Enberg
3413M: penberg@cs.helsinki.fi
3414L: linux-kernel@vger.kernel.org
3415S: Maintained
3416
3409KMEMLEAK 3417KMEMLEAK
3410P: Catalin Marinas 3418P: Catalin Marinas
3411M: catalin.marinas@arm.com 3419M: catalin.marinas@arm.com
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 356d2ec8e2fb..cf42fc305419 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -46,6 +46,7 @@ config X86
46 select HAVE_KERNEL_GZIP 46 select HAVE_KERNEL_GZIP
47 select HAVE_KERNEL_BZIP2 47 select HAVE_KERNEL_BZIP2
48 select HAVE_KERNEL_LZMA 48 select HAVE_KERNEL_LZMA
49 select HAVE_ARCH_KMEMCHECK
49 50
50config OUTPUT_FORMAT 51config OUTPUT_FORMAT
51 string 52 string
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index edbd0ca62067..1b68659c41b4 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -81,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR
81 endif 81 endif
82endif 82endif
83 83
84# Don't unroll struct assignments with kmemcheck enabled
85ifeq ($(CONFIG_KMEMCHECK),y)
86 KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
87endif
88
84# Stackpointer is addressed different for 32 bit and 64 bit x86 89# Stackpointer is addressed different for 32 bit and 64 bit x86
85sp-$(CONFIG_X86_32) := esp 90sp-$(CONFIG_X86_32) := esp
86sp-$(CONFIG_X86_64) := rsp 91sp-$(CONFIG_X86_64) := rsp
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index f82fdc412c64..b93405b228b4 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -6,6 +6,7 @@
6 * Documentation/DMA-API.txt for documentation. 6 * Documentation/DMA-API.txt for documentation.
7 */ 7 */
8 8
9#include <linux/kmemcheck.h>
9#include <linux/scatterlist.h> 10#include <linux/scatterlist.h>
10#include <linux/dma-debug.h> 11#include <linux/dma-debug.h>
11#include <linux/dma-attrs.h> 12#include <linux/dma-attrs.h>
@@ -60,6 +61,7 @@ dma_map_single(struct device *hwdev, void *ptr, size_t size,
60 dma_addr_t addr; 61 dma_addr_t addr;
61 62
62 BUG_ON(!valid_dma_direction(dir)); 63 BUG_ON(!valid_dma_direction(dir));
64 kmemcheck_mark_initialized(ptr, size);
63 addr = ops->map_page(hwdev, virt_to_page(ptr), 65 addr = ops->map_page(hwdev, virt_to_page(ptr),
64 (unsigned long)ptr & ~PAGE_MASK, size, 66 (unsigned long)ptr & ~PAGE_MASK, size,
65 dir, NULL); 67 dir, NULL);
@@ -87,8 +89,12 @@ dma_map_sg(struct device *hwdev, struct scatterlist *sg,
87{ 89{
88 struct dma_map_ops *ops = get_dma_ops(hwdev); 90 struct dma_map_ops *ops = get_dma_ops(hwdev);
89 int ents; 91 int ents;
92 struct scatterlist *s;
93 int i;
90 94
91 BUG_ON(!valid_dma_direction(dir)); 95 BUG_ON(!valid_dma_direction(dir));
96 for_each_sg(sg, s, nents, i)
97 kmemcheck_mark_initialized(sg_virt(s), s->length);
92 ents = ops->map_sg(hwdev, sg, nents, dir, NULL); 98 ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
93 debug_dma_map_sg(hwdev, sg, nents, ents, dir); 99 debug_dma_map_sg(hwdev, sg, nents, ents, dir);
94 100
@@ -200,6 +206,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
200 dma_addr_t addr; 206 dma_addr_t addr;
201 207
202 BUG_ON(!valid_dma_direction(dir)); 208 BUG_ON(!valid_dma_direction(dir));
209 kmemcheck_mark_initialized(page_address(page) + offset, size);
203 addr = ops->map_page(dev, page, offset, size, dir, NULL); 210 addr = ops->map_page(dev, page, offset, size, dir, NULL);
204 debug_dma_map_page(dev, page, offset, size, dir, addr, false); 211 debug_dma_map_page(dev, page, offset, size, dir, addr, false);
205 212
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
new file mode 100644
index 000000000000..ed01518f297e
--- /dev/null
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -0,0 +1,42 @@
1#ifndef ASM_X86_KMEMCHECK_H
2#define ASM_X86_KMEMCHECK_H
3
4#include <linux/types.h>
5#include <asm/ptrace.h>
6
7#ifdef CONFIG_KMEMCHECK
8bool kmemcheck_active(struct pt_regs *regs);
9
10void kmemcheck_show(struct pt_regs *regs);
11void kmemcheck_hide(struct pt_regs *regs);
12
13bool kmemcheck_fault(struct pt_regs *regs,
14 unsigned long address, unsigned long error_code);
15bool kmemcheck_trap(struct pt_regs *regs);
16#else
17static inline bool kmemcheck_active(struct pt_regs *regs)
18{
19 return false;
20}
21
22static inline void kmemcheck_show(struct pt_regs *regs)
23{
24}
25
26static inline void kmemcheck_hide(struct pt_regs *regs)
27{
28}
29
30static inline bool kmemcheck_fault(struct pt_regs *regs,
31 unsigned long address, unsigned long error_code)
32{
33 return false;
34}
35
36static inline bool kmemcheck_trap(struct pt_regs *regs)
37{
38 return false;
39}
40#endif /* CONFIG_KMEMCHECK */
41
42#endif
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 18ef7ebf2631..3cc06e3fceb8 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -317,6 +317,11 @@ static inline int pte_present(pte_t a)
317 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); 317 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
318} 318}
319 319
320static inline int pte_hidden(pte_t pte)
321{
322 return pte_flags(pte) & _PAGE_HIDDEN;
323}
324
320static inline int pmd_present(pmd_t pmd) 325static inline int pmd_present(pmd_t pmd)
321{ 326{
322 return pmd_flags(pmd) & _PAGE_PRESENT; 327 return pmd_flags(pmd) & _PAGE_PRESENT;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0f..54cb697f4900 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -18,7 +18,7 @@
18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ 18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ 19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ 20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
21#define _PAGE_BIT_UNUSED3 11 21#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
@@ -41,13 +41,18 @@
41#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) 41#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
42#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) 42#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
43#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) 43#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
44#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
45#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) 44#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define __HAVE_ARCH_PTE_SPECIAL 48#define __HAVE_ARCH_PTE_SPECIAL
50 49
50#ifdef CONFIG_KMEMCHECK
51#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
52#else
53#define _PAGE_HIDDEN (_AT(pteval_t, 0))
54#endif
55
51#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 56#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
52#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) 57#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
53#else 58#else
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 0e0e3ba827f7..c86f452256de 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
177 * No 3D Now! 177 * No 3D Now!
178 */ 178 */
179 179
180#ifndef CONFIG_KMEMCHECK
180#define memcpy(t, f, n) \ 181#define memcpy(t, f, n) \
181 (__builtin_constant_p((n)) \ 182 (__builtin_constant_p((n)) \
182 ? __constant_memcpy((t), (f), (n)) \ 183 ? __constant_memcpy((t), (f), (n)) \
183 : __memcpy((t), (f), (n))) 184 : __memcpy((t), (f), (n)))
185#else
186/*
187 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
188 * because it means that we know both memory operands in advance.
189 */
190#define memcpy(t, f, n) __memcpy((t), (f), (n))
191#endif
184 192
185#endif 193#endif
186 194
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2afe164bf1e6..19e2c468fc2c 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,6 +27,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
27 function. */ 27 function. */
28 28
29#define __HAVE_ARCH_MEMCPY 1 29#define __HAVE_ARCH_MEMCPY 1
30#ifndef CONFIG_KMEMCHECK
30#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 31#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
31extern void *memcpy(void *to, const void *from, size_t len); 32extern void *memcpy(void *to, const void *from, size_t len);
32#else 33#else
@@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const void *from, size_t len);
42 __ret; \ 43 __ret; \
43}) 44})
44#endif 45#endif
46#else
47/*
48 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
49 * because it means that we know both memory operands in advance.
50 */
51#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
52#endif
45 53
46#define __HAVE_ARCH_MEMSET 54#define __HAVE_ARCH_MEMSET
47void *memset(void *s, int c, size_t n); 55void *memset(void *s, int c, size_t n);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 602c769fc98c..b0783520988b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -154,9 +154,9 @@ struct thread_info {
154 154
155/* thread information allocation */ 155/* thread information allocation */
156#ifdef CONFIG_DEBUG_STACK_USAGE 156#ifdef CONFIG_DEBUG_STACK_USAGE
157#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) 157#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
158#else 158#else
159#define THREAD_FLAGS GFP_KERNEL 159#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK)
160#endif 160#endif
161 161
162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR 162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 11b3bb86e17b..7fcf6f3dbcc3 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,5 +1,10 @@
1#ifdef CONFIG_KMEMCHECK
2/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
3# include <asm-generic/xor.h>
4#else
1#ifdef CONFIG_X86_32 5#ifdef CONFIG_X86_32
2# include "xor_32.h" 6# include "xor_32.h"
3#else 7#else
4# include "xor_64.h" 8# include "xor_64.h"
5#endif 9#endif
10#endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index daed39ba2614..3260ab044996 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
86 */ 86 */
87 if (c->x86 == 6 && c->x86_model < 15) 87 if (c->x86 == 6 && c->x86_model < 15)
88 clear_cpu_cap(c, X86_FEATURE_PAT); 88 clear_cpu_cap(c, X86_FEATURE_PAT);
89
90#ifdef CONFIG_KMEMCHECK
91 /*
92 * P4s have a "fast strings" feature which causes single-
93 * stepping REP instructions to only generate a #DB on
94 * cache-line boundaries.
95 *
96 * Ingo Molnar reported a Pentium D (model 6) and a Xeon
97 * (model 2) with the same problem.
98 */
99 if (c->x86 == 15) {
100 u64 misc_enable;
101
102 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
103
104 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
105 printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
106
107 misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
108 wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
109 }
110 }
111#endif
89} 112}
90 113
91#ifdef CONFIG_X86_32 114#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3bb2be1649bd..994dd6a4a2a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,7 +63,7 @@ void arch_task_cache_init(void)
63 task_xstate_cachep = 63 task_xstate_cachep =
64 kmem_cache_create("task_xstate", xstate_size, 64 kmem_cache_create("task_xstate", xstate_size,
65 __alignof__(union thread_xstate), 65 __alignof__(union thread_xstate),
66 SLAB_PANIC, NULL); 66 SLAB_PANIC | SLAB_NOTRACK, NULL);
67} 67}
68 68
69/* 69/*
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 4aaf7e48394f..c3eb207181fe 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace)
77} 77}
78EXPORT_SYMBOL_GPL(save_stack_trace); 78EXPORT_SYMBOL_GPL(save_stack_trace);
79 79
80void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
81{
82 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
83 if (trace->nr_entries < trace->max_entries)
84 trace->entries[trace->nr_entries++] = ULONG_MAX;
85}
86
80void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 87void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
81{ 88{
82 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); 89 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1e1e27b7d438..5f935f0d5861 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -45,6 +45,7 @@
45#include <linux/edac.h> 45#include <linux/edac.h>
46#endif 46#endif
47 47
48#include <asm/kmemcheck.h>
48#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/debugreg.h> 51#include <asm/debugreg.h>
@@ -534,6 +535,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
534 535
535 get_debugreg(condition, 6); 536 get_debugreg(condition, 6);
536 537
538 /* Catch kmemcheck conditions first of all! */
539 if (condition & DR_STEP && kmemcheck_trap(regs))
540 return;
541
537 /* 542 /*
538 * The processor cleared BTF, so don't mark that we need it set. 543 * The processor cleared BTF, so don't mark that we need it set.
539 */ 544 */
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fdd30d08ab52..eefdeee8a871 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
10 10
11obj-$(CONFIG_HIGHMEM) += highmem_32.o 11obj-$(CONFIG_HIGHMEM) += highmem_32.o
12 12
13obj-$(CONFIG_KMEMCHECK) += kmemcheck/
14
13obj-$(CONFIG_MMIOTRACE) += mmiotrace.o 15obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
14mmiotrace-y := kmmio.o pf_in.o mmio-mod.o 16mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
15obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 17obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c6acc6326374..baa0e86adfbc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -14,6 +14,7 @@
14 14
15#include <asm/traps.h> /* dotraplinkage, ... */ 15#include <asm/traps.h> /* dotraplinkage, ... */
16#include <asm/pgalloc.h> /* pgd_*(), ... */ 16#include <asm/pgalloc.h> /* pgd_*(), ... */
17#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
17 18
18/* 19/*
19 * Page fault error code bits: 20 * Page fault error code bits:
@@ -956,6 +957,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
956 /* Get the faulting address: */ 957 /* Get the faulting address: */
957 address = read_cr2(); 958 address = read_cr2();
958 959
960 /*
961 * Detect and handle instructions that would cause a page fault for
962 * both a tracked kernel page and a userspace page.
963 */
964 if (kmemcheck_active(regs))
965 kmemcheck_hide(regs);
966
959 if (unlikely(kmmio_fault(regs, address))) 967 if (unlikely(kmmio_fault(regs, address)))
960 return; 968 return;
961 969
@@ -973,9 +981,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
973 * protection error (error_code & 9) == 0. 981 * protection error (error_code & 9) == 0.
974 */ 982 */
975 if (unlikely(fault_in_kernel_space(address))) { 983 if (unlikely(fault_in_kernel_space(address))) {
976 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 984 if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
977 vmalloc_fault(address) >= 0) 985 if (vmalloc_fault(address) >= 0)
978 return; 986 return;
987
988 if (kmemcheck_fault(regs, address, error_code))
989 return;
990 }
979 991
980 /* Can handle a stale RO->RW TLB: */ 992 /* Can handle a stale RO->RW TLB: */
981 if (spurious_fault(error_code, address)) 993 if (spurious_fault(error_code, address))
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 34c1bfb64f1c..f53b57e4086f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -213,7 +213,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
213 if (!after_bootmem) 213 if (!after_bootmem)
214 init_gbpages(); 214 init_gbpages();
215 215
216#ifdef CONFIG_DEBUG_PAGEALLOC 216#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
217 /* 217 /*
218 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 218 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
219 * This will simplify cpa(), which otherwise needs to support splitting 219 * This will simplify cpa(), which otherwise needs to support splitting
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9ff3c0816d15..3cd7711bb949 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -111,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
111 pte_t *page_table = NULL; 111 pte_t *page_table = NULL;
112 112
113 if (after_bootmem) { 113 if (after_bootmem) {
114#ifdef CONFIG_DEBUG_PAGEALLOC 114#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
115 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 115 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
116#endif 116#endif
117 if (!page_table) 117 if (!page_table)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 52bb9519bb86..9c543290a813 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -104,7 +104,7 @@ static __ref void *spp_getpage(void)
104 void *ptr; 104 void *ptr;
105 105
106 if (after_bootmem) 106 if (after_bootmem)
107 ptr = (void *) get_zeroed_page(GFP_ATOMIC); 107 ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
108 else 108 else
109 ptr = alloc_bootmem_pages(PAGE_SIZE); 109 ptr = alloc_bootmem_pages(PAGE_SIZE);
110 110
@@ -281,7 +281,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
281 void *adr; 281 void *adr;
282 282
283 if (after_bootmem) { 283 if (after_bootmem) {
284 adr = (void *)get_zeroed_page(GFP_ATOMIC); 284 adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
285 *phys = __pa(adr); 285 *phys = __pa(adr);
286 286
287 return adr; 287 return adr;
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
new file mode 100644
index 000000000000..520b3bce4095
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/Makefile
@@ -0,0 +1 @@
obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
new file mode 100644
index 000000000000..4901d0dafda6
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -0,0 +1,228 @@
1#include <linux/interrupt.h>
2#include <linux/kdebug.h>
3#include <linux/kmemcheck.h>
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/ptrace.h>
7#include <linux/stacktrace.h>
8#include <linux/string.h>
9
10#include "error.h"
11#include "shadow.h"
12
13enum kmemcheck_error_type {
14 KMEMCHECK_ERROR_INVALID_ACCESS,
15 KMEMCHECK_ERROR_BUG,
16};
17
18#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
19
20struct kmemcheck_error {
21 enum kmemcheck_error_type type;
22
23 union {
24 /* KMEMCHECK_ERROR_INVALID_ACCESS */
25 struct {
26 /* Kind of access that caused the error */
27 enum kmemcheck_shadow state;
28 /* Address and size of the erroneous read */
29 unsigned long address;
30 unsigned int size;
31 };
32 };
33
34 struct pt_regs regs;
35 struct stack_trace trace;
36 unsigned long trace_entries[32];
37
38 /* We compress it to a char. */
39 unsigned char shadow_copy[SHADOW_COPY_SIZE];
40 unsigned char memory_copy[SHADOW_COPY_SIZE];
41};
42
43/*
44 * Create a ring queue of errors to output. We can't call printk() directly
45 * from the kmemcheck traps, since this may call the console drivers and
46 * result in a recursive fault.
47 */
48static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
49static unsigned int error_count;
50static unsigned int error_rd;
51static unsigned int error_wr;
52static unsigned int error_missed_count;
53
54static struct kmemcheck_error *error_next_wr(void)
55{
56 struct kmemcheck_error *e;
57
58 if (error_count == ARRAY_SIZE(error_fifo)) {
59 ++error_missed_count;
60 return NULL;
61 }
62
63 e = &error_fifo[error_wr];
64 if (++error_wr == ARRAY_SIZE(error_fifo))
65 error_wr = 0;
66 ++error_count;
67 return e;
68}
69
70static struct kmemcheck_error *error_next_rd(void)
71{
72 struct kmemcheck_error *e;
73
74 if (error_count == 0)
75 return NULL;
76
77 e = &error_fifo[error_rd];
78 if (++error_rd == ARRAY_SIZE(error_fifo))
79 error_rd = 0;
80 --error_count;
81 return e;
82}
83
84void kmemcheck_error_recall(void)
85{
86 static const char *desc[] = {
87 [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
88 [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
89 [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
90 [KMEMCHECK_SHADOW_FREED] = "freed",
91 };
92
93 static const char short_desc[] = {
94 [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
95 [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
96 [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
97 [KMEMCHECK_SHADOW_FREED] = 'f',
98 };
99
100 struct kmemcheck_error *e;
101 unsigned int i;
102
103 e = error_next_rd();
104 if (!e)
105 return;
106
107 switch (e->type) {
108 case KMEMCHECK_ERROR_INVALID_ACCESS:
109 printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read "
110 "from %s memory (%p)\n",
111 8 * e->size, e->state < ARRAY_SIZE(desc) ?
112 desc[e->state] : "(invalid shadow state)",
113 (void *) e->address);
114
115 printk(KERN_INFO);
116 for (i = 0; i < SHADOW_COPY_SIZE; ++i)
117 printk("%02x", e->memory_copy[i]);
118 printk("\n");
119
120 printk(KERN_INFO);
121 for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
122 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
123 printk(" %c", short_desc[e->shadow_copy[i]]);
124 else
125 printk(" ?");
126 }
127 printk("\n");
128 printk(KERN_INFO "%*c\n", 2 + 2
129 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
130 break;
131 case KMEMCHECK_ERROR_BUG:
132 printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
133 break;
134 }
135
136 __show_regs(&e->regs, 1);
137 print_stack_trace(&e->trace, 0);
138}
139
140static void do_wakeup(unsigned long data)
141{
142 while (error_count > 0)
143 kmemcheck_error_recall();
144
145 if (error_missed_count > 0) {
146 printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
147 "the queue was too small\n", error_missed_count);
148 error_missed_count = 0;
149 }
150}
151
152static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
153
154/*
155 * Save the context of an error report.
156 */
157void kmemcheck_error_save(enum kmemcheck_shadow state,
158 unsigned long address, unsigned int size, struct pt_regs *regs)
159{
160 static unsigned long prev_ip;
161
162 struct kmemcheck_error *e;
163 void *shadow_copy;
164 void *memory_copy;
165
166 /* Don't report several adjacent errors from the same EIP. */
167 if (regs->ip == prev_ip)
168 return;
169 prev_ip = regs->ip;
170
171 e = error_next_wr();
172 if (!e)
173 return;
174
175 e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
176
177 e->state = state;
178 e->address = address;
179 e->size = size;
180
181 /* Save regs */
182 memcpy(&e->regs, regs, sizeof(*regs));
183
184 /* Save stack trace */
185 e->trace.nr_entries = 0;
186 e->trace.entries = e->trace_entries;
187 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
188 e->trace.skip = 0;
189 save_stack_trace_bp(&e->trace, regs->bp);
190
191 /* Round address down to nearest 16 bytes */
192 shadow_copy = kmemcheck_shadow_lookup(address
193 & ~(SHADOW_COPY_SIZE - 1));
194 BUG_ON(!shadow_copy);
195
196 memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
197
198 kmemcheck_show_addr(address);
199 memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
200 memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
201 kmemcheck_hide_addr(address);
202
203 tasklet_hi_schedule_first(&kmemcheck_tasklet);
204}
205
206/*
207 * Save the context of a kmemcheck bug.
208 */
209void kmemcheck_error_save_bug(struct pt_regs *regs)
210{
211 struct kmemcheck_error *e;
212
213 e = error_next_wr();
214 if (!e)
215 return;
216
217 e->type = KMEMCHECK_ERROR_BUG;
218
219 memcpy(&e->regs, regs, sizeof(*regs));
220
221 e->trace.nr_entries = 0;
222 e->trace.entries = e->trace_entries;
223 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
224 e->trace.skip = 1;
225 save_stack_trace(&e->trace);
226
227 tasklet_hi_schedule_first(&kmemcheck_tasklet);
228}
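
(Illustration, not part of the patch: given the printk format strings in kmemcheck_error_recall() above, a queued report comes out roughly in this shape once the tasklet runs. The address, hex bytes and shadow states below are invented, and the copy window is assumed to be 16 bytes, i.e. CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT=4:)

        WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (f68ef8a8)
        01000000c0de00006b6b6b6b6b6b6b6b
         i i i i i i i i u u u u u u u u
                         ^

followed by the register dump from __show_regs() and the call trace from print_stack_trace().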
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
new file mode 100644
index 000000000000..0efc2e8d0a20
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -0,0 +1,15 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
2#define ARCH__X86__MM__KMEMCHECK__ERROR_H
3
4#include <linux/ptrace.h>
5
6#include "shadow.h"
7
8void kmemcheck_error_save(enum kmemcheck_shadow state,
9 unsigned long address, unsigned int size, struct pt_regs *regs);
10
11void kmemcheck_error_save_bug(struct pt_regs *regs);
12
13void kmemcheck_error_recall(void);
14
15#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
new file mode 100644
index 000000000000..2c55ed098654
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -0,0 +1,640 @@
1/**
2 * kmemcheck - a heavyweight memory checker for the linux kernel
3 * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
4 * (With a lot of help from Ingo Molnar and Pekka Enberg.)
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2) as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/init.h>
12#include <linux/interrupt.h>
13#include <linux/kallsyms.h>
14#include <linux/kernel.h>
15#include <linux/kmemcheck.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <linux/page-flags.h>
19#include <linux/percpu.h>
20#include <linux/ptrace.h>
21#include <linux/string.h>
22#include <linux/types.h>
23
24#include <asm/cacheflush.h>
25#include <asm/kmemcheck.h>
26#include <asm/pgtable.h>
27#include <asm/tlbflush.h>
28
29#include "error.h"
30#include "opcode.h"
31#include "pte.h"
32#include "selftest.h"
33#include "shadow.h"
34
35
36#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
37# define KMEMCHECK_ENABLED 0
38#endif
39
40#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
41# define KMEMCHECK_ENABLED 1
42#endif
43
44#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
45# define KMEMCHECK_ENABLED 2
46#endif
47
48int kmemcheck_enabled = KMEMCHECK_ENABLED;
49
50int __init kmemcheck_init(void)
51{
52#ifdef CONFIG_SMP
53 /*
54 * Limit SMP to use a single CPU. We rely on the fact that this code
55 * runs before SMP is set up.
56 */
57 if (setup_max_cpus > 1) {
58 printk(KERN_INFO
59 "kmemcheck: Limiting number of CPUs to 1.\n");
60 setup_max_cpus = 1;
61 }
62#endif
63
64 if (!kmemcheck_selftest()) {
65 printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
66 kmemcheck_enabled = 0;
67 return -EINVAL;
68 }
69
70 printk(KERN_INFO "kmemcheck: Initialized\n");
71 return 0;
72}
73
74early_initcall(kmemcheck_init);
75
76/*
77 * We need to parse the kmemcheck= option before any memory is allocated.
78 */
79static int __init param_kmemcheck(char *str)
80{
81 if (!str)
82 return -EINVAL;
83
84 sscanf(str, "%d", &kmemcheck_enabled);
85 return 0;
86}
87
88early_param("kmemcheck", param_kmemcheck);
89
90int kmemcheck_show_addr(unsigned long address)
91{
92 pte_t *pte;
93
94 pte = kmemcheck_pte_lookup(address);
95 if (!pte)
96 return 0;
97
98 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
99 __flush_tlb_one(address);
100 return 1;
101}
102
103int kmemcheck_hide_addr(unsigned long address)
104{
105 pte_t *pte;
106
107 pte = kmemcheck_pte_lookup(address);
108 if (!pte)
109 return 0;
110
111 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
112 __flush_tlb_one(address);
113 return 1;
114}
115
116struct kmemcheck_context {
117 bool busy;
118 int balance;
119
120 /*
121 * There can be at most two memory operands to an instruction, but
122 * each address can cross a page boundary -- so we may need up to
123 * four addresses that must be hidden/revealed for each fault.
124 */
125 unsigned long addr[4];
126 unsigned long n_addrs;
127 unsigned long flags;
128
129 /* Data size of the instruction that caused a fault. */
130 unsigned int size;
131};
132
133static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
134
135bool kmemcheck_active(struct pt_regs *regs)
136{
137 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
138
139 return data->balance > 0;
140}
141
142/* Save an address that needs to be shown/hidden */
143static void kmemcheck_save_addr(unsigned long addr)
144{
145 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
146
147 BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
148 data->addr[data->n_addrs++] = addr;
149}
150
151static unsigned int kmemcheck_show_all(void)
152{
153 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
154 unsigned int i;
155 unsigned int n;
156
157 n = 0;
158 for (i = 0; i < data->n_addrs; ++i)
159 n += kmemcheck_show_addr(data->addr[i]);
160
161 return n;
162}
163
164static unsigned int kmemcheck_hide_all(void)
165{
166 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
167 unsigned int i;
168 unsigned int n;
169
170 n = 0;
171 for (i = 0; i < data->n_addrs; ++i)
172 n += kmemcheck_hide_addr(data->addr[i]);
173
174 return n;
175}
176
177/*
178 * Called from the #PF handler.
179 */
180void kmemcheck_show(struct pt_regs *regs)
181{
182 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
183
184 BUG_ON(!irqs_disabled());
185
186 if (unlikely(data->balance != 0)) {
187 kmemcheck_show_all();
188 kmemcheck_error_save_bug(regs);
189 data->balance = 0;
190 return;
191 }
192
193 /*
194 * None of the addresses actually belonged to kmemcheck. Note that
195 * this is not an error.
196 */
197 if (kmemcheck_show_all() == 0)
198 return;
199
200 ++data->balance;
201
202 /*
203 * The IF needs to be cleared as well, so that the faulting
204 * instruction can run "uninterrupted". Otherwise, we might take
205 * an interrupt and start executing that before we've had a chance
206 * to hide the page again.
207 *
208 * NOTE: In the rare case of multiple faults, we must not override
209 * the original flags:
210 */
211 if (!(regs->flags & X86_EFLAGS_TF))
212 data->flags = regs->flags;
213
214 regs->flags |= X86_EFLAGS_TF;
215 regs->flags &= ~X86_EFLAGS_IF;
216}
217
218/*
219 * Called from the #DB handler.
220 */
221void kmemcheck_hide(struct pt_regs *regs)
222{
223 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
224 int n;
225
226 BUG_ON(!irqs_disabled());
227
228 if (data->balance == 0)
229 return;
230
231 if (unlikely(data->balance != 1)) {
232 kmemcheck_show_all();
233 kmemcheck_error_save_bug(regs);
234 data->n_addrs = 0;
235 data->balance = 0;
236
237 if (!(data->flags & X86_EFLAGS_TF))
238 regs->flags &= ~X86_EFLAGS_TF;
239 if (data->flags & X86_EFLAGS_IF)
240 regs->flags |= X86_EFLAGS_IF;
241 return;
242 }
243
244 if (kmemcheck_enabled)
245 n = kmemcheck_hide_all();
246 else
247 n = kmemcheck_show_all();
248
249 if (n == 0)
250 return;
251
252 --data->balance;
253
254 data->n_addrs = 0;
255
256 if (!(data->flags & X86_EFLAGS_TF))
257 regs->flags &= ~X86_EFLAGS_TF;
258 if (data->flags & X86_EFLAGS_IF)
259 regs->flags |= X86_EFLAGS_IF;
260}
261
262void kmemcheck_show_pages(struct page *p, unsigned int n)
263{
264 unsigned int i;
265
266 for (i = 0; i < n; ++i) {
267 unsigned long address;
268 pte_t *pte;
269 unsigned int level;
270
271 address = (unsigned long) page_address(&p[i]);
272 pte = lookup_address(address, &level);
273 BUG_ON(!pte);
274 BUG_ON(level != PG_LEVEL_4K);
275
276 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
277 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
278 __flush_tlb_one(address);
279 }
280}
281
282bool kmemcheck_page_is_tracked(struct page *p)
283{
284 /* This will also check the "hidden" flag of the PTE. */
285 return kmemcheck_pte_lookup((unsigned long) page_address(p));
286}
287
288void kmemcheck_hide_pages(struct page *p, unsigned int n)
289{
290 unsigned int i;
291
292 for (i = 0; i < n; ++i) {
293 unsigned long address;
294 pte_t *pte;
295 unsigned int level;
296
297 address = (unsigned long) page_address(&p[i]);
298 pte = lookup_address(address, &level);
299 BUG_ON(!pte);
300 BUG_ON(level != PG_LEVEL_4K);
301
302 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
303 set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
304 __flush_tlb_one(address);
305 }
306}
307
308/* Access may NOT cross page boundary */
309static void kmemcheck_read_strict(struct pt_regs *regs,
310 unsigned long addr, unsigned int size)
311{
312 void *shadow;
313 enum kmemcheck_shadow status;
314
315 shadow = kmemcheck_shadow_lookup(addr);
316 if (!shadow)
317 return;
318
319 kmemcheck_save_addr(addr);
320 status = kmemcheck_shadow_test(shadow, size);
321 if (status == KMEMCHECK_SHADOW_INITIALIZED)
322 return;
323
324 if (kmemcheck_enabled)
325 kmemcheck_error_save(status, addr, size, regs);
326
327 if (kmemcheck_enabled == 2)
328 kmemcheck_enabled = 0;
329
330 /* Don't warn about it again. */
331 kmemcheck_shadow_set(shadow, size);
332}
333
334/* Access may cross page boundary */
335static void kmemcheck_read(struct pt_regs *regs,
336 unsigned long addr, unsigned int size)
337{
338 unsigned long page = addr & PAGE_MASK;
339 unsigned long next_addr = addr + size - 1;
340 unsigned long next_page = next_addr & PAGE_MASK;
341
342 if (likely(page == next_page)) {
343 kmemcheck_read_strict(regs, addr, size);
344 return;
345 }
346
347 /*
348 * What we do is basically to split the access across the
349 * two pages and handle each part separately. Yes, this means
350 * that we may now see reads that are 3 + 5 bytes, for
351 * example (and if both are uninitialized, there will be two
352 * reports), but it makes the code a lot simpler.
353 */
354 kmemcheck_read_strict(regs, addr, next_page - addr);
355 kmemcheck_read_strict(regs, next_page, next_addr - next_page);
356}
357
358static void kmemcheck_write_strict(struct pt_regs *regs,
359 unsigned long addr, unsigned int size)
360{
361 void *shadow;
362
363 shadow = kmemcheck_shadow_lookup(addr);
364 if (!shadow)
365 return;
366
367 kmemcheck_save_addr(addr);
368 kmemcheck_shadow_set(shadow, size);
369}
370
371static void kmemcheck_write(struct pt_regs *regs,
372 unsigned long addr, unsigned int size)
373{
374 unsigned long page = addr & PAGE_MASK;
375 unsigned long next_addr = addr + size - 1;
376 unsigned long next_page = next_addr & PAGE_MASK;
377
378 if (likely(page == next_page)) {
379 kmemcheck_write_strict(regs, addr, size);
380 return;
381 }
382
383 /* See comment in kmemcheck_read(). */
384 kmemcheck_write_strict(regs, addr, next_page - addr);
385 kmemcheck_write_strict(regs, next_page, next_addr - next_page);
386}
387
388/*
389 * Copying is hard. We have two addresses, each of which may be split across
390 * a page (and each page will have different shadow addresses).
391 */
392static void kmemcheck_copy(struct pt_regs *regs,
393 unsigned long src_addr, unsigned long dst_addr, unsigned int size)
394{
395 uint8_t shadow[8];
396 enum kmemcheck_shadow status;
397
398 unsigned long page;
399 unsigned long next_addr;
400 unsigned long next_page;
401
402 uint8_t *x;
403 unsigned int i;
404 unsigned int n;
405
406 BUG_ON(size > sizeof(shadow));
407
408 page = src_addr & PAGE_MASK;
409 next_addr = src_addr + size - 1;
410 next_page = next_addr & PAGE_MASK;
411
412 if (likely(page == next_page)) {
413 /* Same page */
414 x = kmemcheck_shadow_lookup(src_addr);
415 if (x) {
416 kmemcheck_save_addr(src_addr);
417 for (i = 0; i < size; ++i)
418 shadow[i] = x[i];
419 } else {
420 for (i = 0; i < size; ++i)
421 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
422 }
423 } else {
424 n = next_page - src_addr;
425 BUG_ON(n > sizeof(shadow));
426
427 /* First page */
428 x = kmemcheck_shadow_lookup(src_addr);
429 if (x) {
430 kmemcheck_save_addr(src_addr);
431 for (i = 0; i < n; ++i)
432 shadow[i] = x[i];
433 } else {
434 /* Not tracked */
435 for (i = 0; i < n; ++i)
436 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
437 }
438
439 /* Second page */
440 x = kmemcheck_shadow_lookup(next_page);
441 if (x) {
442 kmemcheck_save_addr(next_page);
443 for (i = n; i < size; ++i)
444 shadow[i] = x[i - n];
445 } else {
446 /* Not tracked */
447 for (i = n; i < size; ++i)
448 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
449 }
450 }
451
452 page = dst_addr & PAGE_MASK;
453 next_addr = dst_addr + size - 1;
454 next_page = next_addr & PAGE_MASK;
455
456 if (likely(page == next_page)) {
457 /* Same page */
458 x = kmemcheck_shadow_lookup(dst_addr);
459 if (x) {
460 kmemcheck_save_addr(dst_addr);
461 for (i = 0; i < size; ++i) {
462 x[i] = shadow[i];
463 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
464 }
465 }
466 } else {
467 n = next_page - dst_addr;
468 BUG_ON(n > sizeof(shadow));
469
470 /* First page */
471 x = kmemcheck_shadow_lookup(dst_addr);
472 if (x) {
473 kmemcheck_save_addr(dst_addr);
474 for (i = 0; i < n; ++i) {
475 x[i] = shadow[i];
476 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
477 }
478 }
479
480 /* Second page */
481 x = kmemcheck_shadow_lookup(next_page);
482 if (x) {
483 kmemcheck_save_addr(next_page);
484 for (i = n; i < size; ++i) {
485 x[i - n] = shadow[i];
486 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
487 }
488 }
489 }
490
491 status = kmemcheck_shadow_test(shadow, size);
492 if (status == KMEMCHECK_SHADOW_INITIALIZED)
493 return;
494
495 if (kmemcheck_enabled)
496 kmemcheck_error_save(status, src_addr, size, regs);
497
498 if (kmemcheck_enabled == 2)
499 kmemcheck_enabled = 0;
500}
501
502enum kmemcheck_method {
503 KMEMCHECK_READ,
504 KMEMCHECK_WRITE,
505};
506
507static void kmemcheck_access(struct pt_regs *regs,
508 unsigned long fallback_address, enum kmemcheck_method fallback_method)
509{
510 const uint8_t *insn;
511 const uint8_t *insn_primary;
512 unsigned int size;
513
514 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
515
516 /* Recursive fault -- ouch. */
517 if (data->busy) {
518 kmemcheck_show_addr(fallback_address);
519 kmemcheck_error_save_bug(regs);
520 return;
521 }
522
523 data->busy = true;
524
525 insn = (const uint8_t *) regs->ip;
526 insn_primary = kmemcheck_opcode_get_primary(insn);
527
528 kmemcheck_opcode_decode(insn, &size);
529
530 switch (insn_primary[0]) {
531#ifdef CONFIG_KMEMCHECK_BITOPS_OK
532 /* AND, OR, XOR */
533 /*
534 * Unfortunately, these instructions have to be excluded from
535 * our regular checking since they access only some (and not
536 * all) bits. This clears out "bogus" bitfield-access warnings.
537 */
538 case 0x80:
539 case 0x81:
540 case 0x82:
541 case 0x83:
542 switch ((insn_primary[1] >> 3) & 7) {
543 /* OR */
544 case 1:
545 /* AND */
546 case 4:
547 /* XOR */
548 case 6:
549 kmemcheck_write(regs, fallback_address, size);
550 goto out;
551
552 /* ADD */
553 case 0:
554 /* ADC */
555 case 2:
556 /* SBB */
557 case 3:
558 /* SUB */
559 case 5:
560 /* CMP */
561 case 7:
562 break;
563 }
564 break;
565#endif
566
567 /* MOVS, MOVSB, MOVSW, MOVSD */
568 case 0xa4:
569 case 0xa5:
570 /*
571 * These instructions are special because they take two
572 * addresses, but we only get one page fault.
573 */
574 kmemcheck_copy(regs, regs->si, regs->di, size);
575 goto out;
576
577 /* CMPS, CMPSB, CMPSW, CMPSD */
578 case 0xa6:
579 case 0xa7:
580 kmemcheck_read(regs, regs->si, size);
581 kmemcheck_read(regs, regs->di, size);
582 goto out;
583 }
584
585 /*
586 * If the opcode isn't special in any way, we use the data from the
587 * page fault handler to determine the address and type of memory
588 * access.
589 */
590 switch (fallback_method) {
591 case KMEMCHECK_READ:
592 kmemcheck_read(regs, fallback_address, size);
593 goto out;
594 case KMEMCHECK_WRITE:
595 kmemcheck_write(regs, fallback_address, size);
596 goto out;
597 }
598
599out:
600 data->busy = false;
601}
602
603bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
604 unsigned long error_code)
605{
606 pte_t *pte;
607
608 /*
609 * XXX: Is it safe to assume that memory accesses from virtual 86
610 * mode or non-kernel code segments will _never_ access kernel
611 * memory (e.g. tracked pages)? For now, we need this to avoid
612 * invoking kmemcheck for PnP BIOS calls.
613 */
614 if (regs->flags & X86_VM_MASK)
615 return false;
616 if (regs->cs != __KERNEL_CS)
617 return false;
618
619 pte = kmemcheck_pte_lookup(address);
620 if (!pte)
621 return false;
622
623 if (error_code & 2)
624 kmemcheck_access(regs, address, KMEMCHECK_WRITE);
625 else
626 kmemcheck_access(regs, address, KMEMCHECK_READ);
627
628 kmemcheck_show(regs);
629 return true;
630}
631
632bool kmemcheck_trap(struct pt_regs *regs)
633{
634 if (!kmemcheck_active(regs))
635 return false;
636
637 /* We're done. */
638 kmemcheck_hide(regs);
639 return true;
640}
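
(Illustration, not part of the patch: the #PF/#DB sequence above is what turns an ordinary read of uninitialized slab memory into a report. A minimal sketch of code that would trip it; the struct and function names are invented:)

        #include <linux/slab.h>

        struct foo {
                int a;
                int b;
        };

        static int foo_example(void)
        {
                struct foo *f;
                int ret;

                f = kmalloc(sizeof(*f), GFP_KERNEL);
                if (!f)
                        return -ENOMEM;

                f->a = 1;
                /*
                 * f->b was never written: this load lands on a tracked,
                 * non-present page, enters kmemcheck_fault() above, and is
                 * reported as a 32-bit read from uninitialized memory.
                 */
                ret = f->b;

                kfree(f);
                return ret;
        }

Per the KMEMCHECK_ENABLED defines and param_kmemcheck() above, booting with kmemcheck=0 disables checking, kmemcheck=1 enables it, and kmemcheck=2 selects one-shot mode, where kmemcheck_enabled is cleared again after the first report.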
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
new file mode 100644
index 000000000000..63c19e27aa6f
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -0,0 +1,106 @@
1#include <linux/types.h>
2
3#include "opcode.h"
4
5static bool opcode_is_prefix(uint8_t b)
6{
7 return
8 /* Group 1 */
9 b == 0xf0 || b == 0xf2 || b == 0xf3
10 /* Group 2 */
11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12 || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
13 /* Group 3 */
14 || b == 0x66
15 /* Group 4 */
16 || b == 0x67;
17}
18
19#ifdef CONFIG_X86_64
20static bool opcode_is_rex_prefix(uint8_t b)
21{
22 return (b & 0xf0) == 0x40;
23}
24#else
25static bool opcode_is_rex_prefix(uint8_t b)
26{
27 return false;
28}
29#endif
30
31#define REX_W (1 << 3)
32
33/*
34 * This is a VERY crude opcode decoder. We only need to find the size of the
35 * load/store that caused our #PF and this should work for all the opcodes
36 * that we care about. Moreover, the ones who invented this instruction set
37 * should be shot.
38 */
39void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
40{
41 /* Default operand size */
42 int operand_size_override = 4;
43
44 /* prefixes */
45 for (; opcode_is_prefix(*op); ++op) {
46 if (*op == 0x66)
47 operand_size_override = 2;
48 }
49
50 /* REX prefix */
51 if (opcode_is_rex_prefix(*op)) {
52 uint8_t rex = *op;
53
54 ++op;
55 if (rex & REX_W) {
56 switch (*op) {
57 case 0x63:
58 *size = 4;
59 return;
60 case 0x0f:
61 ++op;
62
63 switch (*op) {
64 case 0xb6:
65 case 0xbe:
66 *size = 1;
67 return;
68 case 0xb7:
69 case 0xbf:
70 *size = 2;
71 return;
72 }
73
74 break;
75 }
76
77 *size = 8;
78 return;
79 }
80 }
81
82 /* escape opcode */
83 if (*op == 0x0f) {
84 ++op;
85
86 /*
87 * This is move with zero-extend and sign-extend, respectively;
88 * we don't have to think about 0xb6/0xbe, because this is
89 * already handled in the conditional below.
90 */
91 if (*op == 0xb7 || *op == 0xbf)
92 operand_size_override = 2;
93 }
94
95 *size = (*op & 1) ? operand_size_override : 1;
96}
97
98const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
99{
100 /* skip prefixes */
101 while (opcode_is_prefix(*op))
102 ++op;
103 if (opcode_is_rex_prefix(*op))
104 ++op;
105 return op;
106}
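
(Illustration, not part of the patch: a worked trace of the decoder on one assumed instruction, mirroring what selftest.c does at boot:)

        #include <linux/kernel.h>
        #include <linux/types.h>

        #include "opcode.h"

        static void decode_example(void)
        {
                /* 66 89 41 f8 == mov %ax, -0x8(%ecx): a 16-bit store */
                static const uint8_t insn[] = { 0x66, 0x89, 0x41, 0xf8 };
                unsigned int size;

                kmemcheck_opcode_decode(insn, &size);
                /*
                 * 0x66 is an operand-size prefix, so operand_size_override
                 * becomes 2; 0x89 is neither a REX prefix nor 0x0f and has
                 * bit 0 set, so size ends up as 2.
                 */
                BUG_ON(size != 2);
        }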
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
new file mode 100644
index 000000000000..6956aad66b5b
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -0,0 +1,9 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
2#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
3
4#include <linux/types.h>
5
6void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
7const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
8
9#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
new file mode 100644
index 000000000000..4ead26eeaf96
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -0,0 +1,22 @@
1#include <linux/mm.h>
2
3#include <asm/pgtable.h>
4
5#include "pte.h"
6
7pte_t *kmemcheck_pte_lookup(unsigned long address)
8{
9 pte_t *pte;
10 unsigned int level;
11
12 pte = lookup_address(address, &level);
13 if (!pte)
14 return NULL;
15 if (level != PG_LEVEL_4K)
16 return NULL;
17 if (!pte_hidden(*pte))
18 return NULL;
19
20 return pte;
21}
22
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
new file mode 100644
index 000000000000..9f5966456492
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -0,0 +1,10 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
2#define ARCH__X86__MM__KMEMCHECK__PTE_H
3
4#include <linux/mm.h>
5
6#include <asm/pgtable.h>
7
8pte_t *kmemcheck_pte_lookup(unsigned long address);
9
10#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
new file mode 100644
index 000000000000..036efbea8b28
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -0,0 +1,69 @@
1#include <linux/kernel.h>
2
3#include "opcode.h"
4#include "selftest.h"
5
6struct selftest_opcode {
7 unsigned int expected_size;
8 const uint8_t *insn;
9 const char *desc;
10};
11
12static const struct selftest_opcode selftest_opcodes[] = {
13 /* REP MOVS */
14 {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
15 {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
16
17 /* MOVZX / MOVZXD */
18 {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
19 {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
20
21 /* MOVSX / MOVSXD */
22 {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
23 {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
24
25#ifdef CONFIG_X86_64
26 /* MOVZX / MOVZXD */
27 {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
28 {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
29
30 /* MOVSX / MOVSXD */
31 {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
32 {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
33 {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
34#endif
35};
36
37static bool selftest_opcode_one(const struct selftest_opcode *op)
38{
39 unsigned size;
40
41 kmemcheck_opcode_decode(op->insn, &size);
42
43 if (size == op->expected_size)
44 return true;
45
46 printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
47 op->desc, op->expected_size, size);
48 return false;
49}
50
51static bool selftest_opcodes_all(void)
52{
53 bool pass = true;
54 unsigned int i;
55
56 for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
57 pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
58
59 return pass;
60}
61
62bool kmemcheck_selftest(void)
63{
64 bool pass = true;
65
66 pass = pass && selftest_opcodes_all();
67
68 return pass;
69}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
new file mode 100644
index 000000000000..8fed4fe11f95
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -0,0 +1,6 @@
1#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
2#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
3
4bool kmemcheck_selftest(void);
5
6#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
new file mode 100644
index 000000000000..e773b6bd0079
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -0,0 +1,162 @@
1#include <linux/kmemcheck.h>
2#include <linux/module.h>
3#include <linux/mm.h>
4#include <linux/module.h>
5
6#include <asm/page.h>
7#include <asm/pgtable.h>
8
9#include "pte.h"
10#include "shadow.h"
11
12/*
13 * Return the shadow address for the given address. Returns NULL if the
14 * address is not tracked.
15 *
16 * We need to be extremely careful not to follow any invalid pointers,
17 * because this function can be called for *any* possible address.
18 */
19void *kmemcheck_shadow_lookup(unsigned long address)
20{
21 pte_t *pte;
22 struct page *page;
23
24 if (!virt_addr_valid(address))
25 return NULL;
26
27 pte = kmemcheck_pte_lookup(address);
28 if (!pte)
29 return NULL;
30
31 page = virt_to_page(address);
32 if (!page->shadow)
33 return NULL;
34 return page->shadow + (address & (PAGE_SIZE - 1));
35}
36
37static void mark_shadow(void *address, unsigned int n,
38 enum kmemcheck_shadow status)
39{
40 unsigned long addr = (unsigned long) address;
41 unsigned long last_addr = addr + n - 1;
42 unsigned long page = addr & PAGE_MASK;
43 unsigned long last_page = last_addr & PAGE_MASK;
44 unsigned int first_n;
45 void *shadow;
46
47 /* If the memory range crosses a page boundary, stop there. */
48 if (page == last_page)
49 first_n = n;
50 else
51 first_n = page + PAGE_SIZE - addr;
52
53 shadow = kmemcheck_shadow_lookup(addr);
54 if (shadow)
55 memset(shadow, status, first_n);
56
57 addr += first_n;
58 n -= first_n;
59
60 /* Do full-page memset()s. */
61 while (n >= PAGE_SIZE) {
62 shadow = kmemcheck_shadow_lookup(addr);
63 if (shadow)
64 memset(shadow, status, PAGE_SIZE);
65
66 addr += PAGE_SIZE;
67 n -= PAGE_SIZE;
68 }
69
70 /* Do the remaining page, if any. */
71 if (n > 0) {
72 shadow = kmemcheck_shadow_lookup(addr);
73 if (shadow)
74 memset(shadow, status, n);
75 }
76}
77
78void kmemcheck_mark_unallocated(void *address, unsigned int n)
79{
80 mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
81}
82
83void kmemcheck_mark_uninitialized(void *address, unsigned int n)
84{
85 mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
86}
87
88/*
89 * Fill the shadow memory of the given address such that the memory at that
90 * address is marked as being initialized.
91 */
92void kmemcheck_mark_initialized(void *address, unsigned int n)
93{
94 mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
95}
96EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
97
98void kmemcheck_mark_freed(void *address, unsigned int n)
99{
100 mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
101}
102
103void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
104{
105 unsigned int i;
106
107 for (i = 0; i < n; ++i)
108 kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
109}
110
111void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
112{
113 unsigned int i;
114
115 for (i = 0; i < n; ++i)
116 kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
117}
118
119void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
120{
121 unsigned int i;
122
123 for (i = 0; i < n; ++i)
124 kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
125}
126
127enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
128{
129 uint8_t *x;
130 unsigned int i;
131
132 x = shadow;
133
134#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
135 /*
136 * Make sure _some_ bytes are initialized. Gcc frequently generates
137 * code to access neighboring bytes.
138 */
139 for (i = 0; i < size; ++i) {
140 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
141 return x[i];
142 }
143#else
144 /* All bytes must be initialized. */
145 for (i = 0; i < size; ++i) {
146 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
147 return x[i];
148 }
149#endif
150
151 return x[0];
152}
153
154void kmemcheck_shadow_set(void *shadow, unsigned int size)
155{
156 uint8_t *x;
157 unsigned int i;
158
159 x = shadow;
160 for (i = 0; i < size; ++i)
161 x[i] = KMEMCHECK_SHADOW_INITIALIZED;
162}
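
(Illustration, not part of the patch: how the mark_* helpers above are meant to be used over an object's lifetime. The real calls live in the allocator hooks added elsewhere in this series -- see the declarations in include/linux/kmemcheck.h below -- not in shadow.c itself. "obj" is assumed to lie inside a page with a shadow block attached:)

        #include <linux/kmemcheck.h>

        static void shadow_lifecycle_sketch(void *obj, unsigned int size)
        {
                kmemcheck_mark_unallocated(obj, size);   /* page handed to the allocator  */
                kmemcheck_mark_uninitialized(obj, size); /* object given out by kmalloc() */
                /*
                 * An ordinary store to obj now flips the written bytes to
                 * INITIALIZED via kmemcheck_shadow_set() in the fault path;
                 * kmemcheck_mark_initialized() does the same thing eagerly.
                 */
                kmemcheck_mark_freed(obj, size);         /* freed: reads warn as "freed"  */
        }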
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
new file mode 100644
index 000000000000..af46d9ab9d86
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -0,0 +1,16 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
2#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
3
4enum kmemcheck_shadow {
5 KMEMCHECK_SHADOW_UNALLOCATED,
6 KMEMCHECK_SHADOW_UNINITIALIZED,
7 KMEMCHECK_SHADOW_INITIALIZED,
8 KMEMCHECK_SHADOW_FREED,
9};
10
11void *kmemcheck_shadow_lookup(unsigned long address);
12
13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
14void kmemcheck_shadow_set(void *shadow, unsigned int size);
15
16#endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6ce9518fe2ac..3cfe9ced8a4c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -470,7 +470,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
470 470
471 if (!debug_pagealloc) 471 if (!debug_pagealloc)
472 spin_unlock(&cpa_lock); 472 spin_unlock(&cpa_lock);
473 base = alloc_pages(GFP_KERNEL, 0); 473 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
474 if (!debug_pagealloc) 474 if (!debug_pagealloc)
475 spin_lock(&cpa_lock); 475 spin_lock(&cpa_lock);
476 if (!base) 476 if (!base)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7aa03a5389f5..8e43bdd45456 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,9 +4,11 @@
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h> 5#include <asm/fixmap.h>
6 6
7#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
8
7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 9pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
8{ 10{
9 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); 11 return (pte_t *)__get_free_page(PGALLOC_GFP);
10} 12}
11 13
12pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) 14pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -14,9 +16,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
14 struct page *pte; 16 struct page *pte;
15 17
16#ifdef CONFIG_HIGHPTE 18#ifdef CONFIG_HIGHPTE
17 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); 19 pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
18#else 20#else
19 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); 21 pte = alloc_pages(PGALLOC_GFP, 0);
20#endif 22#endif
21 if (pte) 23 if (pte)
22 pgtable_page_ctor(pte); 24 pgtable_page_ctor(pte);
@@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[])
161 bool failed = false; 163 bool failed = false;
162 164
163 for(i = 0; i < PREALLOCATED_PMDS; i++) { 165 for(i = 0; i < PREALLOCATED_PMDS; i++) {
164 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); 166 pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
165 if (pmd == NULL) 167 if (pmd == NULL)
166 failed = true; 168 failed = true;
167 pmds[i] = pmd; 169 pmds[i] = pmd;
@@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
228 pmd_t *pmds[PREALLOCATED_PMDS]; 230 pmd_t *pmds[PREALLOCATED_PMDS];
229 unsigned long flags; 231 unsigned long flags;
230 232
231 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 233 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
232 234
233 if (pgd == NULL) 235 if (pgd == NULL)
234 goto out; 236 goto out;
diff --git a/crypto/xor.c b/crypto/xor.c
index 996b6ee57d9e..fc5b836f3430 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -101,7 +101,12 @@ calibrate_xor_blocks(void)
101 void *b1, *b2; 101 void *b1, *b2;
102 struct xor_block_template *f, *fastest; 102 struct xor_block_template *f, *fastest;
103 103
104 b1 = (void *) __get_free_pages(GFP_KERNEL, 2); 104 /*
105 * Note: Since the memory is not actually used for _anything_ but to
106 * test the XOR speed, we don't really want kmemcheck to warn about
107 * reading uninitialized bytes here.
108 */
109 b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2);
105 if (!b1) { 110 if (!b1) {
106 printk(KERN_WARNING "xor: Yikes! No memory available.\n"); 111 printk(KERN_WARNING "xor: Yikes! No memory available.\n");
107 return -ENOMEM; 112 return -ENOMEM;
diff --git a/drivers/ieee1394/csr1212.c b/drivers/ieee1394/csr1212.c
index a6dfeb0b3372..e76cac64c533 100644
--- a/drivers/ieee1394/csr1212.c
+++ b/drivers/ieee1394/csr1212.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/errno.h> 36#include <linux/errno.h>
37#include <linux/kernel.h> 37#include <linux/kernel.h>
38#include <linux/kmemcheck.h>
38#include <linux/string.h> 39#include <linux/string.h>
39#include <asm/bug.h> 40#include <asm/bug.h>
40#include <asm/byteorder.h> 41#include <asm/byteorder.h>
@@ -387,6 +388,7 @@ csr1212_new_descriptor_leaf(u8 dtype, u32 specifier_id,
387 if (!kv) 388 if (!kv)
388 return NULL; 389 return NULL;
389 390
391 kmemcheck_annotate_variable(kv->value.leaf.data[0]);
390 CSR1212_DESCRIPTOR_LEAF_SET_TYPE(kv, dtype); 392 CSR1212_DESCRIPTOR_LEAF_SET_TYPE(kv, dtype);
391 CSR1212_DESCRIPTOR_LEAF_SET_SPECIFIER_ID(kv, specifier_id); 393 CSR1212_DESCRIPTOR_LEAF_SET_SPECIFIER_ID(kv, specifier_id);
392 394
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index a6d55bebe61a..5122b5a8aa2d 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/bitmap.h> 11#include <linux/bitmap.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/kmemcheck.h>
13#include <linux/list.h> 14#include <linux/list.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/delay.h> 16#include <linux/delay.h>
@@ -39,7 +40,10 @@ struct nodemgr_csr_info {
39 struct hpsb_host *host; 40 struct hpsb_host *host;
40 nodeid_t nodeid; 41 nodeid_t nodeid;
41 unsigned int generation; 42 unsigned int generation;
43
44 kmemcheck_bitfield_begin(flags);
42 unsigned int speed_unverified:1; 45 unsigned int speed_unverified:1;
46 kmemcheck_bitfield_end(flags);
43}; 47};
44 48
45 49
@@ -1293,6 +1297,7 @@ static void nodemgr_node_scan_one(struct hpsb_host *host,
1293 u8 *speed; 1297 u8 *speed;
1294 1298
1295 ci = kmalloc(sizeof(*ci), GFP_KERNEL); 1299 ci = kmalloc(sizeof(*ci), GFP_KERNEL);
1300 kmemcheck_annotate_bitfield(ci, flags);
1296 if (!ci) 1301 if (!ci)
1297 return; 1302 return;
1298 1303
diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c
index 0207dd59090d..b5346b4db91a 100644
--- a/drivers/misc/c2port/core.c
+++ b/drivers/misc/c2port/core.c
@@ -15,6 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/kmemcheck.h>
18#include <linux/ctype.h> 19#include <linux/ctype.h>
19#include <linux/delay.h> 20#include <linux/delay.h>
20#include <linux/idr.h> 21#include <linux/idr.h>
@@ -891,6 +892,7 @@ struct c2port_device *c2port_device_register(char *name,
891 return ERR_PTR(-EINVAL); 892 return ERR_PTR(-EINVAL);
892 893
893 c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL); 894 c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL);
895 kmemcheck_annotate_bitfield(c2dev, flags);
894 if (unlikely(!c2dev)) 896 if (unlikely(!c2dev))
895 return ERR_PTR(-ENOMEM); 897 return ERR_PTR(-ENOMEM);
896 898
diff --git a/include/linux/c2port.h b/include/linux/c2port.h
index 7b5a2388ba67..2a5cd867c365 100644
--- a/include/linux/c2port.h
+++ b/include/linux/c2port.h
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/device.h> 12#include <linux/device.h>
13#include <linux/kmemcheck.h>
13 14
14#define C2PORT_NAME_LEN 32 15#define C2PORT_NAME_LEN 32
15 16
@@ -20,8 +21,10 @@
20/* Main struct */ 21/* Main struct */
21struct c2port_ops; 22struct c2port_ops;
22struct c2port_device { 23struct c2port_device {
24 kmemcheck_bitfield_begin(flags);
23 unsigned int access:1; 25 unsigned int access:1;
24 unsigned int flash_access:1; 26 unsigned int flash_access:1;
27 kmemcheck_bitfield_end(flags);
25 28
26 int id; 29 int id;
27 char name[C2PORT_NAME_LEN]; 30 char name[C2PORT_NAME_LEN];
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ede84fa7da5d..6d12174fbe11 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1919,8 +1919,9 @@ extern void __init vfs_caches_init(unsigned long);
1919 1919
1920extern struct kmem_cache *names_cachep; 1920extern struct kmem_cache *names_cachep;
1921 1921
1922#define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) 1922#define __getname_gfp(gfp) kmem_cache_alloc(names_cachep, (gfp))
1923#define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) 1923#define __getname() __getname_gfp(GFP_KERNEL)
1924#define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
1924#ifndef CONFIG_AUDITSYSCALL 1925#ifndef CONFIG_AUDITSYSCALL
1925#define putname(name) __putname(name) 1926#define putname(name) __putname(name)
1926#else 1927#else
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3760e7c5de02..80e14b8c2e78 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -52,7 +52,19 @@ struct vm_area_struct;
52#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ 52#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
53#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ 53#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */
54 54
55#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ 55#ifdef CONFIG_KMEMCHECK
56#define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */
57#else
58#define __GFP_NOTRACK ((__force gfp_t)0)
59#endif
60
61/*
62 * This may seem redundant, but it's a way of annotating false positives vs.
63 * allocations that simply cannot be supported (e.g. page tables).
64 */
65#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
66
67#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */
56#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) 68#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
57 69
58/* This equals 0, but use constants in case they ever change */ 70/* This equals 0, but use constants in case they ever change */
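
(Illustration, not part of the patch: the comment above draws the distinction these two flags express. __GFP_NOTRACK is for allocations kmemcheck cannot support at all, as with PGALLOC_GFP in arch/x86/mm/pgtable.c earlier in this diff; __GFP_NOTRACK_FALSE_POSITIVE documents a known false positive, as in init/do_mounts.c below. Variable and function names here are assumed:)

        #include <linux/fs.h>
        #include <linux/gfp.h>

        static void notrack_sketch(void)
        {
                unsigned long page;
                char *name;

                /* Cannot be tracked at all: the page-table case from the
                 * comment above; pgtable.c folds this into PGALLOC_GFP. */
                page = __get_free_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

                /* Could be tracked, but is a known false positive: the
                 * pattern init/do_mounts.c uses below via __getname_gfp(). */
                name = __getname_gfp(GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE);

                if (name)
                        __putname(name);
                free_page(page);
        }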
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c41e812e9d5e..2721f07e9354 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -472,6 +472,20 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t)
472 __tasklet_hi_schedule(t); 472 __tasklet_hi_schedule(t);
473} 473}
474 474
475extern void __tasklet_hi_schedule_first(struct tasklet_struct *t);
476
477/*
478 * This version avoids touching any other tasklets. Needed for kmemcheck
479 * in order not to take any page faults while enqueueing this tasklet;
480 * consider VERY carefully whether you really need this or
481 * tasklet_hi_schedule()...
482 */
483static inline void tasklet_hi_schedule_first(struct tasklet_struct *t)
484{
485 if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
486 __tasklet_hi_schedule_first(t);
487}
488
475 489
476static inline void tasklet_disable_nosync(struct tasklet_struct *t) 490static inline void tasklet_disable_nosync(struct tasklet_struct *t)
477{ 491{
diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h
new file mode 100644
index 000000000000..47b39b7c7e84
--- /dev/null
+++ b/include/linux/kmemcheck.h
@@ -0,0 +1,153 @@
1#ifndef LINUX_KMEMCHECK_H
2#define LINUX_KMEMCHECK_H
3
4#include <linux/mm_types.h>
5#include <linux/types.h>
6
7#ifdef CONFIG_KMEMCHECK
8extern int kmemcheck_enabled;
9
10/* The slab-related functions. */
11void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node);
12void kmemcheck_free_shadow(struct page *page, int order);
13void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
14 size_t size);
15void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size);
16
17void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order,
18 gfp_t gfpflags);
19
20void kmemcheck_show_pages(struct page *p, unsigned int n);
21void kmemcheck_hide_pages(struct page *p, unsigned int n);
22
23bool kmemcheck_page_is_tracked(struct page *p);
24
25void kmemcheck_mark_unallocated(void *address, unsigned int n);
26void kmemcheck_mark_uninitialized(void *address, unsigned int n);
27void kmemcheck_mark_initialized(void *address, unsigned int n);
28void kmemcheck_mark_freed(void *address, unsigned int n);
29
30void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n);
31void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n);
32void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n);
33
34int kmemcheck_show_addr(unsigned long address);
35int kmemcheck_hide_addr(unsigned long address);
36
37#else
38#define kmemcheck_enabled 0
39
40static inline void
41kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
42{
43}
44
45static inline void
46kmemcheck_free_shadow(struct page *page, int order)
47{
48}
49
50static inline void
51kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
52 size_t size)
53{
54}
55
56static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object,
57 size_t size)
58{
59}
60
61static inline void kmemcheck_pagealloc_alloc(struct page *p,
62 unsigned int order, gfp_t gfpflags)
63{
64}
65
66static inline bool kmemcheck_page_is_tracked(struct page *p)
67{
68 return false;
69}
70
71static inline void kmemcheck_mark_unallocated(void *address, unsigned int n)
72{
73}
74
75static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n)
76{
77}
78
79static inline void kmemcheck_mark_initialized(void *address, unsigned int n)
80{
81}
82
83static inline void kmemcheck_mark_freed(void *address, unsigned int n)
84{
85}
86
87static inline void kmemcheck_mark_unallocated_pages(struct page *p,
88 unsigned int n)
89{
90}
91
92static inline void kmemcheck_mark_uninitialized_pages(struct page *p,
93 unsigned int n)
94{
95}
96
97static inline void kmemcheck_mark_initialized_pages(struct page *p,
98 unsigned int n)
99{
100}
101
102#endif /* CONFIG_KMEMCHECK */
103
104/*
105 * Bitfield annotations
106 *
107 * How to use: If you have a struct using bitfields, for example
108 *
109 * struct a {
110 * int x:8, y:8;
111 * };
112 *
113 * then this should be rewritten as
114 *
115 * struct a {
116 * kmemcheck_bitfield_begin(flags);
117 * int x:8, y:8;
118 * kmemcheck_bitfield_end(flags);
119 * };
120 *
121 * Now the "flags_begin" and "flags_end" members may be used to refer to the
122 * beginning and end, respectively, of the bitfield (and things like
123 * &x.flags_begin is allowed). As soon as the struct is allocated, the bit-
124 * fields should be annotated:
125 *
126 * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL);
127 * kmemcheck_annotate_bitfield(a, flags);
128 *
129 * Note: We provide the same definitions for both kmemcheck and non-
130 * kmemcheck kernels. This makes it harder to introduce accidental errors. It
131 * is also allowed to pass NULL pointers to kmemcheck_annotate_bitfield().
132 */
133#define kmemcheck_bitfield_begin(name) \
134 int name##_begin[0];
135
136#define kmemcheck_bitfield_end(name) \
137 int name##_end[0];
138
139#define kmemcheck_annotate_bitfield(ptr, name) \
140 do if (ptr) { \
141 int _n = (long) &((ptr)->name##_end) \
142 - (long) &((ptr)->name##_begin); \
143 BUILD_BUG_ON(_n < 0); \
144 \
145 kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \
146 } while (0)
147
148#define kmemcheck_annotate_variable(var) \
149 do { \
150 kmemcheck_mark_initialized(&(var), sizeof(var)); \
151 } while (0) \
152
153#endif /* LINUX_KMEMCHECK_H */
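
(Illustration, not part of the patch: a condensed version of the pattern the header comment above describes, with the begin/end markers and the allocation-time annotation in one place; the struct and field names are invented. The c2port, nodemgr, skbuff and inet_sock hunks below apply exactly this pattern:)

        #include <linux/kmemcheck.h>
        #include <linux/slab.h>

        struct example {
                kmemcheck_bitfield_begin(flags);
                unsigned int ready:1,
                             busy:1;
                kmemcheck_bitfield_end(flags);
                int payload;
        };

        static struct example *example_alloc(void)
        {
                struct example *e = kmalloc(sizeof(*e), GFP_KERNEL);

                /* NULL is fine here; the macro checks the pointer itself. */
                kmemcheck_annotate_bitfield(e, flags);
                return e;
        }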
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0e80e26ecf21..0042090a4d70 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -98,6 +98,14 @@ struct page {
98#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS 98#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
99 unsigned long debug_flags; /* Use atomic bitops on this */ 99 unsigned long debug_flags; /* Use atomic bitops on this */
100#endif 100#endif
101
102#ifdef CONFIG_KMEMCHECK
103 /*
104 * kmemcheck wants to track the status of each byte in a page; this
105 * is a pointer to such a status block. NULL if not tracked.
106 */
107 void *shadow;
108#endif
101}; 109};
102 110
103/* 111/*
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 8670f1575fe1..29f8599e6bea 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -1,6 +1,7 @@
1#ifndef _LINUX_RING_BUFFER_H 1#ifndef _LINUX_RING_BUFFER_H
2#define _LINUX_RING_BUFFER_H 2#define _LINUX_RING_BUFFER_H
3 3
4#include <linux/kmemcheck.h>
4#include <linux/mm.h> 5#include <linux/mm.h>
5#include <linux/seq_file.h> 6#include <linux/seq_file.h>
6 7
@@ -11,7 +12,10 @@ struct ring_buffer_iter;
11 * Don't refer to this struct directly, use functions below. 12 * Don't refer to this struct directly, use functions below.
12 */ 13 */
13struct ring_buffer_event { 14struct ring_buffer_event {
15 kmemcheck_bitfield_begin(bitfield);
14 u32 type_len:5, time_delta:27; 16 u32 type_len:5, time_delta:27;
17 kmemcheck_bitfield_end(bitfield);
18
15 u32 array[]; 19 u32 array[];
16}; 20};
17 21
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fa51293f2708..63ef24bc01d0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -15,6 +15,7 @@
15#define _LINUX_SKBUFF_H 15#define _LINUX_SKBUFF_H
16 16
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/kmemcheck.h>
18#include <linux/compiler.h> 19#include <linux/compiler.h>
19#include <linux/time.h> 20#include <linux/time.h>
20#include <linux/cache.h> 21#include <linux/cache.h>
@@ -343,6 +344,7 @@ struct sk_buff {
343 }; 344 };
344 }; 345 };
345 __u32 priority; 346 __u32 priority;
347 kmemcheck_bitfield_begin(flags1);
346 __u8 local_df:1, 348 __u8 local_df:1,
347 cloned:1, 349 cloned:1,
348 ip_summed:2, 350 ip_summed:2,
@@ -353,6 +355,7 @@ struct sk_buff {
353 ipvs_property:1, 355 ipvs_property:1,
354 peeked:1, 356 peeked:1,
355 nf_trace:1; 357 nf_trace:1;
358 kmemcheck_bitfield_end(flags1);
356 __be16 protocol; 359 __be16 protocol;
357 360
358 void (*destructor)(struct sk_buff *skb); 361 void (*destructor)(struct sk_buff *skb);
@@ -372,12 +375,16 @@ struct sk_buff {
372 __u16 tc_verd; /* traffic control verdict */ 375 __u16 tc_verd; /* traffic control verdict */
373#endif 376#endif
374#endif 377#endif
378
379 kmemcheck_bitfield_begin(flags2);
375#ifdef CONFIG_IPV6_NDISC_NODETYPE 380#ifdef CONFIG_IPV6_NDISC_NODETYPE
376 __u8 ndisc_nodetype:2; 381 __u8 ndisc_nodetype:2;
377#endif 382#endif
378#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) 383#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
379 __u8 do_not_encrypt:1; 384 __u8 do_not_encrypt:1;
380#endif 385#endif
386 kmemcheck_bitfield_end(flags2);
387
381 /* 0/13/14 bit hole */ 388 /* 0/13/14 bit hole */
382 389
383#ifdef CONFIG_NET_DMA 390#ifdef CONFIG_NET_DMA
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 219b8fb4651d..2da8372519f5 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -64,6 +64,13 @@
64 64
65#define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ 65#define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */
66 66
67/* Don't track use of uninitialized memory */
68#ifdef CONFIG_KMEMCHECK
69# define SLAB_NOTRACK 0x01000000UL
70#else
71# define SLAB_NOTRACK 0x00000000UL
72#endif
73
67/* The following flags affect the page allocator grouping pages by mobility */ 74/* The following flags affect the page allocator grouping pages by mobility */
68#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 75#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
69#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ 76#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 713f841ecaa9..850d057500de 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -16,6 +16,87 @@
16#include <linux/compiler.h> 16#include <linux/compiler.h>
17#include <linux/kmemtrace.h> 17#include <linux/kmemtrace.h>
18 18
19/*
20 * struct kmem_cache
21 *
22 * manages a cache.
23 */
24
25struct kmem_cache {
26/* 1) per-cpu data, touched during every alloc/free */
27 struct array_cache *array[NR_CPUS];
28/* 2) Cache tunables. Protected by cache_chain_mutex */
29 unsigned int batchcount;
30 unsigned int limit;
31 unsigned int shared;
32
33 unsigned int buffer_size;
34 u32 reciprocal_buffer_size;
35/* 3) touched by every alloc & free from the backend */
36
37 unsigned int flags; /* constant flags */
38 unsigned int num; /* # of objs per slab */
39
40/* 4) cache_grow/shrink */
41 /* order of pgs per slab (2^n) */
42 unsigned int gfporder;
43
44 /* force GFP flags, e.g. GFP_DMA */
45 gfp_t gfpflags;
46
47 size_t colour; /* cache colouring range */
48 unsigned int colour_off; /* colour offset */
49 struct kmem_cache *slabp_cache;
50 unsigned int slab_size;
51 unsigned int dflags; /* dynamic flags */
52
53 /* constructor func */
54 void (*ctor)(void *obj);
55
56/* 5) cache creation/removal */
57 const char *name;
58 struct list_head next;
59
60/* 6) statistics */
61#ifdef CONFIG_DEBUG_SLAB
62 unsigned long num_active;
63 unsigned long num_allocations;
64 unsigned long high_mark;
65 unsigned long grown;
66 unsigned long reaped;
67 unsigned long errors;
68 unsigned long max_freeable;
69 unsigned long node_allocs;
70 unsigned long node_frees;
71 unsigned long node_overflow;
72 atomic_t allochit;
73 atomic_t allocmiss;
74 atomic_t freehit;
75 atomic_t freemiss;
76
77 /*
78 * If debugging is enabled, then the allocator can add additional
79 * fields and/or padding to every object. buffer_size contains the total
80 * object size including these internal fields, the following two
81 * variables contain the offset to the user object and its size.
82 */
83 int obj_offset;
84 int obj_size;
85#endif /* CONFIG_DEBUG_SLAB */
86
87 /*
88 * We put nodelists[] at the end of kmem_cache, because we want to size
89 * this array to nr_node_ids slots instead of MAX_NUMNODES
90 * (see kmem_cache_init())
91 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
92 * is statically defined, so we reserve the max number of nodes.
93 */
94 struct kmem_list3 *nodelists[MAX_NUMNODES];
95 /*
96 * Do not add fields after nodelists[]
97 */
98};
99
19/* Size description struct for general caches. */ 100/* Size description struct for general caches. */
20struct cache_sizes { 101struct cache_sizes {
21 size_t cs_size; 102 size_t cs_size;
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 1a8cecc4f38c..51efbef38fb0 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -4,6 +4,8 @@
4struct task_struct; 4struct task_struct;
5 5
6#ifdef CONFIG_STACKTRACE 6#ifdef CONFIG_STACKTRACE
7struct task_struct;
8
7struct stack_trace { 9struct stack_trace {
8 unsigned int nr_entries, max_entries; 10 unsigned int nr_entries, max_entries;
9 unsigned long *entries; 11 unsigned long *entries;
@@ -11,6 +13,7 @@ struct stack_trace {
11}; 13};
12 14
13extern void save_stack_trace(struct stack_trace *trace); 15extern void save_stack_trace(struct stack_trace *trace);
16extern void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp);
14extern void save_stack_trace_tsk(struct task_struct *tsk, 17extern void save_stack_trace_tsk(struct task_struct *tsk,
15 struct stack_trace *trace); 18 struct stack_trace *trace);
16 19
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 20a6957af870..47004f35cc7e 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -17,6 +17,7 @@
17#define _INET_SOCK_H 17#define _INET_SOCK_H
18 18
19 19
20#include <linux/kmemcheck.h>
20#include <linux/string.h> 21#include <linux/string.h>
21#include <linux/types.h> 22#include <linux/types.h>
22#include <linux/jhash.h> 23#include <linux/jhash.h>
@@ -66,14 +67,16 @@ struct inet_request_sock {
66 __be32 loc_addr; 67 __be32 loc_addr;
67 __be32 rmt_addr; 68 __be32 rmt_addr;
68 __be16 rmt_port; 69 __be16 rmt_port;
69 u16 snd_wscale : 4, 70 kmemcheck_bitfield_begin(flags);
70 rcv_wscale : 4, 71 u16 snd_wscale : 4,
72 rcv_wscale : 4,
71 tstamp_ok : 1, 73 tstamp_ok : 1,
72 sack_ok : 1, 74 sack_ok : 1,
73 wscale_ok : 1, 75 wscale_ok : 1,
74 ecn_ok : 1, 76 ecn_ok : 1,
75 acked : 1, 77 acked : 1,
76 no_srccheck: 1; 78 no_srccheck: 1;
79 kmemcheck_bitfield_end(flags);
77 struct ip_options *opt; 80 struct ip_options *opt;
78}; 81};
79 82
@@ -199,9 +202,12 @@ static inline int inet_sk_ehashfn(const struct sock *sk)
199static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops) 202static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops)
200{ 203{
201 struct request_sock *req = reqsk_alloc(ops); 204 struct request_sock *req = reqsk_alloc(ops);
205 struct inet_request_sock *ireq = inet_rsk(req);
202 206
203 if (req != NULL) 207 if (req != NULL) {
204 inet_rsk(req)->opt = NULL; 208 kmemcheck_annotate_bitfield(ireq, flags);
209 ireq->opt = NULL;
210 }
205 211
206 return req; 212 return req;
207} 213}
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 4b8ece22b8e9..b63b80fac567 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -16,6 +16,7 @@
16#define _INET_TIMEWAIT_SOCK_ 16#define _INET_TIMEWAIT_SOCK_
17 17
18 18
19#include <linux/kmemcheck.h>
19#include <linux/list.h> 20#include <linux/list.h>
20#include <linux/module.h> 21#include <linux/module.h>
21#include <linux/timer.h> 22#include <linux/timer.h>
@@ -127,10 +128,12 @@ struct inet_timewait_sock {
127 __be32 tw_rcv_saddr; 128 __be32 tw_rcv_saddr;
128 __be16 tw_dport; 129 __be16 tw_dport;
129 __u16 tw_num; 130 __u16 tw_num;
131 kmemcheck_bitfield_begin(flags);
130 /* And these are ours. */ 132 /* And these are ours. */
131 __u8 tw_ipv6only:1, 133 __u8 tw_ipv6only:1,
132 tw_transparent:1; 134 tw_transparent:1;
133 /* 15 bits hole, try to pack */ 135 /* 14 bits hole, try to pack */
136 kmemcheck_bitfield_end(flags);
134 __u16 tw_ipv6_offset; 137 __u16 tw_ipv6_offset;
135 unsigned long tw_ttd; 138 unsigned long tw_ttd;
136 struct inet_bind_bucket *tw_tb; 139 struct inet_bind_bucket *tw_tb;
diff --git a/include/net/sock.h b/include/net/sock.h
index 010e14a93c92..95bd3fd75f94 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -218,9 +218,11 @@ struct sock {
218#define sk_hash __sk_common.skc_hash 218#define sk_hash __sk_common.skc_hash
219#define sk_prot __sk_common.skc_prot 219#define sk_prot __sk_common.skc_prot
220#define sk_net __sk_common.skc_net 220#define sk_net __sk_common.skc_net
221 kmemcheck_bitfield_begin(flags);
221 unsigned char sk_shutdown : 2, 222 unsigned char sk_shutdown : 2,
222 sk_no_check : 2, 223 sk_no_check : 2,
223 sk_userlocks : 4; 224 sk_userlocks : 4;
225 kmemcheck_bitfield_end(flags);
224 unsigned char sk_protocol; 226 unsigned char sk_protocol;
225 unsigned short sk_type; 227 unsigned short sk_type;
226 int sk_rcvbuf; 228 int sk_rcvbuf;
diff --git a/init/do_mounts.c b/init/do_mounts.c
index dd7ee5f203f3..093f65915501 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -231,7 +231,8 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data)
231 231
232void __init mount_block_root(char *name, int flags) 232void __init mount_block_root(char *name, int flags)
233{ 233{
234 char *fs_names = __getname(); 234 char *fs_names = __getname_gfp(GFP_KERNEL
235 | __GFP_NOTRACK_FALSE_POSITIVE);
235 char *p; 236 char *p;
236#ifdef CONFIG_BLOCK 237#ifdef CONFIG_BLOCK
237 char b[BDEVNAME_SIZE]; 238 char b[BDEVNAME_SIZE];
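
The flag name says it all: __GFP_NOTRACK_FALSE_POSITIVE exempts a single allocation that is known to produce spurious reports, while the rest of the kernel stays tracked. A hedged sketch of applying it elsewhere (the helper below is made up for illustration):

#include <linux/gfp.h>
#include <linux/slab.h>

/*
 * Example only: a buffer that is deliberately scanned past the bytes
 * actually written, which kmemcheck would otherwise report.
 */
static char *alloc_scan_buffer(size_t len)
{
	return kmalloc(len, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE);
}
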
diff --git a/init/main.c b/init/main.c
index f6204f712e7c..7becd8b5c5bf 100644
--- a/init/main.c
+++ b/init/main.c
@@ -65,6 +65,7 @@
65#include <linux/idr.h> 65#include <linux/idr.h>
66#include <linux/ftrace.h> 66#include <linux/ftrace.h>
67#include <linux/async.h> 67#include <linux/async.h>
68#include <linux/kmemcheck.h>
68#include <linux/kmemtrace.h> 69#include <linux/kmemtrace.h>
69#include <trace/boot.h> 70#include <trace/boot.h>
70 71
diff --git a/kernel/fork.c b/kernel/fork.c
index 4430eb1376f2..be022c200da6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -178,7 +178,7 @@ void __init fork_init(unsigned long mempages)
178 /* create a slab on which task_structs can be allocated */ 178 /* create a slab on which task_structs can be allocated */
179 task_struct_cachep = 179 task_struct_cachep =
180 kmem_cache_create("task_struct", sizeof(struct task_struct), 180 kmem_cache_create("task_struct", sizeof(struct task_struct),
181 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); 181 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
182#endif 182#endif
183 183
184 /* do the arch specific task caches init */ 184 /* do the arch specific task caches init */
@@ -1470,20 +1470,20 @@ void __init proc_caches_init(void)
1470{ 1470{
1471 sighand_cachep = kmem_cache_create("sighand_cache", 1471 sighand_cachep = kmem_cache_create("sighand_cache",
1472 sizeof(struct sighand_struct), 0, 1472 sizeof(struct sighand_struct), 0,
1473 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, 1473 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
1474 sighand_ctor); 1474 SLAB_NOTRACK, sighand_ctor);
1475 signal_cachep = kmem_cache_create("signal_cache", 1475 signal_cachep = kmem_cache_create("signal_cache",
1476 sizeof(struct signal_struct), 0, 1476 sizeof(struct signal_struct), 0,
1477 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1477 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1478 files_cachep = kmem_cache_create("files_cache", 1478 files_cachep = kmem_cache_create("files_cache",
1479 sizeof(struct files_struct), 0, 1479 sizeof(struct files_struct), 0,
1480 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1480 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1481 fs_cachep = kmem_cache_create("fs_cache", 1481 fs_cachep = kmem_cache_create("fs_cache",
1482 sizeof(struct fs_struct), 0, 1482 sizeof(struct fs_struct), 0,
1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1484 mm_cachep = kmem_cache_create("mm_struct", 1484 mm_cachep = kmem_cache_create("mm_struct",
1485 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1485 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1486 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1486 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1487 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1487 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1488 mmap_init(); 1488 mmap_init();
1489} 1489}
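
fork_init() and proc_caches_init() opt entire caches out of tracking with SLAB_NOTRACK, the coarse-grained counterpart of the per-allocation __GFP_NOTRACK flag, used when objects are too hot to take kmemcheck page faults on every access. Creating such a cache elsewhere follows the same pattern (the cache name and structure below are illustrative only):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>

struct foo_ctx {
	unsigned long state;
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	/* Objects from this cache are never tracked by kmemcheck. */
	foo_cachep = kmem_cache_create("foo_ctx", sizeof(struct foo_ctx),
				       0, SLAB_HWCACHE_ALIGN | SLAB_NOTRACK,
				       NULL);
	return foo_cachep ? 0 : -ENOMEM;
}
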
diff --git a/kernel/signal.c b/kernel/signal.c
index 809a228019ad..d81f4952eebb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -832,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
832{ 832{
833 struct sigpending *pending; 833 struct sigpending *pending;
834 struct sigqueue *q; 834 struct sigqueue *q;
835 int override_rlimit;
835 836
836 trace_sched_signal_send(sig, t); 837 trace_sched_signal_send(sig, t);
837 838
@@ -863,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
863 make sure at least one signal gets delivered and don't 864 make sure at least one signal gets delivered and don't
864 pass on the info struct. */ 865 pass on the info struct. */
865 866
866 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && 867 if (sig < SIGRTMIN)
867 (is_si_special(info) || 868 override_rlimit = (is_si_special(info) || info->si_code >= 0);
868 info->si_code >= 0))); 869 else
870 override_rlimit = 0;
871
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit);
869 if (q) { 874 if (q) {
870 list_add_tail(&q->list, &pending->list); 875 list_add_tail(&q->list, &pending->list);
871 switch ((unsigned long) info) { 876 switch ((unsigned long) info) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 258885a543db..b41fb710e114 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -382,6 +382,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
382 382
383EXPORT_SYMBOL(__tasklet_hi_schedule); 383EXPORT_SYMBOL(__tasklet_hi_schedule);
384 384
385void __tasklet_hi_schedule_first(struct tasklet_struct *t)
386{
387 BUG_ON(!irqs_disabled());
388
389 t->next = __get_cpu_var(tasklet_hi_vec).head;
390 __get_cpu_var(tasklet_hi_vec).head = t;
391 __raise_softirq_irqoff(HI_SOFTIRQ);
392}
393
394EXPORT_SYMBOL(__tasklet_hi_schedule_first);
395
385static void tasklet_action(struct softirq_action *a) 396static void tasklet_action(struct softirq_action *a)
386{ 397{
387 struct tasklet_struct *list; 398 struct tasklet_struct *list;
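
__tasklet_hi_schedule_first() is added (and exported) so that code already running with interrupts disabled — kmemcheck's deferred error reporting is the user introduced by this series — can push a tasklet onto the head of the HI_SOFTIRQ list without the irq-save dance of the normal schedule helpers. A hedged usage sketch; the tasklet and wrapper below are illustrative, and the TASKLET_STATE_SCHED test-and-set mirrors what the non-underscore wrappers normally do for the caller:

#include <linux/bitops.h>
#include <linux/interrupt.h>

static void report_work(unsigned long data)
{
	/* Runs later, in HI_SOFTIRQ context. */
}

static DECLARE_TASKLET(report_tasklet, report_work, 0);

/* Caller must have interrupts disabled; the BUG_ON above enforces it. */
static void queue_report(void)
{
	if (!test_and_set_bit(TASKLET_STATE_SCHED, &report_tasklet.state))
		__tasklet_hi_schedule_first(&report_tasklet);
}
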
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0e51a35a4486..f5c76b6cd616 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/fs.h> 32#include <linux/fs.h>
32#include <linux/init.h> 33#include <linux/init.h>
@@ -967,6 +968,17 @@ static struct ctl_table kern_table[] = {
967 .proc_handler = &proc_dointvec, 968 .proc_handler = &proc_dointvec,
968 }, 969 },
969#endif 970#endif
971#ifdef CONFIG_KMEMCHECK
972 {
973 .ctl_name = CTL_UNNUMBERED,
974 .procname = "kmemcheck",
975 .data = &kmemcheck_enabled,
976 .maxlen = sizeof(int),
977 .mode = 0644,
978 .proc_handler = &proc_dointvec,
979 },
980#endif
981
970/* 982/*
971 * NOTE: do not add new entries to this table unless you have read 983 * NOTE: do not add new entries to this table unless you have read
972 * Documentation/sysctl/ctl_unnumbered.txt 984 * Documentation/sysctl/ctl_unnumbered.txt
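
The new entry makes kmemcheck_enabled writable at run time as /proc/sys/kernel/kmemcheck (an ordinary proc_dointvec integer sysctl), complementing the kmemcheck= boot parameter. From user space it is read and written like any other integer sysctl; a minimal sketch:

#include <stdio.h>

/* 0 disables tracking, 1 enables it. */
static int set_kmemcheck(int enabled)
{
	FILE *f = fopen("/proc/sys/kernel/kmemcheck", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", enabled);
	return fclose(f);
}
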
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2e642b2b7253..dc4dc70171ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
10#include <linux/debugfs.h> 10#include <linux/debugfs.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <linux/kmemcheck.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/percpu.h> 15#include <linux/percpu.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
@@ -1270,6 +1271,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1270 if (tail < BUF_PAGE_SIZE) { 1271 if (tail < BUF_PAGE_SIZE) {
1271 /* Mark the rest of the page with padding */ 1272 /* Mark the rest of the page with padding */
1272 event = __rb_page_index(tail_page, tail); 1273 event = __rb_page_index(tail_page, tail);
1274 kmemcheck_annotate_bitfield(event, bitfield);
1273 rb_event_set_padding(event); 1275 rb_event_set_padding(event);
1274 } 1276 }
1275 1277
@@ -1327,6 +1329,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1327 return NULL; 1329 return NULL;
1328 1330
1329 event = __rb_page_index(tail_page, tail); 1331 event = __rb_page_index(tail_page, tail);
1332 kmemcheck_annotate_bitfield(event, bitfield);
1330 rb_update_event(event, type, length); 1333 rb_update_event(event, type, length);
1331 1334
1332 /* The passed in type is zero for DATA */ 1335 /* The passed in type is zero for DATA */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 116a35051be6..6b0c2d8a2129 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -300,7 +300,7 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
300 300
301config DEBUG_SLAB 301config DEBUG_SLAB
302 bool "Debug slab memory allocations" 302 bool "Debug slab memory allocations"
303 depends on DEBUG_KERNEL && SLAB 303 depends on DEBUG_KERNEL && SLAB && !KMEMCHECK
304 help 304 help
305 Say Y here to have the kernel do limited verification on memory 305 Say Y here to have the kernel do limited verification on memory
306 allocation as well as poisoning memory on free to catch use of freed 306 allocation as well as poisoning memory on free to catch use of freed
@@ -312,7 +312,7 @@ config DEBUG_SLAB_LEAK
312 312
313config SLUB_DEBUG_ON 313config SLUB_DEBUG_ON
314 bool "SLUB debugging on by default" 314 bool "SLUB debugging on by default"
315 depends on SLUB && SLUB_DEBUG 315 depends on SLUB && SLUB_DEBUG && !KMEMCHECK
316 default n 316 default n
317 help 317 help
318 Boot with debugging on by default. SLUB boots by default with 318 Boot with debugging on by default. SLUB boots by default with
@@ -996,3 +996,5 @@ config DMA_API_DEBUG
996source "samples/Kconfig" 996source "samples/Kconfig"
997 997
998source "lib/Kconfig.kgdb" 998source "lib/Kconfig.kgdb"
999
1000source "lib/Kconfig.kmemcheck"
diff --git a/lib/Kconfig.kmemcheck b/lib/Kconfig.kmemcheck
new file mode 100644
index 000000000000..603c81b66549
--- /dev/null
+++ b/lib/Kconfig.kmemcheck
@@ -0,0 +1,91 @@
1config HAVE_ARCH_KMEMCHECK
2 bool
3
4menuconfig KMEMCHECK
5 bool "kmemcheck: trap use of uninitialized memory"
6 depends on DEBUG_KERNEL
7 depends on !X86_USE_3DNOW
8 depends on SLUB || SLAB
9 depends on !CC_OPTIMIZE_FOR_SIZE
10 depends on !FUNCTION_TRACER
11 select FRAME_POINTER
12 select STACKTRACE
13 default n
14 help
15 This option enables tracing of dynamically allocated kernel memory
16 to see if memory is used before it has been given an initial value.
17 Be aware that this requires half of your memory for bookkeeping and
18	  will insert extra code at *every* read and write to tracked memory,
19	  thus slowing down the kernel code (but user code is unaffected).
20
21 The kernel may be started with kmemcheck=0 or kmemcheck=1 to disable
22 or enable kmemcheck at boot-time. If the kernel is started with
23 kmemcheck=0, the large memory and CPU overhead is not incurred.
24
25choice
26 prompt "kmemcheck: default mode at boot"
27 depends on KMEMCHECK
28 default KMEMCHECK_ONESHOT_BY_DEFAULT
29 help
30 This option controls the default behaviour of kmemcheck when the
31 kernel boots and no kmemcheck= parameter is given.
32
33config KMEMCHECK_DISABLED_BY_DEFAULT
34 bool "disabled"
35 depends on KMEMCHECK
36
37config KMEMCHECK_ENABLED_BY_DEFAULT
38 bool "enabled"
39 depends on KMEMCHECK
40
41config KMEMCHECK_ONESHOT_BY_DEFAULT
42 bool "one-shot"
43 depends on KMEMCHECK
44 help
45	  In one-shot mode, kmemcheck reports only the first error it
46	  detects and then disables itself.
47
48endchoice
49
50config KMEMCHECK_QUEUE_SIZE
51 int "kmemcheck: error queue size"
52 depends on KMEMCHECK
53 default 64
54 help
55 Select the maximum number of errors to store in the queue. Since
56 errors can occur virtually anywhere and in any context, we need a
57	  temporary storage area which is guaranteed not to generate any
58 other faults. The queue will be emptied as soon as a tasklet may
59 be scheduled. If the queue is full, new error reports will be
60 lost.
61
62config KMEMCHECK_SHADOW_COPY_SHIFT
63 int "kmemcheck: shadow copy size (5 => 32 bytes, 6 => 64 bytes)"
64 depends on KMEMCHECK
65 range 2 8
66 default 5
67 help
68 Select the number of shadow bytes to save along with each entry of
69 the queue. These bytes indicate what parts of an allocation are
70 initialized, uninitialized, etc. and will be displayed when an
71 error is detected to help the debugging of a particular problem.
72
73config KMEMCHECK_PARTIAL_OK
74 bool "kmemcheck: allow partially uninitialized memory"
75 depends on KMEMCHECK
76 default y
77 help
78 This option works around certain GCC optimizations that produce
79 32-bit reads from 16-bit variables where the upper 16 bits are
80 thrown away afterwards. This may of course also hide some real
81 bugs.
82
83config KMEMCHECK_BITOPS_OK
84 bool "kmemcheck: allow bit-field manipulation"
85 depends on KMEMCHECK
86 default n
87 help
88 This option silences warnings that would be generated for bit-field
89 accesses where not all the bits are initialized at the same time.
90 This may also hide some real bugs.
91
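
The three *_BY_DEFAULT options only pick the initial value of kmemcheck_enabled when no kmemcheck= parameter is given; the boot parameter always wins. The arch code that consumes this choice is not shown in this excerpt, but a plausible sketch of how it fits together looks roughly like this (the real parsing lives in arch/x86/mm/kmemcheck/kmemcheck.c and may differ in detail; the value 2 for one-shot mode is an assumption here):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>

int kmemcheck_enabled =
#if defined(CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT)
	0;
#elif defined(CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT)
	1;
#else	/* CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT */
	2;	/* report the first error, then disable */
#endif

static int __init param_kmemcheck(char *str)
{
	if (!str)
		return -EINVAL;

	sscanf(str, "%d", &kmemcheck_enabled);
	return 0;
}
early_param("kmemcheck", param_kmemcheck);
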
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index bb01e298f260..aa99fd1f7109 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations" 2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC 3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
4 depends on !HIBERNATION || !PPC && !SPARC 4 depends on !HIBERNATION || !PPC && !SPARC
5 depends on !KMEMCHECK
5 ---help--- 6 ---help---
6 Unmap pages from the kernel linear mapping after free_pages(). 7 Unmap pages from the kernel linear mapping after free_pages().
7 This results in a large slowdown, but helps to find certain types 8 This results in a large slowdown, but helps to find certain types
diff --git a/mm/Makefile b/mm/Makefile
index e89acb090b4d..c379ce08354a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
27obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 27obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
28obj-$(CONFIG_SLAB) += slab.o 28obj-$(CONFIG_SLAB) += slab.o
29obj-$(CONFIG_SLUB) += slub.o 29obj-$(CONFIG_SLUB) += slub.o
30obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
30obj-$(CONFIG_FAILSLAB) += failslab.o 31obj-$(CONFIG_FAILSLAB) += failslab.o
31obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 32obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
32obj-$(CONFIG_FS_XIP) += filemap_xip.o 33obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
new file mode 100644
index 000000000000..fd814fd61319
--- /dev/null
+++ b/mm/kmemcheck.c
@@ -0,0 +1,122 @@
1#include <linux/gfp.h>
2#include <linux/mm_types.h>
3#include <linux/mm.h>
4#include <linux/slab.h>
5#include <linux/kmemcheck.h>
6
7void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
8{
9 struct page *shadow;
10 int pages;
11 int i;
12
13 pages = 1 << order;
14
15 /*
16 * With kmemcheck enabled, we need to allocate a memory area for the
17 * shadow bits as well.
18 */
19 shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
20 if (!shadow) {
21 if (printk_ratelimit())
22 printk(KERN_ERR "kmemcheck: failed to allocate "
23 "shadow bitmap\n");
24 return;
25 }
26
27 for(i = 0; i < pages; ++i)
28 page[i].shadow = page_address(&shadow[i]);
29
30 /*
31 * Mark it as non-present for the MMU so that our accesses to
32 * this memory will trigger a page fault and let us analyze
33 * the memory accesses.
34 */
35 kmemcheck_hide_pages(page, pages);
36}
37
38void kmemcheck_free_shadow(struct page *page, int order)
39{
40 struct page *shadow;
41 int pages;
42 int i;
43
44 if (!kmemcheck_page_is_tracked(page))
45 return;
46
47 pages = 1 << order;
48
49 kmemcheck_show_pages(page, pages);
50
51 shadow = virt_to_page(page[0].shadow);
52
53 for(i = 0; i < pages; ++i)
54 page[i].shadow = NULL;
55
56 __free_pages(shadow, order);
57}
58
59void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
60 size_t size)
61{
62 /*
63 * Has already been memset(), which initializes the shadow for us
64 * as well.
65 */
66 if (gfpflags & __GFP_ZERO)
67 return;
68
69 /* No need to initialize the shadow of a non-tracked slab. */
70 if (s->flags & SLAB_NOTRACK)
71 return;
72
73 if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
74 /*
75 * Allow notracked objects to be allocated from
76 * tracked caches. Note however that these objects
77 * will still get page faults on access, they just
78 * won't ever be flagged as uninitialized. If page
79 * faults are not acceptable, the slab cache itself
80 * should be marked NOTRACK.
81 */
82 kmemcheck_mark_initialized(object, size);
83 } else if (!s->ctor) {
84 /*
85 * New objects should be marked uninitialized before
86	 * they're returned to the caller.
87 */
88 kmemcheck_mark_uninitialized(object, size);
89 }
90}
91
92void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
93{
94 /* TODO: RCU freeing is unsupported for now; hide false positives. */
95 if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU))
96 kmemcheck_mark_freed(object, size);
97}
98
99void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
100 gfp_t gfpflags)
101{
102 int pages;
103
104 if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
105 return;
106
107 pages = 1 << order;
108
109 /*
110 * NOTE: We choose to track GFP_ZERO pages too; in fact, they
111 * can become uninitialized by copying uninitialized memory
112 * into them.
113 */
114
115 /* XXX: Can use zone->node for node? */
116 kmemcheck_alloc_shadow(page, order, gfpflags, -1);
117
118 if (gfpflags & __GFP_ZERO)
119 kmemcheck_mark_initialized_pages(page, pages);
120 else
121 kmemcheck_mark_uninitialized_pages(page, pages);
122}
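
kmemcheck_slab_alloc() above also documents the per-allocation escape hatch: an object requested with __GFP_NOTRACK from an otherwise tracked cache still lives in hidden pages (so accesses still fault), but it is marked initialized and therefore never reported. A caller-side sketch (the cache pointer is whatever tracked cache the caller happens to allocate from):

#include <linux/gfp.h>
#include <linux/slab.h>

/*
 * One untracked object from a tracked cache. If the extra page faults
 * are a problem, mark the whole cache SLAB_NOTRACK instead.
 */
static void *alloc_untracked(struct kmem_cache *cachep)
{
	return kmem_cache_alloc(cachep, GFP_KERNEL | __GFP_NOTRACK);
}
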
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17d5f539a9aa..0727896a88ac 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -23,6 +23,7 @@
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/compiler.h> 24#include <linux/compiler.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/kmemcheck.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/suspend.h> 28#include <linux/suspend.h>
28#include <linux/pagevec.h> 29#include <linux/pagevec.h>
@@ -546,6 +547,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
546 int i; 547 int i;
547 int bad = 0; 548 int bad = 0;
548 549
550 kmemcheck_free_shadow(page, order);
551
549 for (i = 0 ; i < (1 << order) ; ++i) 552 for (i = 0 ; i < (1 << order) ; ++i)
550 bad += free_pages_check(page + i); 553 bad += free_pages_check(page + i);
551 if (bad) 554 if (bad)
@@ -994,6 +997,8 @@ static void free_hot_cold_page(struct page *page, int cold)
994 struct per_cpu_pages *pcp; 997 struct per_cpu_pages *pcp;
995 unsigned long flags; 998 unsigned long flags;
996 999
1000 kmemcheck_free_shadow(page, 0);
1001
997 if (PageAnon(page)) 1002 if (PageAnon(page))
998 page->mapping = NULL; 1003 page->mapping = NULL;
999 if (free_pages_check(page)) 1004 if (free_pages_check(page))
@@ -1047,6 +1052,16 @@ void split_page(struct page *page, unsigned int order)
1047 1052
1048 VM_BUG_ON(PageCompound(page)); 1053 VM_BUG_ON(PageCompound(page));
1049 VM_BUG_ON(!page_count(page)); 1054 VM_BUG_ON(!page_count(page));
1055
1056#ifdef CONFIG_KMEMCHECK
1057 /*
1058 * Split shadow pages too, because free(page[0]) would
1059 * otherwise free the whole shadow.
1060 */
1061 if (kmemcheck_page_is_tracked(page))
1062 split_page(virt_to_page(page[0].shadow), order);
1063#endif
1064
1050 for (i = 1; i < (1 << order); i++) 1065 for (i = 1; i < (1 << order); i++)
1051 set_page_refcounted(page + i); 1066 set_page_refcounted(page + i);
1052} 1067}
@@ -1667,7 +1682,10 @@ nopage:
1667 dump_stack(); 1682 dump_stack();
1668 show_mem(); 1683 show_mem();
1669 } 1684 }
1685 return page;
1670got_pg: 1686got_pg:
1687 if (kmemcheck_enabled)
1688 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1671 return page; 1689 return page;
1672} 1690}
1673EXPORT_SYMBOL(__alloc_pages_internal); 1691EXPORT_SYMBOL(__alloc_pages_internal);
diff --git a/mm/slab.c b/mm/slab.c
index 18e3164de09a..af3376d0a833 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -114,6 +114,7 @@
114#include <linux/rtmutex.h> 114#include <linux/rtmutex.h>
115#include <linux/reciprocal_div.h> 115#include <linux/reciprocal_div.h>
116#include <linux/debugobjects.h> 116#include <linux/debugobjects.h>
117#include <linux/kmemcheck.h>
117 118
118#include <asm/cacheflush.h> 119#include <asm/cacheflush.h>
119#include <asm/tlbflush.h> 120#include <asm/tlbflush.h>
@@ -179,13 +180,13 @@
179 SLAB_STORE_USER | \ 180 SLAB_STORE_USER | \
180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 181 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
181 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 182 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
182 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) 183 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
183#else 184#else
184# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ 185# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
185 SLAB_CACHE_DMA | \ 186 SLAB_CACHE_DMA | \
186 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 187 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
187 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 188 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
188 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) 189 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
189#endif 190#endif
190 191
191/* 192/*
@@ -380,87 +381,6 @@ static void kmem_list3_init(struct kmem_list3 *parent)
380 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 381 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
381 } while (0) 382 } while (0)
382 383
383/*
384 * struct kmem_cache
385 *
386 * manages a cache.
387 */
388
389struct kmem_cache {
390/* 1) per-cpu data, touched during every alloc/free */
391 struct array_cache *array[NR_CPUS];
392/* 2) Cache tunables. Protected by cache_chain_mutex */
393 unsigned int batchcount;
394 unsigned int limit;
395 unsigned int shared;
396
397 unsigned int buffer_size;
398 u32 reciprocal_buffer_size;
399/* 3) touched by every alloc & free from the backend */
400
401 unsigned int flags; /* constant flags */
402 unsigned int num; /* # of objs per slab */
403
404/* 4) cache_grow/shrink */
405 /* order of pgs per slab (2^n) */
406 unsigned int gfporder;
407
408 /* force GFP flags, e.g. GFP_DMA */
409 gfp_t gfpflags;
410
411 size_t colour; /* cache colouring range */
412 unsigned int colour_off; /* colour offset */
413 struct kmem_cache *slabp_cache;
414 unsigned int slab_size;
415 unsigned int dflags; /* dynamic flags */
416
417 /* constructor func */
418 void (*ctor)(void *obj);
419
420/* 5) cache creation/removal */
421 const char *name;
422 struct list_head next;
423
424/* 6) statistics */
425#if STATS
426 unsigned long num_active;
427 unsigned long num_allocations;
428 unsigned long high_mark;
429 unsigned long grown;
430 unsigned long reaped;
431 unsigned long errors;
432 unsigned long max_freeable;
433 unsigned long node_allocs;
434 unsigned long node_frees;
435 unsigned long node_overflow;
436 atomic_t allochit;
437 atomic_t allocmiss;
438 atomic_t freehit;
439 atomic_t freemiss;
440#endif
441#if DEBUG
442 /*
443 * If debugging is enabled, then the allocator can add additional
444 * fields and/or padding to every object. buffer_size contains the total
445 * object size including these internal fields, the following two
446 * variables contain the offset to the user object and its size.
447 */
448 int obj_offset;
449 int obj_size;
450#endif
451 /*
452 * We put nodelists[] at the end of kmem_cache, because we want to size
453 * this array to nr_node_ids slots instead of MAX_NUMNODES
454 * (see kmem_cache_init())
455 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
456 * is statically defined, so we reserve the max number of nodes.
457 */
458 struct kmem_list3 *nodelists[MAX_NUMNODES];
459 /*
460 * Do not add fields after nodelists[]
461 */
462};
463
464#define CFLGS_OFF_SLAB (0x80000000UL) 384#define CFLGS_OFF_SLAB (0x80000000UL)
465#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 385#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
466 386
@@ -1707,7 +1627,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1707 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1627 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1708 flags |= __GFP_RECLAIMABLE; 1628 flags |= __GFP_RECLAIMABLE;
1709 1629
1710 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1630 page = alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1711 if (!page) 1631 if (!page)
1712 return NULL; 1632 return NULL;
1713 1633
@@ -1720,6 +1640,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1720 NR_SLAB_UNRECLAIMABLE, nr_pages); 1640 NR_SLAB_UNRECLAIMABLE, nr_pages);
1721 for (i = 0; i < nr_pages; i++) 1641 for (i = 0; i < nr_pages; i++)
1722 __SetPageSlab(page + i); 1642 __SetPageSlab(page + i);
1643
1644 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1645 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1646
1647 if (cachep->ctor)
1648 kmemcheck_mark_uninitialized_pages(page, nr_pages);
1649 else
1650 kmemcheck_mark_unallocated_pages(page, nr_pages);
1651 }
1652
1723 return page_address(page); 1653 return page_address(page);
1724} 1654}
1725 1655
@@ -1732,6 +1662,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1732 struct page *page = virt_to_page(addr); 1662 struct page *page = virt_to_page(addr);
1733 const unsigned long nr_freed = i; 1663 const unsigned long nr_freed = i;
1734 1664
1665 kmemcheck_free_shadow(page, cachep->gfporder);
1666
1735 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1667 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1736 sub_zone_page_state(page_zone(page), 1668 sub_zone_page_state(page_zone(page),
1737 NR_SLAB_RECLAIMABLE, nr_freed); 1669 NR_SLAB_RECLAIMABLE, nr_freed);
@@ -3407,6 +3339,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3407 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, 3339 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3408 flags); 3340 flags);
3409 3341
3342 if (likely(ptr))
3343 kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
3344
3410 if (unlikely((flags & __GFP_ZERO) && ptr)) 3345 if (unlikely((flags & __GFP_ZERO) && ptr))
3411 memset(ptr, 0, obj_size(cachep)); 3346 memset(ptr, 0, obj_size(cachep));
3412 3347
@@ -3467,6 +3402,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3467 flags); 3402 flags);
3468 prefetchw(objp); 3403 prefetchw(objp);
3469 3404
3405 if (likely(objp))
3406 kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
3407
3470 if (unlikely((flags & __GFP_ZERO) && objp)) 3408 if (unlikely((flags & __GFP_ZERO) && objp))
3471 memset(objp, 0, obj_size(cachep)); 3409 memset(objp, 0, obj_size(cachep));
3472 3410
@@ -3583,6 +3521,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3583 kmemleak_free_recursive(objp, cachep->flags); 3521 kmemleak_free_recursive(objp, cachep->flags);
3584 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3522 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3585 3523
3524 kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3525
3586 /* 3526 /*
3587 * Skip calling cache_free_alien() when the platform is not numa. 3527 * Skip calling cache_free_alien() when the platform is not numa.
3588 * This will avoid cache misses that happen while accessing slabp (which 3528 * This will avoid cache misses that happen while accessing slabp (which
diff --git a/mm/slub.c b/mm/slub.c
index 30354bfeb43d..15960a09abb1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -18,6 +18,7 @@
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/kmemtrace.h> 20#include <linux/kmemtrace.h>
21#include <linux/kmemcheck.h>
21#include <linux/cpu.h> 22#include <linux/cpu.h>
22#include <linux/cpuset.h> 23#include <linux/cpuset.h>
23#include <linux/kmemleak.h> 24#include <linux/kmemleak.h>
@@ -147,7 +148,7 @@
147 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) 148 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
148 149
149#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 150#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
150 SLAB_CACHE_DMA) 151 SLAB_CACHE_DMA | SLAB_NOTRACK)
151 152
152#ifndef ARCH_KMALLOC_MINALIGN 153#ifndef ARCH_KMALLOC_MINALIGN
153#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 154#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
@@ -1071,6 +1072,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
1071{ 1072{
1072 int order = oo_order(oo); 1073 int order = oo_order(oo);
1073 1074
1075 flags |= __GFP_NOTRACK;
1076
1074 if (node == -1) 1077 if (node == -1)
1075 return alloc_pages(flags, order); 1078 return alloc_pages(flags, order);
1076 else 1079 else
@@ -1098,6 +1101,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1098 1101
1099 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); 1102 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
1100 } 1103 }
1104
1105 if (kmemcheck_enabled
1106 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS)))
1107 {
1108 int pages = 1 << oo_order(oo);
1109
1110 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1111
1112 /*
1113 * Objects from caches that have a constructor don't get
1114 * cleared when they're allocated, so we need to do it here.
1115 */
1116 if (s->ctor)
1117 kmemcheck_mark_uninitialized_pages(page, pages);
1118 else
1119 kmemcheck_mark_unallocated_pages(page, pages);
1120 }
1121
1101 page->objects = oo_objects(oo); 1122 page->objects = oo_objects(oo);
1102 mod_zone_page_state(page_zone(page), 1123 mod_zone_page_state(page_zone(page),
1103 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1124 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -1171,6 +1192,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1171 __ClearPageSlubDebug(page); 1192 __ClearPageSlubDebug(page);
1172 } 1193 }
1173 1194
1195 kmemcheck_free_shadow(page, compound_order(page));
1196
1174 mod_zone_page_state(page_zone(page), 1197 mod_zone_page_state(page_zone(page),
1175 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1198 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1176 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1199 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
@@ -1626,7 +1649,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1626 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1649 if (unlikely((gfpflags & __GFP_ZERO) && object))
1627 memset(object, 0, objsize); 1650 memset(object, 0, objsize);
1628 1651
1652 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
1629 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); 1653 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
1654
1630 return object; 1655 return object;
1631} 1656}
1632 1657
@@ -1759,6 +1784,7 @@ static __always_inline void slab_free(struct kmem_cache *s,
1759 kmemleak_free_recursive(x, s->flags); 1784 kmemleak_free_recursive(x, s->flags);
1760 local_irq_save(flags); 1785 local_irq_save(flags);
1761 c = get_cpu_slab(s, smp_processor_id()); 1786 c = get_cpu_slab(s, smp_processor_id());
1787 kmemcheck_slab_free(s, object, c->objsize);
1762 debug_check_no_locks_freed(object, c->objsize); 1788 debug_check_no_locks_freed(object, c->objsize);
1763 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1789 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1764 debug_check_no_obj_freed(object, c->objsize); 1790 debug_check_no_obj_freed(object, c->objsize);
@@ -2633,7 +2659,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2633 2659
2634 if (!s || !text || !kmem_cache_open(s, flags, text, 2660 if (!s || !text || !kmem_cache_open(s, flags, text,
2635 realsize, ARCH_KMALLOC_MINALIGN, 2661 realsize, ARCH_KMALLOC_MINALIGN,
2636 SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { 2662 SLAB_CACHE_DMA|SLAB_NOTRACK|__SYSFS_ADD_DEFERRED,
2663 NULL)) {
2637 kfree(s); 2664 kfree(s);
2638 kfree(text); 2665 kfree(text);
2639 goto unlock_out; 2666 goto unlock_out;
@@ -2727,9 +2754,10 @@ EXPORT_SYMBOL(__kmalloc);
2727 2754
2728static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2755static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2729{ 2756{
2730 struct page *page = alloc_pages_node(node, flags | __GFP_COMP, 2757 struct page *page;
2731 get_order(size));
2732 2758
2759 flags |= __GFP_COMP | __GFP_NOTRACK;
2760 page = alloc_pages_node(node, flags, get_order(size));
2733 if (page) 2761 if (page)
2734 return page_address(page); 2762 return page_address(page);
2735 else 2763 else
@@ -4412,6 +4440,8 @@ static char *create_unique_id(struct kmem_cache *s)
4412 *p++ = 'a'; 4440 *p++ = 'a';
4413 if (s->flags & SLAB_DEBUG_FREE) 4441 if (s->flags & SLAB_DEBUG_FREE)
4414 *p++ = 'F'; 4442 *p++ = 'F';
4443 if (!(s->flags & SLAB_NOTRACK))
4444 *p++ = 't';
4415 if (p != name + 1) 4445 if (p != name + 1)
4416 *p++ = '-'; 4446 *p++ = '-';
4417 p += sprintf(p, "%07d", s->size); 4447 p += sprintf(p, "%07d", s->size);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1a94a3037370..5c93435b0347 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -39,6 +39,7 @@
39#include <linux/module.h> 39#include <linux/module.h>
40#include <linux/types.h> 40#include <linux/types.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/kmemcheck.h>
42#include <linux/mm.h> 43#include <linux/mm.h>
43#include <linux/interrupt.h> 44#include <linux/interrupt.h>
44#include <linux/in.h> 45#include <linux/in.h>
@@ -201,6 +202,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
201 skb->data = data; 202 skb->data = data;
202 skb_reset_tail_pointer(skb); 203 skb_reset_tail_pointer(skb);
203 skb->end = skb->tail + size; 204 skb->end = skb->tail + size;
205 kmemcheck_annotate_bitfield(skb, flags1);
206 kmemcheck_annotate_bitfield(skb, flags2);
204 /* make sure we initialize shinfo sequentially */ 207 /* make sure we initialize shinfo sequentially */
205 shinfo = skb_shinfo(skb); 208 shinfo = skb_shinfo(skb);
206 atomic_set(&shinfo->dataref, 1); 209 atomic_set(&shinfo->dataref, 1);
@@ -217,6 +220,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
217 struct sk_buff *child = skb + 1; 220 struct sk_buff *child = skb + 1;
218 atomic_t *fclone_ref = (atomic_t *) (child + 1); 221 atomic_t *fclone_ref = (atomic_t *) (child + 1);
219 222
223 kmemcheck_annotate_bitfield(child, flags1);
224 kmemcheck_annotate_bitfield(child, flags2);
220 skb->fclone = SKB_FCLONE_ORIG; 225 skb->fclone = SKB_FCLONE_ORIG;
221 atomic_set(fclone_ref, 1); 226 atomic_set(fclone_ref, 1);
222 227
@@ -635,6 +640,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
635 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); 640 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
636 if (!n) 641 if (!n)
637 return NULL; 642 return NULL;
643
644 kmemcheck_annotate_bitfield(n, flags1);
645 kmemcheck_annotate_bitfield(n, flags2);
638 n->fclone = SKB_FCLONE_UNAVAILABLE; 646 n->fclone = SKB_FCLONE_UNAVAILABLE;
639 } 647 }
640 648
diff --git a/net/core/sock.c b/net/core/sock.c
index 06e26b77ad9e..b0ba569bc973 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -945,6 +945,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
945 sk = kmalloc(prot->obj_size, priority); 945 sk = kmalloc(prot->obj_size, priority);
946 946
947 if (sk != NULL) { 947 if (sk != NULL) {
948 kmemcheck_annotate_bitfield(sk, flags);
949
948 if (security_sk_alloc(sk, family, priority)) 950 if (security_sk_alloc(sk, family, priority))
949 goto out_free; 951 goto out_free;
950 952
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 68a8d892c711..61283f928825 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/kmemcheck.h>
12#include <net/inet_hashtables.h> 13#include <net/inet_hashtables.h>
13#include <net/inet_timewait_sock.h> 14#include <net/inet_timewait_sock.h>
14#include <net/ip.h> 15#include <net/ip.h>
@@ -120,6 +121,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
120 if (tw != NULL) { 121 if (tw != NULL) {
121 const struct inet_sock *inet = inet_sk(sk); 122 const struct inet_sock *inet = inet_sk(sk);
122 123
124 kmemcheck_annotate_bitfield(tw, flags);
125
123 /* Give us an identity. */ 126 /* Give us an identity. */
124 tw->tw_daddr = inet->daddr; 127 tw->tw_daddr = inet->daddr;
125 tw->tw_rcv_saddr = inet->rcv_saddr; 128 tw->tw_rcv_saddr = inet->rcv_saddr;