diff options
author | Arjan van de Ven <arjan@linux.intel.com> | 2008-04-17 11:40:45 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-04-17 11:40:45 -0400 |
commit | 926e5392ba8a388ae32ca0d2714cc2c73945c609 (patch) | |
tree | 2718b50b8b66a3614f47d3246b080ee8511b299e | |
parent | 2596e0fae094be9354b29ddb17e6326a18012e8c (diff) |
x86: add code to dump the (kernel) page tables for visual inspection by kernel developers
This patch adds code to the kernel to have an (optional)
kernel_page_tables debugfs file that dumps the kernel
page tables; this allows us kernel developers to verify that nothing fishy is
going on and that the various mappings are set up correctly. This was quite
useful in finding various change_page_attr() bugs, and is very likely to be
useful in the future as well.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: mingo@elte.hu
Cc: tglx@tglx.de
Cc: hpa@zytor.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-rw-r--r-- | arch/x86/Kconfig.debug | 12 | ||||
-rw-r--r-- | arch/x86/mm/Makefile | 1 | ||||
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 301 |
3 files changed, 314 insertions, 0 deletions
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 702eb39901ca..cb7002eca887 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug | |||
@@ -54,6 +54,18 @@ config DEBUG_PER_CPU_MAPS | |||
54 | 54 | ||
55 | Say N if unsure. | 55 | Say N if unsure. |
56 | 56 | ||
57 | config X86_PTDUMP | ||
58 | bool "Export kernel pagetable layout to userspace via debugfs" | ||
59 | depends on X86_64 | ||
60 | select DEBUG_FS | ||
61 | help | ||
62 | Say Y here if you want to show the kernel pagetable layout in a | ||
63 | debugfs file. This information is only useful for kernel developers | ||
64 | who are working in architecture specific areas of the kernel. | ||
65 | It is probably not a good idea to enable this feature in a production | ||
66 | kernel. | ||
67 | If in doubt, say "N" | ||
68 | |||
57 | config DEBUG_RODATA | 69 | config DEBUG_RODATA |
58 | bool "Write protect kernel read-only data structures" | 70 | bool "Write protect kernel read-only data structures" |
59 | default y | 71 | default y |
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 8e81660604bc..28632f42ca66 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -12,4 +12,5 @@ else | |||
12 | obj-$(CONFIG_NUMA) += numa_64.o | 12 | obj-$(CONFIG_NUMA) += numa_64.o |
13 | obj-$(CONFIG_K8_NUMA) += k8topology_64.o | 13 | obj-$(CONFIG_K8_NUMA) += k8topology_64.o |
14 | obj-$(CONFIG_ACPI_NUMA) += srat_64.o | 14 | obj-$(CONFIG_ACPI_NUMA) += srat_64.o |
15 | obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o | ||
15 | endif | 16 | endif |
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c new file mode 100644 index 000000000000..5e7f6430c27e --- /dev/null +++ b/arch/x86/mm/dump_pagetables.c | |||
@@ -0,0 +1,301 @@ | |||
1 | /* | ||
2 | * Debug helper to dump the current kernel pagetables of the system | ||
3 | * so that we can see what the various memory ranges are set to. | ||
4 | * | ||
5 | * (C) Copyright 2008 Intel Corporation | ||
6 | * | ||
7 | * Author: Arjan van de Ven <arjan@linux.intel.com> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; version 2 | ||
12 | * of the License. | ||
13 | */ | ||
14 | |||
15 | #include <linux/module.h> | ||
16 | #include <linux/seq_file.h> | ||
17 | #include <linux/debugfs.h> | ||
18 | |||
19 | #include <asm/pgtable.h> | ||
20 | |||
21 | /* | ||
22 | * The dumper groups pagetable entries of the same type into one, and for | ||
23 | * that it needs to keep some state when walking, and flush this state | ||
24 | * when a "break" in the continuity is found. | ||
25 | */ | ||
26 | struct pg_state { | ||
27 | int level; | ||
28 | pgprot_t current_prot; | ||
29 | unsigned long start_address; | ||
30 | unsigned long current_address; | ||
31 | int printed_vmalloc; | ||
32 | int printed_modules; | ||
33 | int printed_vmemmap; | ||
34 | int printed_highmap; | ||
35 | }; | ||
36 | |||
/*
 * Amount of virtual address space covered by one entry at each level.
 * Level 4 is the PTE level (one page per entry); every level above it
 * spans 512 entries of the level below.
 */
#define LEVEL_4_MULT (PAGE_SIZE)
#define LEVEL_3_MULT (LEVEL_4_MULT * 512UL)
#define LEVEL_2_MULT (LEVEL_3_MULT * 512UL)
#define LEVEL_1_MULT (LEVEL_2_MULT * 512UL)
42 | |||
43 | |||
44 | /* | ||
45 | * Print a readable form of a pgprot_t to the seq_file | ||
46 | */ | ||
47 | static void printk_prot(struct seq_file *m, pgprot_t prot, int level) | ||
48 | { | ||
49 | unsigned long pr = pgprot_val(prot); | ||
50 | |||
51 | if (pr & _PAGE_USER) | ||
52 | seq_printf(m, "USR "); | ||
53 | else | ||
54 | seq_printf(m, " "); | ||
55 | if (pr & _PAGE_RW) | ||
56 | seq_printf(m, "RW "); | ||
57 | else | ||
58 | seq_printf(m, "ro "); | ||
59 | if (pr & _PAGE_PWT) | ||
60 | seq_printf(m, "PWT "); | ||
61 | else | ||
62 | seq_printf(m, " "); | ||
63 | if (pr & _PAGE_PCD) | ||
64 | seq_printf(m, "PCD "); | ||
65 | else | ||
66 | seq_printf(m, " "); | ||
67 | |||
68 | /* Bit 9 has a different meaning on level 3 vs 4 */ | ||
69 | if (level <= 3) { | ||
70 | if (pr & _PAGE_PSE) | ||
71 | seq_printf(m, "PSE "); | ||
72 | else | ||
73 | seq_printf(m, " "); | ||
74 | } else { | ||
75 | if (pr & _PAGE_PAT) | ||
76 | seq_printf(m, "pat "); | ||
77 | else | ||
78 | seq_printf(m, " "); | ||
79 | } | ||
80 | if (pr & _PAGE_GLOBAL) | ||
81 | seq_printf(m, "GLB "); | ||
82 | else | ||
83 | seq_printf(m, " "); | ||
84 | if (pr & _PAGE_NX) | ||
85 | seq_printf(m, "NX "); | ||
86 | else | ||
87 | seq_printf(m, "x "); | ||
88 | } | ||
89 | |||
/*
 * Turn a 48 bit virtual address into its 64 bit form: when any bit from
 * bit 47 upwards is set, the top 16 bits are filled with ones; otherwise
 * the value is returned untouched.
 */
static unsigned long sign_extend(unsigned long u)
{
	const unsigned long upper_bits = 0xffffUL << 48;

	return (u >> 47) ? (u | upper_bits) : u;
}
99 | |||
100 | /* | ||
101 | * This function gets called on a break in a continuous series | ||
102 | * of PTE entries; the next one is different so we need to | ||
103 | * print what we collected so far. | ||
104 | */ | ||
105 | static void note_page(struct seq_file *m, struct pg_state *st, | ||
106 | pgprot_t new_prot, int level) | ||
107 | { | ||
108 | unsigned long prot, cur; | ||
109 | |||
110 | /* | ||
111 | * If we have a "break" in the series, we need to flush the state that | ||
112 | * we have now. "break" is either changing perms or a different level. | ||
113 | */ | ||
114 | prot = pgprot_val(new_prot) & ~(PTE_MASK); | ||
115 | cur = pgprot_val(st->current_prot) & ~(PTE_MASK); | ||
116 | |||
117 | if ((prot != cur || level != st->level) && | ||
118 | st->current_address != st->start_address) { | ||
119 | char unit = 'K'; | ||
120 | unsigned long delta; | ||
121 | |||
122 | /* | ||
123 | * We print markers for special areas of address space, | ||
124 | * such as the start of vmalloc space etc. | ||
125 | * This helps in the interpretation. | ||
126 | */ | ||
127 | if (!st->printed_vmalloc && | ||
128 | st->start_address >= VMALLOC_START) { | ||
129 | seq_printf(m, "---[ VMALLOC SPACE ]---\n"); | ||
130 | st->printed_vmalloc = 1; | ||
131 | } | ||
132 | if (!st->printed_modules && | ||
133 | st->start_address >= MODULES_VADDR) { | ||
134 | seq_printf(m, "---[ MODULES SPACE ]---\n"); | ||
135 | st->printed_modules = 1; | ||
136 | } | ||
137 | if (st->printed_modules < 2 && | ||
138 | st->start_address >= MODULES_END) { | ||
139 | seq_printf(m, "---[ END MODULES SPACE ]---\n"); | ||
140 | st->printed_modules = 2; | ||
141 | } | ||
142 | if (!st->printed_vmemmap && | ||
143 | st->start_address >= VMEMMAP_START) { | ||
144 | seq_printf(m, "---[ VMMEMMAP SPACE ]---\n"); | ||
145 | st->printed_vmemmap = 1; | ||
146 | } | ||
147 | if (!st->printed_highmap && | ||
148 | st->start_address >= __START_KERNEL_map) { | ||
149 | seq_printf(m, "---[ HIGH KERNEL MAPPING ]---\n"); | ||
150 | st->printed_highmap = 1; | ||
151 | } | ||
152 | |||
153 | /* | ||
154 | * Now print the actual finished series | ||
155 | */ | ||
156 | seq_printf(m, "[ %016lx - %016lx ", | ||
157 | st->start_address, st->current_address); | ||
158 | |||
159 | delta = (st->current_address - st->start_address) >> 10; | ||
160 | if ((delta & 1023) == 0) { | ||
161 | delta = delta >> 10; | ||
162 | unit = 'M'; | ||
163 | } | ||
164 | if (pgprot_val(st->current_prot)) { | ||
165 | seq_printf(m, "Size %9lu%cb ", delta, unit); | ||
166 | printk_prot(m, st->current_prot, st->level); | ||
167 | seq_printf(m, "L%i]\n", st->level); | ||
168 | } else { | ||
169 | /* don't print protections on non-present memory */ | ||
170 | seq_printf(m, "%14lu%cb", delta, unit); | ||
171 | seq_printf(m, " L%i]\n", | ||
172 | st->level); | ||
173 | } | ||
174 | st->start_address = st->current_address; | ||
175 | st->current_prot = new_prot; | ||
176 | st->level = level; | ||
177 | }; | ||
178 | } | ||
179 | |||
180 | static void walk_level_4(struct seq_file *m, struct pg_state *st, pmd_t addr, | ||
181 | unsigned long P) | ||
182 | { | ||
183 | int i; | ||
184 | pte_t *start; | ||
185 | |||
186 | start = (pte_t *) pmd_page_vaddr(addr); | ||
187 | for (i = 0; i < PTRS_PER_PTE; i++) { | ||
188 | pgprot_t prot = pte_pgprot(*start); | ||
189 | |||
190 | st->current_address = sign_extend(P + i * LEVEL_4_MULT); | ||
191 | note_page(m, st, prot, 4); | ||
192 | start++; | ||
193 | } | ||
194 | } | ||
195 | |||
196 | |||
197 | static void walk_level_3(struct seq_file *m, struct pg_state *st, pud_t addr, | ||
198 | unsigned long P) | ||
199 | { | ||
200 | int i; | ||
201 | pmd_t *start; | ||
202 | |||
203 | start = (pmd_t *) pud_page_vaddr(addr); | ||
204 | for (i = 0; i < PTRS_PER_PMD; i++) { | ||
205 | st->current_address = sign_extend(P + i * LEVEL_3_MULT); | ||
206 | if (!pmd_none(*start)) { | ||
207 | unsigned long prot; | ||
208 | |||
209 | prot = pmd_val(*start) & ~(PTE_MASK); | ||
210 | /* Deal with 2Mb pages */ | ||
211 | if (pmd_large(*start)) | ||
212 | note_page(m, st, __pgprot(prot), 3); | ||
213 | else | ||
214 | walk_level_4(m, st, *start, | ||
215 | P + i * LEVEL_3_MULT); | ||
216 | } else | ||
217 | note_page(m, st, __pgprot(0), 3); | ||
218 | start++; | ||
219 | } | ||
220 | } | ||
221 | |||
222 | |||
223 | static void walk_level_2(struct seq_file *m, struct pg_state *st, pgd_t addr, | ||
224 | unsigned long P) | ||
225 | { | ||
226 | int i; | ||
227 | pud_t *start; | ||
228 | |||
229 | start = (pud_t *) pgd_page_vaddr(addr); | ||
230 | |||
231 | for (i = 0; i < PTRS_PER_PUD; i++) { | ||
232 | if (!pud_none(*start)) { | ||
233 | unsigned long prot; | ||
234 | |||
235 | prot = pud_val(*start) & ~(PTE_MASK); | ||
236 | /* Deal with 1Gb pages */ | ||
237 | if (pud_large(*start)) | ||
238 | note_page(m, st, __pgprot(prot), 2); | ||
239 | else | ||
240 | walk_level_3(m, st, *start, | ||
241 | P + i * LEVEL_2_MULT); | ||
242 | } else | ||
243 | note_page(m, st, __pgprot(0), 2); | ||
244 | |||
245 | start++; | ||
246 | } | ||
247 | } | ||
248 | |||
249 | static void walk_level_1(struct seq_file *m) | ||
250 | { | ||
251 | pgd_t *start = (pgd_t *) &init_level4_pgt; | ||
252 | int i; | ||
253 | struct pg_state st; | ||
254 | |||
255 | memset(&st, 0, sizeof(st)); | ||
256 | st.level = 1; | ||
257 | |||
258 | for (i = 0; i < PTRS_PER_PGD; i++) { | ||
259 | if (!pgd_none(*start)) | ||
260 | walk_level_2(m, &st, *start, i * LEVEL_1_MULT); | ||
261 | else | ||
262 | note_page(m, &st, __pgprot(0), 1); | ||
263 | start++; | ||
264 | } | ||
265 | } | ||
266 | |||
/*
 * seq_file show callback: write a heading followed by the complete
 * page table dump.  @v is unused.  Always returns 0.
 */
static int ptdump_show(struct seq_file *m, void *v)
{
	seq_puts(m, "Kernel pagetable dump\n");

	/* the whole dump is produced in a single pass from the top level */
	walk_level_1(m);

	return 0;
}
273 | |||
274 | static int ptdump_open(struct inode *inode, struct file *filp) | ||
275 | { | ||
276 | return single_open(filp, ptdump_show, NULL); | ||
277 | } | ||
278 | |||
279 | static const struct file_operations ptdump_fops = { | ||
280 | .open = ptdump_open, | ||
281 | .read = seq_read, | ||
282 | .llseek = seq_lseek, | ||
283 | .release = single_release, | ||
284 | }; | ||
285 | |||
286 | int pt_dump_init(void) | ||
287 | { | ||
288 | struct dentry *pe; | ||
289 | |||
290 | pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, | ||
291 | &ptdump_fops); | ||
292 | if (!pe) | ||
293 | return -ENOMEM; | ||
294 | |||
295 | return 0; | ||
296 | } | ||
297 | |||
298 | __initcall(pt_dump_init); | ||
299 | MODULE_LICENSE("GPL"); | ||
300 | MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); | ||
301 | MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); | ||