author	Jeremy Fitzhardinge <jeremy@goop.org>	2007-05-02 13:27:13 -0400
committer	Andi Kleen <andi@basil.nowhere.org>	2007-05-02 13:27:13 -0400
commit	b239fb2501117bf3aeb4dd6926edd855be92333d (patch)
tree	62ac25204632ef0b14e3bd84580c722e69800cf7
parent	3dc494e86d1c93afd4c66385f270899dbfae483d (diff)
[PATCH] i386: PARAVIRT: Hooks to set up initial pagetable
This patch introduces paravirt_ops hooks to control how the kernel's initial pagetable is set up.

In the case of a native boot, the very early bootstrap code creates a simple non-PAE pagetable to map the kernel and physical memory. When the VM subsystem is initialized, it creates a proper pagetable which respects the PAE mode, large pages, etc.

When booting under a hypervisor, there are many possibilities for what paging environment the hypervisor establishes for the guest kernel, so the construction of the kernel's pagetable depends on the hypervisor.

In the case of Xen, the hypervisor boots the kernel with a fully constructed pagetable, which is already using PAE if necessary. Also, Xen requires particular care when constructing pagetables to make sure all pagetables are always mapped read-only.

In order to make this easier, the kernel's initial pagetable construction has been changed to only allocate and initialize a pagetable page if there's no page already present in the pagetable. This allows the Xen paravirt backend to make a copy of the hypervisor-provided pagetable, allowing the kernel to establish any more mappings it needs while keeping the existing ones.

A slightly subtle point which is worth highlighting here is that Xen requires all kernel mappings to share the same pte_t pages between all pagetables, so that updating a kernel page's mapping in one pagetable is reflected in all other pagetables. This makes it possible to allocate a page and attach it to a pagetable without having to explicitly enumerate that page's mapping in all pagetables.

And:

From: "Eric W. Biederman" <ebiederm@xmission.com>

If we don't set the leaf page table entries, it is quite possible that we will inherit an incorrect page table entry from the initial boot page table setup in head.S. So we need to redo the effort here, so that we pick up PSE, PGE and the like.

Hypervisors like Xen require that their page tables be read-only, which is slightly incompatible with our low identity mappings; however, I discussed this with Jeremy and he has modified the Xen early set_pte function to avoid problems in this area.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: William Irwin <bill.irwin@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
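As a rough illustration of how a backend is expected to consume the new hooks (this sketch is not part of the patch, and every demo_-prefixed name is hypothetical), a paravirtualized port would point the two callbacks at its own routines during early setup, while native boots keep the native_pagetable_setup_start/_done defaults installed in paravirt.c below:

	/* Hypothetical backend code -- illustrative only, not from this patch. */
	static void __init demo_pagetable_setup_start(pgd_t *base)
	{
		/* The hypervisor handed us a live (possibly PAE) pagetable;
		 * a backend would adopt or copy it into 'base' here instead
		 * of letting the kernel rebuild the bootstrap mappings. */
	}

	static void __init demo_pagetable_setup_done(pgd_t *base)
	{
		/* Runs after pagetable_init() has populated the kernel
		 * mappings; a backend could pin the tables read-only or
		 * switch to the new pagetable here. */
	}

	static void __init demo_install_pagetable_hooks(void)
	{
		paravirt_ops.pagetable_setup_start = demo_pagetable_setup_start;
		paravirt_ops.pagetable_setup_done  = demo_pagetable_setup_done;
	}

With CONFIG_PARAVIRT disabled, the inline wrappers added to pgtable.h simply call native_pagetable_setup_start()/native_pagetable_setup_done() directly, so non-paravirt kernels behave as before.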
-rw-r--r--	arch/i386/kernel/paravirt.c	3
-rw-r--r--	arch/i386/mm/init.c	138
-rw-r--r--	include/asm-i386/paravirt.h	17
-rw-r--r--	include/asm-i386/pgtable.h	16
4 files changed, 126 insertions, 48 deletions
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index cba7a15ce1b0..47d075bdfb95 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -193,6 +193,9 @@ struct paravirt_ops paravirt_ops = {
 #endif
 	.set_lazy_mode = paravirt_nop,
 
+	.pagetable_setup_start = native_pagetable_setup_start,
+	.pagetable_setup_done = native_pagetable_setup_done,
+
 	.flush_tlb_user = native_flush_tlb,
 	.flush_tlb_kernel = native_flush_tlb_global,
 	.flush_tlb_single = native_flush_tlb_single,
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index bd5ef3718504..e8545dcf06c5 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -43,6 +43,7 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/paravirt.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
@@ -62,17 +63,18 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 	pmd_t *pmd_table;
 
 #ifdef CONFIG_X86_PAE
-	pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-	paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
-	set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
-	pud = pud_offset(pgd, 0);
-	if (pmd_table != pmd_offset(pud, 0))
-		BUG();
-#else
+	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
+		pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
+		paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
+		set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+		pud = pud_offset(pgd, 0);
+		if (pmd_table != pmd_offset(pud, 0))
+			BUG();
+	}
+#endif
 	pud = pud_offset(pgd, 0);
 	pmd_table = pmd_offset(pud, 0);
-#endif
-
 	return pmd_table;
 }
 
@@ -82,14 +84,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
  */
 static pte_t * __init one_page_table_init(pmd_t *pmd)
 {
-	if (pmd_none(*pmd)) {
+	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
 		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
 		paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
-		if (page_table != pte_offset_kernel(pmd, 0))
-			BUG();
-
-		return page_table;
+		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
 	}
 
 	return pte_offset_kernel(pmd, 0);
@@ -109,7 +109,6 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
 {
 	pgd_t *pgd;
-	pud_t *pud;
 	pmd_t *pmd;
 	int pgd_idx, pmd_idx;
 	unsigned long vaddr;
@@ -120,13 +119,10 @@ static void __init page_table_range_init (unsigned long start, unsigned long end
 	pgd = pgd_base + pgd_idx;
 
 	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
-		if (pgd_none(*pgd))
-			one_md_table_init(pgd);
-		pud = pud_offset(pgd, vaddr);
-		pmd = pmd_offset(pud, vaddr);
+		pmd = one_md_table_init(pgd);
+		pmd = pmd + pmd_index(vaddr);
 		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
-			if (pmd_none(*pmd))
-				one_page_table_init(pmd);
+			one_page_table_init(pmd);
 
 			vaddr += PMD_SIZE;
 		}
@@ -168,20 +164,22 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 		/* Map with big pages if possible, otherwise create normal page tables. */
 		if (cpu_has_pse) {
 			unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
-
 			if (is_kernel_text(address) || is_kernel_text(address2))
 				set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
 			else
 				set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
+
 			pfn += PTRS_PER_PTE;
 		} else {
 			pte = one_page_table_init(pmd);
 
-			for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
-				if (is_kernel_text(address))
-					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
-				else
-					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
+			for (pte_ofs = 0;
+			     pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+			     pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
+				if (is_kernel_text(address))
+					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
+				else
+					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
 			}
 		}
 	}
@@ -338,24 +336,78 @@ extern void __init remap_numa_kva(void);
 #define remap_numa_kva() do {} while (0)
 #endif
 
-static void __init pagetable_init (void)
+void __init native_pagetable_setup_start(pgd_t *base)
 {
-	unsigned long vaddr;
-	pgd_t *pgd_base = swapper_pg_dir;
-
 #ifdef CONFIG_X86_PAE
 	int i;
-	/* Init entries of the first-level page table to the zero page */
-	for (i = 0; i < PTRS_PER_PGD; i++)
-		set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+
+	/*
+	 * Init entries of the first-level page table to the
+	 * zero page, if they haven't already been set up.
+	 *
+	 * In a normal native boot, we'll be running on a
+	 * pagetable rooted in swapper_pg_dir, but not in PAE
+	 * mode, so this will end up clobbering the mappings
+	 * for the lower 24Mbytes of the address space,
+	 * without affecting the kernel address space.
+	 */
+	for (i = 0; i < USER_PTRS_PER_PGD; i++)
+		set_pgd(&base[i],
+			__pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+
+	/* Make sure kernel address space is empty so that a pagetable
+	   will be allocated for it. */
+	memset(&base[USER_PTRS_PER_PGD], 0,
+	       KERNEL_PGD_PTRS * sizeof(pgd_t));
 #else
 	paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
 #endif
+}
+
+void __init native_pagetable_setup_done(pgd_t *base)
+{
+#ifdef CONFIG_X86_PAE
+	/*
+	 * Add low memory identity-mappings - SMP needs it when
+	 * starting up on an AP from real-mode. In the non-PAE
+	 * case we already have these mappings through head.S.
+	 * All user-space mappings are explicitly cleared after
+	 * SMP startup.
+	 */
+	set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
+#endif
+}
+
+/*
+ * Build a proper pagetable for the kernel mappings. Up until this
+ * point, we've been running on some set of pagetables constructed by
+ * the boot process.
+ *
+ * If we're booting on native hardware, this will be a pagetable
+ * constructed in arch/i386/kernel/head.S, and not running in PAE mode
+ * (even if we'll end up running in PAE). The root of the pagetable
+ * will be swapper_pg_dir.
+ *
+ * If we're booting paravirtualized under a hypervisor, then there are
+ * more options: we may already be running PAE, and the pagetable may
+ * or may not be based in swapper_pg_dir. In any case,
+ * paravirt_pagetable_setup_start() will set up swapper_pg_dir
+ * appropriately for the rest of the initialization to work.
+ *
+ * In general, pagetable_init() assumes that the pagetable may already
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+static void __init pagetable_init (void)
+{
+	unsigned long vaddr, end;
+	pgd_t *pgd_base = swapper_pg_dir;
+
+	paravirt_pagetable_setup_start(pgd_base);
 
 	/* Enable PSE if available */
-	if (cpu_has_pse) {
+	if (cpu_has_pse)
 		set_in_cr4(X86_CR4_PSE);
-	}
 
 	/* Enable PGE if available */
 	if (cpu_has_pge) {
@@ -372,20 +424,12 @@ static void __init pagetable_init (void)
  * created - mappings will be set by set_fixmap():
  */
 	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
-	page_table_range_init(vaddr, 0, pgd_base);
+	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+	page_table_range_init(vaddr, end, pgd_base);
 
 	permanent_kmaps_init(pgd_base);
 
-#ifdef CONFIG_X86_PAE
-	/*
-	 * Add low memory identity-mappings - SMP needs it when
-	 * starting up on an AP from real-mode. In the non-PAE
-	 * case we already have these mappings through head.S.
-	 * All user-space mappings are explicitly cleared after
-	 * SMP startup.
-	 */
-	set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]);
-#endif
+	paravirt_pagetable_setup_done(pgd_base);
 }
 
 #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 0aacb13bb929..c49b44cdd8ee 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -2,10 +2,11 @@
 #define __ASM_PARAVIRT_H
 /* Various instructions on x86 need to be replaced for
  * para-virtualization: those hooks are defined here. */
+
+#ifdef CONFIG_PARAVIRT
 #include <linux/stringify.h>
 #include <asm/page.h>
 
-#ifdef CONFIG_PARAVIRT
 /* These are the most performance critical ops, so we want to be able to patch
  * callers */
 #define PARAVIRT_IRQ_DISABLE 0
@@ -50,6 +51,9 @@ struct paravirt_ops
 	char *(*memory_setup)(void);
 	void (*init_IRQ)(void);
 
+	void (*pagetable_setup_start)(pgd_t *pgd_base);
+	void (*pagetable_setup_done)(pgd_t *pgd_base);
+
 	void (*banner)(void);
 
 	unsigned long (*get_wallclock)(void);
@@ -370,6 +374,17 @@ static inline void setup_secondary_clock(void)
 }
 #endif
 
+static inline void paravirt_pagetable_setup_start(pgd_t *base)
+{
+	if (paravirt_ops.pagetable_setup_start)
+		(*paravirt_ops.pagetable_setup_start)(base);
+}
+
+static inline void paravirt_pagetable_setup_done(pgd_t *base)
+{
+	if (paravirt_ops.pagetable_setup_done)
+		(*paravirt_ops.pagetable_setup_done)(base);
+}
 
 #ifdef CONFIG_SMP
 static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 147f2553784d..0790ad6ed440 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -514,6 +514,22 @@ do { \
  * tables contain all the necessary information.
  */
 #define update_mmu_cache(vma,address,pte) do { } while (0)
+
+void native_pagetable_setup_start(pgd_t *base);
+void native_pagetable_setup_done(pgd_t *base);
+
+#ifndef CONFIG_PARAVIRT
+static inline void paravirt_pagetable_setup_start(pgd_t *base)
+{
+	native_pagetable_setup_start(base);
+}
+
+static inline void paravirt_pagetable_setup_done(pgd_t *base)
+{
+	native_pagetable_setup_done(base);
+}
+#endif /* !CONFIG_PARAVIRT */
+
 #endif /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_FLATMEM