From 28609f6e4921ebb75ceaf5317454b8047b07cef4 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH] i386: adjustments to page table dump during oops (v4) - make the page table contents printing PAE capable - make sure the address stored in current->thread.cr2 is unmodified from what was read from CR2 - don't call oops_may_print() multiple times when one time suffices - print pte even in highpte case, as long as the pte page isn't actually in high memory (which is specifically the case for all page tables covering kernel space) (Changes to v3: Use sizeof()*2 rather than the suggested sizeof()*4 for printing width, use fixed 16-nibble width for PAE, and also apply the max_low_pfn range check to the middle level lookup on PAE.) Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/mm/fault.c | 55 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 20 deletions(-) (limited to 'arch/i386/mm') diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index b8c4e259fc8b..c6a0a06258e6 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -20,6 +20,7 @@ #include #include <linux/vt_kern.h> /* For unblank_screen() */ #include +#include <linux/bootmem.h> /* for max_low_pfn */ #include #include #include @@ -301,7 +302,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, struct mm_struct *mm; struct vm_area_struct * vma; unsigned long address; - unsigned long page; int write, si_code; /* get the address */ @@ -510,7 +510,9 @@ no_context: bust_spinlocks(1); if (oops_may_print()) { - #ifdef CONFIG_X86_PAE + __typeof__(pte_val(__pte(0))) page; + +#ifdef CONFIG_X86_PAE if (error_code & 16) { pte_t *pte = lookup_address(address); @@ -519,7 +521,7 @@ no_context: "NX-protected page - exploit attempt? " "(uid: %d)\n", current->uid); } - #endif +#endif if (address < PAGE_SIZE) printk(KERN_ALERT "BUG: unable to handle kernel NULL " "pointer dereference"); @@ -529,25 +531,38 @@ no_context: printk(" at virtual address %08lx\n",address); printk(KERN_ALERT " printing eip:\n"); printk("%08lx\n", regs->eip); - } - page = read_cr3(); - page = ((unsigned long *) __va(page))[address >> 22]; - if (oops_may_print()) + + page = read_cr3(); + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +#ifdef CONFIG_X86_PAE + printk(KERN_ALERT "*pdpt = %016Lx\n", page); + if ((page >> PAGE_SHIFT) < max_low_pfn + && page & _PAGE_PRESENT) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) + & (PTRS_PER_PMD - 1)]; + printk(KERN_ALERT "*pde = %016Lx\n", page); + page &= ~_PAGE_NX; + } +#else printk(KERN_ALERT "*pde = %08lx\n", page); - /* - * We must not directly access the pte in the highpte - * case, the page table might be allocated in highmem. - * And lets rather not kmap-atomic the pte, just in case - * it's allocated already. - */ -#ifndef CONFIG_HIGHPTE - if ((page & 1) && oops_may_print()) { - page &= PAGE_MASK; - address &= 0x003ff000; - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; - printk(KERN_ALERT "*pte = %08lx\n", page); - } #endif + + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case + * it's allocated already. + */ + if ((page >> PAGE_SHIFT) < max_low_pfn + && (page & _PAGE_PRESENT)) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) + & (PTRS_PER_PTE - 1)]; + printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page); + } + } + tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; -- cgit v1.2.2
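To make the walk in the patch above concrete: the dump indexes the page directory with the top bits of the faulting address and the page table with the next ones. A minimal user-space sketch of that decomposition for the non-PAE case (illustration only, not kernel code; the constants mirror the kernel's values but are redefined locally, and the fault address is made up):

/*
 * Illustration only, not kernel code: how a 32-bit virtual address
 * splits into the indices used by the dump above in the non-PAE case
 * (10-bit pgd index, 10-bit pte index, 12-bit page offset).
 */
#include <stdio.h>

#define PAGE_SHIFT   12
#define PGDIR_SHIFT  22              /* non-PAE: each pgd entry maps 4MB */
#define PTRS_PER_PTE 1024

int main(void)
{
	unsigned long address = 0xc0101234UL;  /* hypothetical fault address */
	unsigned long pgd_idx = address >> PGDIR_SHIFT;
	unsigned long pte_idx = (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	unsigned long offset  = address & ((1UL << PAGE_SHIFT) - 1);

	printf("pgd index %lu, pte index %lu, offset 0x%03lx\n",
	       pgd_idx, pte_idx, offset);
	return 0;
}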
From 0dbf7028c0c1f266c9631139450a1502d3cd457e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86: __pa and __pa_symbol address space separation Currently __pa_symbol is for use with symbols in the kernel address map and __pa is for use with pointers into the physical memory map. But the code is implemented so you can usually interchange the two. __pa which is much more common can be implemented much more cheaply if it doesn't have to worry about any other kernel address spaces. This is especially true with a relocatable kernel as __pa_symbol needs to perform an extra variable read to resolve the address. There is a third macro that is added for the vsyscall data __pa_vsymbol for finding the physical addresses of vsyscall pages. Most of this patch is simply sorting through the references to __pa or __pa_symbol and using the proper one. A little of it is continuing to use a physical address when we have it instead of recalculating it several times. swapper_pgd is now NULL. leave_mm now uses init_mm.pgd and init_mm.pgd is initialized at boot (instead of compile time) to the physmem virtual mapping of init_level4_pgt. The physical address changed. Except for the EMPTY_ZERO page all of the remaining references to __pa_symbol appear to be during kernel initialization. So this should reduce the cost of __pa in the common case, even on a relocated kernel. As this is technically a semantic change we need to be on the lookout for anything I missed. But it works for me (tm). Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen
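A rough sketch of the distinction the message describes, with simplified example constants rather than the kernel's actual definitions: __pa assumes a pointer into the direct physical-memory mapping and is a single subtraction, while __pa_symbol takes an address in the kernel image mapping and, on a relocatable kernel, additionally needs the load offset read from a variable (the "extra variable read" mentioned above). The my_* names are hypothetical:

/*
 * Sketch only; base addresses are examples, not the kernel's macros.
 */
#define DIRECT_MAP_BASE  0xffff810000000000UL  /* example: start of physmem map */
#define KERNEL_MAP_BASE  0xffffffff80000000UL  /* example: start of kernel image map */

extern unsigned long phys_base;                /* runtime relocation offset */

#define my_pa(x)        ((unsigned long)(x) - DIRECT_MAP_BASE)
#define my_pa_symbol(x) ((unsigned long)(x) - KERNEL_MAP_BASE + phys_base)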
--- arch/i386/mm/init.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/i386/mm') diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index ae436882af7a..23be1b0aafa4 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -774,10 +774,11 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + ClearPageReserved(page); + init_page_count(page); + memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + __free_page(page); totalram_pages++; } printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); @@ -786,14 +787,14 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) void free_initmem(void) { free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + __pa_symbol(&__init_begin), + __pa_symbol(&__init_end)); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + free_init_pages("initrd memory", __pa(start), __pa(end)); } #endif -- cgit v1.2.2 From d01ad8dd56527be72947b4b9997bb2c05783c3ed Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH] x86: Improve handling of kernel mappings in change_page_attr Fix various broken corner cases in i386 and x86-64 change_page_attr. AK: split off from tighten kernel image access rights Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/mm/pageattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/i386/mm') diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 412ebbd8adb0..ea6b6d4a0a2a 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -142,7 +142,7 @@ __change_page_attr(struct page *page, pgprot_t prot) return -EINVAL; kpte_page = virt_to_page(kpte); if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { - if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, prot)); } else { pgprot_t ref_prot; @@ -158,7 +158,7 @@ __change_page_attr(struct page *page, pgprot_t prot) kpte_page = split; } page_private(kpte_page)++; - } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + } else if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); BUG_ON(page_private(kpte_page) == 0); page_private(kpte_page)--; -- cgit v1.2.2 From 6fb14755a676282a4e6caa05a08c92db8e45cfff Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH] x86: tighten kernel image page access rights On x86-64, kernel memory freed after init can be entirely unmapped instead of just getting 'poisoned' by overwriting with a debug pattern. On i386 and x86-64 (under CONFIG_DEBUG_RODATA), kernel text and bug table can also be write-protected. Compared to the first version, this one prevents re-creating deleted mappings in the kernel image range on x86-64, if those got removed previously. This, together with the original changes, prevents temporarily having inconsistent mappings when cacheability attributes are being changed on such pages (e.g. from AGP code).
While on i386 such duplicate mappings don't exist, the same change is done there, too, both for consistency and because checking pte_present() before using various other pte_XXX functions is a requirement anyway. At the same time, i386 code gets adjusted to use pte_huge() instead of open coding this. AK: split out cpa() changes Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/mm/init.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'arch/i386/mm') diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 23be1b0aafa4..bd5ef3718504 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -751,13 +752,25 @@ static int noinline do_test_wp_bit(void) void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__start_rodata; + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) - change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() <= 1) +#endif + { + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RX); + printk("Write protecting the kernel text: %luk\n", size >> 10); + } - printk("Write protecting the kernel read-only data: %uk\n", - (__end_rodata - __start_rodata) >> 10); + start += size; + size = (unsigned long)__end_rodata - start; + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RO); + printk("Write protecting the kernel read-only data: %luk\n", + size >> 10); /* * change_page_attr() requires a global_flush_tlb() call after it. @@ -781,7 +794,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) __free_page(page); totalram_pages++; } - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); } void free_initmem(void) -- cgit v1.2.2 From d4f7a2c18e59e0304a1c733589ce14fc02fec1bd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH] i386: Relocate VDSO ELF headers to match mapped location with COMPAT_VDSO Some versions of libc can't deal with a VDSO which doesn't have its ELF headers matching its mapped address. COMPAT_VDSO maps the VDSO at a specific system-wide fixed address. Previously this was all done at build time, on the grounds that the fixed VDSO address is always at the top of the address space. However, a hypervisor may reserve some of that address space, pushing the fixmap address down. This patch does the adjustment dynamically at runtime, depending on the runtime location of the VDSO fixmap. [ Patch has been through several hands: Jan Beulich wrote the original version; Zach reworked it, and Jeremy converted it to relocate phdrs as well as sections. ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Zachary Amsden Cc: "Jan Beulich" Cc: Eric W. Biederman Cc: Andi Kleen Cc: Ingo Molnar Cc: Roland McGrath
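The idea of that runtime adjustment, as a hedged sketch (hypothetical helper, not the patch's actual code): walk the VDSO's ELF program and section headers and shift every virtual address by the delta between the link-time address and the runtime fixmap location:

/*
 * Sketch of the relocation step; relocate_vdso_image() is a made-up
 * name, and error handling/alignment concerns are omitted.
 */
#include <elf.h>

static void relocate_vdso_image(Elf32_Ehdr *ehdr, unsigned long delta)
{
	Elf32_Phdr *phdr = (Elf32_Phdr *)((char *)ehdr + ehdr->e_phoff);
	Elf32_Shdr *shdr = (Elf32_Shdr *)((char *)ehdr + ehdr->e_shoff);
	int i;

	for (i = 0; i < ehdr->e_phnum; i++)
		phdr[i].p_vaddr += delta;          /* segment load addresses */

	for (i = 0; i < ehdr->e_shnum; i++)
		if (shdr[i].sh_flags & SHF_ALLOC)  /* only sections mapped at runtime */
			shdr[i].sh_addr += delta;

	if (ehdr->e_entry)
		ehdr->e_entry += delta;            /* entry point, if set */
}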
--- arch/i386/mm/pgtable.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/i386/mm') diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index fa0cfbd551e1..99c09edc3dbb 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -144,10 +144,8 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) } static int fixmaps; -#ifndef CONFIG_COMPAT_VDSO unsigned long __FIXADDR_TOP = 0xfffff000; EXPORT_SYMBOL(__FIXADDR_TOP); -#endif void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) { @@ -173,12 +171,8 @@ void reserve_top_address(unsigned long reserve) BUG_ON(fixmaps > 0); printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", (int)-reserve); -#ifdef CONFIG_COMPAT_VDSO - BUG_ON(reserve != 0); -#else __FIXADDR_TOP = -reserve - PAGE_SIZE; __VMALLOC_RESERVE += reserve; -#endif } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -- cgit v1.2.2 From b239fb2501117bf3aeb4dd6926edd855be92333d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH] i386: PARAVIRT: Hooks to set up initial pagetable This patch introduces paravirt_ops hooks to control how the kernel's initial pagetable is set up. In the case of a native boot, the very early bootstrap code creates a simple non-PAE pagetable to map the kernel and physical memory. When the VM subsystem is initialized, it creates a proper pagetable which respects the PAE mode, large pages, etc. When booting under a hypervisor, there are many possibilities for what paging environment the hypervisor establishes for the guest kernel, so the construction of the kernel's pagetable depends on the hypervisor. In the case of Xen, the hypervisor boots the kernel with a fully constructed pagetable, which is already using PAE if necessary. Also, Xen requires particular care when constructing pagetables to make sure all pagetables are always mapped read-only. In order to make this easier, the kernel's initial pagetable construction has been changed to only allocate and initialize a pagetable page if there's no page already present in the pagetable. This allows the Xen paravirt backend to make a copy of the hypervisor-provided pagetable, allowing the kernel to establish any more mappings it needs while keeping the existing ones. A slightly subtle point which is worth highlighting here is that Xen requires all kernel mappings to share the same pte_t pages between all pagetables, so that updating a kernel page's mapping in one pagetable is reflected in all other pagetables. This makes it possible to allocate a page and attach it to a pagetable without having to explicitly enumerate that page's mapping in all pagetables. And: +From: "Eric W. Biederman" If we don't set the leaf page table entries it is quite possible that we will inherit an incorrect page table entry from the initial boot page table setup in head.S. So we need to redo the effort here, so we pick up PSE, PGE and the like. Hypervisors like Xen require that their page tables be read-only, which is slightly incompatible with our low identity mappings, however I discussed this with Jeremy and he has modified the Xen early set_pte function to avoid problems in this area. Signed-off-by: Eric W. Biederman Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Acked-by: William Irwin Cc: Ingo Molnar
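The hook pattern itself, sketched with simplified structure and field names (not the actual paravirt_ops layout): the native implementations are the defaults, and a hypervisor backend overrides the pointers before pagetable_init() runs:

/*
 * Minimal sketch of the hook dispatch; names are illustrative only.
 */
struct pv_pagetable_hooks {
	void (*setup_start)(void *pgd_base);   /* prepare/convert boot pagetable */
	void (*setup_done)(void *pgd_base);    /* post-construction fixups */
};

static void native_setup_start(void *pgd_base) { (void)pgd_base; /* native: init entries */ }
static void native_setup_done(void *pgd_base)  { (void)pgd_base; /* native: low identity maps */ }

static struct pv_pagetable_hooks pv_hooks = {
	.setup_start = native_setup_start,
	.setup_done  = native_setup_done,
};

/* A Xen-like backend would instead install its own functions, e.g.
 * pv_hooks.setup_start = xen_setup_start; before the kernel builds
 * its pagetable. */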
--- arch/i386/mm/init.c | 138 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 91 insertions(+), 47 deletions(-) (limited to 'arch/i386/mm') diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index bd5ef3718504..e8545dcf06c5 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -43,6 +43,7 @@ #include #include #include +#include unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -62,17 +63,18 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_t *pmd_table; #ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); -#else + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + + paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); + if (pmd_table != pmd_offset(pud, 0)) + BUG(); + } +#endif pud = pud_offset(pgd, 0); pmd_table = pmd_offset(pud, 0); -#endif - return pmd_table; } @@ -82,14 +84,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) */ static pte_t * __init one_page_table_init(pmd_t *pmd) { - if (pmd_none(*pmd)) { + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); - if (page_table != pte_offset_kernel(pmd, 0)) - BUG(); - - return page_table; + BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } return pte_offset_kernel(pmd, 0); @@ -109,7 +109,6 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { pgd_t *pgd; - pud_t *pud; pmd_t *pmd; int pgd_idx, pmd_idx; unsigned long vaddr; @@ -120,13 +119,10 @@ static void __init page_table_range_init (unsigned long start, unsigned long end pgd = pgd_base + pgd_idx; for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { - if (pgd_none(*pgd)) - one_md_table_init(pgd); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); + pmd = one_md_table_init(pgd); + pmd = pmd + pmd_index(vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (pmd_none(*pmd)) - one_page_table_init(pmd); + one_page_table_init(pmd); vaddr += PMD_SIZE; } @@ -168,20 +164,22 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) /* Map with big pages if possible, otherwise create normal page tables.
*/ if (cpu_has_pse) { unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - if (is_kernel_text(address) || is_kernel_text(address2)) set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); else set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + pfn += PTRS_PER_PTE; } else { pte = one_page_table_init(pmd); - for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { - if (is_kernel_text(address)) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - else - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + for (pte_ofs = 0; + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; + pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { + if (is_kernel_text(address)) + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + else + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); } } } @@ -338,24 +336,78 @@ extern void __init remap_numa_kva(void); #define remap_numa_kva() do {} while (0) #endif -static void __init pagetable_init (void) +void __init native_pagetable_setup_start(pgd_t *base) { - unsigned long vaddr; - pgd_t *pgd_base = swapper_pg_dir; - #ifdef CONFIG_X86_PAE int i; - /* Init entries of the first-level page table to the zero page */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); + + /* + * Init entries of the first-level page table to the + * zero page, if they haven't already been set up. + * + * In a normal native boot, we'll be running on a + * pagetable rooted in swapper_pg_dir, but not in PAE + * mode, so this will end up clobbering the mappings + * for the lower 24Mbytes of the address space, + * without affecting the kernel address space. + */ + for (i = 0; i < USER_PTRS_PER_PGD; i++) + set_pgd(&base[i], + __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); + + /* Make sure kernel address space is empty so that a pagetable + will be allocated for it. */ + memset(&base[USER_PTRS_PER_PGD], 0, + KERNEL_PGD_PTRS * sizeof(pgd_t)); #else paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); #endif +} + +void __init native_pagetable_setup_done(pgd_t *base) +{ +#ifdef CONFIG_X86_PAE + /* + * Add low memory identity-mappings - SMP needs it when + * starting up on an AP from real-mode. In the non-PAE + * case we already have these mappings through head.S. + * All user-space mappings are explicitly cleared after + * SMP startup. + */ + set_pgd(&base[0], base[USER_PTRS_PER_PGD]); +#endif +} + +/* + * Build a proper pagetable for the kernel mappings. Up until this + * point, we've been running on some set of pagetables constructed by + * the boot process. + * + * If we're booting on native hardware, this will be a pagetable + * constructed in arch/i386/kernel/head.S, and not running in PAE mode + * (even if we'll end up running in PAE). The root of the pagetable + * will be swapper_pg_dir. + * + * If we're booting paravirtualized under a hypervisor, then there are + * more options: we may already be running PAE, and the pagetable may + * or may not be based in swapper_pg_dir. In any case, + * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * appropriately for the rest of the initialization to work. + * + * In general, pagetable_init() assumes that the pagetable may already + * be partially populated, and so it avoids stomping on any existing + * mappings. 
+ */ +static void __init pagetable_init (void) +{ + unsigned long vaddr, end; + pgd_t *pgd_base = swapper_pg_dir; + + paravirt_pagetable_setup_start(pgd_base); /* Enable PSE if available */ - if (cpu_has_pse) { + if (cpu_has_pse) set_in_cr4(X86_CR4_PSE); - } /* Enable PGE if available */ if (cpu_has_pge) { @@ -372,20 +424,12 @@ static void __init pagetable_init (void) * created - mappings will be set by set_fixmap(): */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, 0, pgd_base); + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; + page_table_range_init(vaddr, end, pgd_base); permanent_kmaps_init(pgd_base); -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]); -#endif + paravirt_pagetable_setup_done(pgd_base); } #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) -- cgit v1.2.2 From 5311ab62cdc7788784971ed816ce85e926f3e994 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing Normally when running in PAE mode, the 4th PMD maps the kernel address space, which can be shared among all processes (since they all need the same kernel mappings). Xen, however, does not allow guests to have the kernel pmd shared between page tables, so parameterize pgtable.c to allow both modes of operation. There are several side-effects of this. One is that vmalloc will update the kernel address space mappings, and those updates need to be propagated into all processes if the kernel mappings are not intrinsically shared. In the non-PAE case, this is done by maintaining a pgd_list of all processes; this list is used when all process pagetables must be updated. pgd_list is threaded via otherwise unused entries in the page structure for the pgd, which means that the pgd must be page-sized for this to work. Normally the PAE pgd is only 4 entries of 64 bits each (32 bytes) large, but Xen requires the PAE pgd to be page aligned anyway, so this patch forces the pgd to be page aligned+sized when the kernel pmd is unshared, to accommodate both these requirements. Also, since there may be several distinct kernel pmds (if the user/kernel split is below 3G), there's no point in allocating them from a slab cache; they're just allocated with get_free_page and initialized appropriately. (Of course they could be cached if there is just a single kernel pmd - which is the default with a 3G user/kernel split - but it doesn't seem worthwhile to add yet another case into this code). [ Many thanks to wli for review comments. ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: William Lee Irwin III Signed-off-by: Andi Kleen Cc: Zachary Amsden Cc: Christoph Lameter Acked-by: Ingo Molnar Signed-off-by: Andrew Morton
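The allocation arithmetic this implies, sketched with names that mirror the patch but are redefined locally (values assume PAE with the default 3G/1G user/kernel split):

/*
 * Sketch of the UNSHARED_PTRS_PER_PGD decision from the patch below.
 */
#define PTRS_PER_PGD      4  /* PAE: four pgd entries, 1GB each */
#define USER_PTRS_PER_PGD 3  /* entries 0..2 cover the 3GB of user space */

/* How many pgd slots need a private pmd page per process: with a shared
   kernel pmd only the user entries do; unshared (Xen), all of them do. */
static int unshared_ptrs_per_pgd(int shared_kernel_pmd)
{
	return shared_kernel_pmd ? USER_PTRS_PER_PGD : PTRS_PER_PGD;
}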
--- arch/i386/mm/fault.c | 5 +-- arch/i386/mm/init.c | 18 ++++++++-- arch/i386/mm/pageattr.c | 2 +- arch/i386/mm/pgtable.c | 88 +++++++++++++++++++++++++++++++++---------- 4 files changed, 89 insertions(+), 24 deletions(-) (limited to 'arch/i386/mm') diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index c6a0a06258e6..f534c29e80b2 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -603,7 +603,6 @@ do_sigbus: force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -#ifndef CONFIG_X86_PAE void vmalloc_sync_all(void) { /* @@ -616,6 +615,9 @@ void vmalloc_sync_all(void) static unsigned long start = TASK_SIZE; unsigned long address; + if (SHARED_KERNEL_PMD) + return; + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { if (!test_bit(pgd_index(address), insync)) { @@ -638,4 +640,3 @@ void vmalloc_sync_all(void) start = address + PGDIR_SIZE; } } -#endif diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index e8545dcf06c5..dbe16f63a566 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -745,6 +745,8 @@ struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) { + size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); + if (PTRS_PER_PMD > 1) { pmd_cache = kmem_cache_create("pmd", PTRS_PER_PMD*sizeof(pmd_t), @@ -754,13 +756,23 @@ void __init pgtable_cache_init(void) NULL); if (!pmd_cache) panic("pgtable_cache_init(): cannot create pmd cache"); + + if (!SHARED_KERNEL_PMD) { + /* If we're in PAE mode and have a non-shared + kernel pmd, then the pgd size must be a + page size. This is because the pgd_list + links through the page structure, so there + can only be one pgd per page for this to + work. */ + pgd_size = PAGE_SIZE; + } } pgd_cache = kmem_cache_create("pgd", - PTRS_PER_PGD*sizeof(pgd_t), - PTRS_PER_PGD*sizeof(pgd_t), + pgd_size, + pgd_size, 0, pgd_ctor, - PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + (!SHARED_KERNEL_PMD) ?
pgd_dtor : NULL); if (!pgd_cache) panic("pgtable_cache_init(): Cannot create pgd cache"); } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index ea6b6d4a0a2a..47bd477c8ecc 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -91,7 +91,7 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) unsigned long flags; set_pte_atomic(kpte, pte); /* change init_mm */ - if (PTRS_PER_PMD > 1) + if (SHARED_KERNEL_PMD) return; spin_lock_irqsave(&pgd_lock, flags); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 99c09edc3dbb..9a96c1647428 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -232,42 +232,92 @@ static inline void pgd_list_del(pgd_t *pgd) set_page_private(next, (unsigned long)pprev); } +#if (PTRS_PER_PMD == 1) +/* Non-PAE pgd constructor */ void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; - if (PTRS_PER_PMD == 1) { - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - spin_lock_irqsave(&pgd_lock, flags); - } + /* !PAE, no pagetable sharing */ + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + + spin_lock_irqsave(&pgd_lock, flags); + /* must happen under lock */ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); - - if (PTRS_PER_PMD > 1) - return; - - /* must happen under lock */ paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD); - + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } +#else /* PTRS_PER_PMD > 1 */ +/* PAE pgd constructor */ +void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) +{ + /* PAE, kernel PMD may be shared */ + + if (SHARED_KERNEL_PMD) { + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + } else { + unsigned long flags; + + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + } +} +#endif /* PTRS_PER_PMD */ -/* never called when PTRS_PER_PMD > 1 */ void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ + BUG_ON(SHARED_KERNEL_PMD); + paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) + +/* If we allocate a pmd for part of the kernel address space, then + make sure its initialized with the appropriate kernel mappings. + Otherwise use a cached zeroed pmd. 
*/ +static pmd_t *pmd_cache_alloc(int idx) +{ + pmd_t *pmd; + + if (idx >= USER_PTRS_PER_PGD) { + pmd = (pmd_t *)__get_free_page(GFP_KERNEL); + + if (pmd) + memcpy(pmd, + (void *)pgd_page_vaddr(swapper_pg_dir[idx]), + sizeof(pmd_t) * PTRS_PER_PMD); + } else + pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + + return pmd; +} + +static void pmd_cache_free(pmd_t *pmd, int idx) +{ + if (idx >= USER_PTRS_PER_PGD) + free_page((unsigned long)pmd); + else + kmem_cache_free(pmd_cache, pmd); +} + pgd_t *pgd_alloc(struct mm_struct *mm) { int i; @@ -276,10 +326,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (PTRS_PER_PMD == 1 || !pgd) return pgd; - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { + pmd_t *pmd = pmd_cache_alloc(i); + if (!pmd) goto out_oom; + paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); } @@ -290,7 +342,7 @@ out_oom: pgd_t pgdent = pgd[i]; void* pmd = (void *)__va(pgd_val(pgdent)-1); paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - kmem_cache_free(pmd_cache, pmd); + pmd_cache_free(pmd, i); } kmem_cache_free(pgd_cache, pgd); return NULL; @@ -302,11 +354,11 @@ void pgd_free(pgd_t *pgd) /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { pgd_t pgdent = pgd[i]; void* pmd = (void *)__va(pgd_val(pgdent)-1); paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - kmem_cache_free(pmd_cache, pmd); + pmd_cache_free(pmd, i); } /* in the non-PAE case, free_pgtables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); -- cgit v1.2.2 From ce6234b5298902aaec831a67d5f8d9bd2ef5a488 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH] i386: PARAVIRT: add kmap_atomic_pte for mapping highpte pages Xen and VMI both have special requirements when mapping a highmem pte page into the kernel address space. These can be dealt with by adding a new kmap_atomic_pte() function for mapping highptes, and hooking it into the paravirt_ops infrastructure. Xen specifically wants to map the pte page RO, so this patch exposes a helper function, kmap_atomic_prot, which maps the page with the specified page protections. This also adds a kmap_flush_unused() function to clear out the cached kmap mappings. Xen needs this to clear out any potential stray RW mappings of pages which will become part of a pagetable. [ Zach - vmi.c will need some attention after this patch. It wasn't immediately obvious to me what needs to be done. ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Zachary Amsden --- arch/i386/mm/highmem.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/i386/mm') diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c index ac70d09df7ee..a1a21abf742e 100644 --- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -26,7 +26,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. 
*/ -void *kmap_atomic(struct page *page, enum km_type type) +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; @@ -41,12 +41,17 @@ void *kmap_atomic(struct page *page, enum km_type type) return page_address(page); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); arch_flush_lazy_mmu_mode(); return (void*) vaddr; } +void *kmap_atomic(struct page *page, enum km_type type) +{ + return kmap_atomic_prot(page, type, kmap_prot); +} + void kunmap_atomic(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; -- cgit v1.2.2 From 7b2f27f4e1818fad980da7bea688dca2b9e9c3f3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH] i386: PARAVIRT: flush lazy mmu updates on kunmap_atomic kunmap_atomic should flush any pending lazy mmu updates, mainly to be consistent with kmap_atomic, and to preserve its normal behaviour. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/i386/mm/highmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/i386/mm') diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c index a1a21abf742e..ad8d86cc683e 100644 --- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -72,6 +72,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) #endif } + arch_flush_lazy_mmu_mode(); pagefault_enable(); } -- cgit v1.2.2
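A caller-side usage sketch for the interfaces added in the last two patches, written against the kmap_atomic_prot()/kunmap_atomic() signatures shown in the diffs; KM_PTE0 and PAGE_KERNEL_RO are assumed from the kernel headers of this era, and the helper name is made up:

/*
 * Sketch: map a potentially-highmem pte page read-only, read one entry,
 * and unmap. kunmap_atomic() now also flushes pending lazy mmu updates,
 * matching kmap_atomic().
 */
#include <linux/highmem.h>
#include <asm/pgtable.h>

static pte_t read_pte_entry(struct page *pte_page, unsigned int idx)
{
	pte_t *ptep = kmap_atomic_prot(pte_page, KM_PTE0, PAGE_KERNEL_RO);
	pte_t entry = ptep[idx];

	kunmap_atomic(ptep, KM_PTE0);
	return entry;
}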