diff options
author | Steve French <sfrench@us.ibm.com> | 2008-04-25 16:20:10 -0400 |
---|---|---|
committer | Steve French <sfrench@us.ibm.com> | 2008-04-25 16:20:10 -0400 |
commit | 404e86e1550cc2c84bb57a372af784585c732f9a (patch) | |
tree | c0e8e2d61c1b1c79705c0dc9f0f16e35267286e4 /arch/x86 | |
parent | 0206e61b467fde4d7b50f1a64355182a4fd9576b (diff) | |
parent | b9fa38f75ea7e1f64bc29653ca9758303ce698e4 (diff) |
Merge branch 'master' of /pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'arch/x86')
29 files changed, 840 insertions, 1027 deletions
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 610aaecc19f8..239fd9fba0a5 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug | |||
@@ -5,6 +5,17 @@ config TRACE_IRQFLAGS_SUPPORT | |||
5 | 5 | ||
6 | source "lib/Kconfig.debug" | 6 | source "lib/Kconfig.debug" |
7 | 7 | ||
8 | config NONPROMISC_DEVMEM | ||
9 | bool "Disable promiscuous /dev/mem" | ||
10 | help | ||
11 | The /dev/mem file by default only allows userspace access to PCI | ||
12 | space and the BIOS code and data regions. This is sufficient for | ||
13 | dosemu and X and all common users of /dev/mem. With this config | ||
14 | option, you allow userspace access to all of memory, including | ||
15 | kernel and userspace memory. Accidental access to this is | ||
16 | obviously disastrous, but specific access can be used by people | ||
17 | debugging the kernel. | ||
18 | |||
8 | config EARLY_PRINTK | 19 | config EARLY_PRINTK |
9 | bool "Early printk" if EMBEDDED | 20 | bool "Early printk" if EMBEDDED |
10 | default y | 21 | default y |
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index b1bdc4c6f9f2..172cf8a98bdd 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore | |||
@@ -1,7 +1,8 @@ | |||
1 | bootsect | 1 | bootsect |
2 | bzImage | 2 | bzImage |
3 | cpustr.h | ||
4 | mkcpustr | ||
5 | offsets.h | ||
3 | setup | 6 | setup |
4 | setup.bin | 7 | setup.bin |
5 | setup.elf | 8 | setup.elf |
6 | cpustr.h | ||
7 | mkcpustr | ||
diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore new file mode 100644 index 000000000000..58f1f48a58f8 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/.gitignore | |||
@@ -0,0 +1,3 @@ | |||
1 | wakeup.bin | ||
2 | wakeup.elf | ||
3 | wakeup.lds | ||
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index df4099dc1c68..65c7857a90dd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c | |||
@@ -511,31 +511,30 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) | |||
511 | unsigned long flags; | 511 | unsigned long flags; |
512 | char *vaddr; | 512 | char *vaddr; |
513 | int nr_pages = 2; | 513 | int nr_pages = 2; |
514 | struct page *pages[2]; | ||
515 | int i; | ||
514 | 516 | ||
515 | BUG_ON(len > sizeof(long)); | 517 | if (!core_kernel_text((unsigned long)addr)) { |
516 | BUG_ON((((long)addr + len - 1) & ~(sizeof(long) - 1)) | 518 | pages[0] = vmalloc_to_page(addr); |
517 | - ((long)addr & ~(sizeof(long) - 1))); | 519 | pages[1] = vmalloc_to_page(addr + PAGE_SIZE); |
518 | if (kernel_text_address((unsigned long)addr)) { | ||
519 | struct page *pages[2] = { virt_to_page(addr), | ||
520 | virt_to_page(addr + PAGE_SIZE) }; | ||
521 | if (!pages[1]) | ||
522 | nr_pages = 1; | ||
523 | vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); | ||
524 | BUG_ON(!vaddr); | ||
525 | local_irq_save(flags); | ||
526 | memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); | ||
527 | local_irq_restore(flags); | ||
528 | vunmap(vaddr); | ||
529 | } else { | 520 | } else { |
530 | /* | 521 | pages[0] = virt_to_page(addr); |
531 | * modules are in vmalloc'ed memory, always writable. | 522 | WARN_ON(!PageReserved(pages[0])); |
532 | */ | 523 | pages[1] = virt_to_page(addr + PAGE_SIZE); |
533 | local_irq_save(flags); | ||
534 | memcpy(addr, opcode, len); | ||
535 | local_irq_restore(flags); | ||
536 | } | 524 | } |
525 | BUG_ON(!pages[0]); | ||
526 | if (!pages[1]) | ||
527 | nr_pages = 1; | ||
528 | vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); | ||
529 | BUG_ON(!vaddr); | ||
530 | local_irq_save(flags); | ||
531 | memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); | ||
532 | local_irq_restore(flags); | ||
533 | vunmap(vaddr); | ||
537 | sync_core(); | 534 | sync_core(); |
538 | /* Could also do a CLFLUSH here to speed up CPU recovery; but | 535 | /* Could also do a CLFLUSH here to speed up CPU recovery; but |
539 | that causes hangs on some VIA CPUs. */ | 536 | that causes hangs on some VIA CPUs. */ |
537 | for (i = 0; i < len; i++) | ||
538 | BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); | ||
540 | return addr; | 539 | return addr; |
541 | } | 540 | } |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f0f8934fc303..2a609dc3271c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -409,7 +409,7 @@ restore_nocheck_notrace: | |||
409 | irq_return: | 409 | irq_return: |
410 | INTERRUPT_RETURN | 410 | INTERRUPT_RETURN |
411 | .section .fixup,"ax" | 411 | .section .fixup,"ax" |
412 | iret_exc: | 412 | ENTRY(iret_exc) |
413 | pushl $0 # no error code | 413 | pushl $0 # no error code |
414 | pushl $do_iret_error | 414 | pushl $do_iret_error |
415 | jmp error_code | 415 | jmp error_code |
@@ -1017,6 +1017,13 @@ ENTRY(kernel_thread_helper) | |||
1017 | ENDPROC(kernel_thread_helper) | 1017 | ENDPROC(kernel_thread_helper) |
1018 | 1018 | ||
1019 | #ifdef CONFIG_XEN | 1019 | #ifdef CONFIG_XEN |
1020 | /* Xen doesn't set %esp to be precisely what the normal sysenter | ||
1021 | entrypoint expects, so fix it up before using the normal path. */ | ||
1022 | ENTRY(xen_sysenter_target) | ||
1023 | RING0_INT_FRAME | ||
1024 | addl $5*4, %esp /* remove xen-provided frame */ | ||
1025 | jmp sysenter_past_esp | ||
1026 | |||
1020 | ENTRY(xen_hypervisor_callback) | 1027 | ENTRY(xen_hypervisor_callback) |
1021 | CFI_STARTPROC | 1028 | CFI_STARTPROC |
1022 | pushl $0 | 1029 | pushl $0 |
@@ -1035,8 +1042,9 @@ ENTRY(xen_hypervisor_callback) | |||
1035 | cmpl $xen_iret_end_crit,%eax | 1042 | cmpl $xen_iret_end_crit,%eax |
1036 | jae 1f | 1043 | jae 1f |
1037 | 1044 | ||
1038 | call xen_iret_crit_fixup | 1045 | jmp xen_iret_crit_fixup |
1039 | 1046 | ||
1047 | ENTRY(xen_do_upcall) | ||
1040 | 1: mov %esp, %eax | 1048 | 1: mov %esp, %eax |
1041 | call xen_evtchn_do_upcall | 1049 | call xen_evtchn_do_upcall |
1042 | jmp ret_from_intr | 1050 | jmp ret_from_intr |
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 3733412d1357..74f0c5ea2a03 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -366,11 +366,13 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
366 | .flush_tlb_single = native_flush_tlb_single, | 366 | .flush_tlb_single = native_flush_tlb_single, |
367 | .flush_tlb_others = native_flush_tlb_others, | 367 | .flush_tlb_others = native_flush_tlb_others, |
368 | 368 | ||
369 | .alloc_pt = paravirt_nop, | 369 | .alloc_pte = paravirt_nop, |
370 | .alloc_pd = paravirt_nop, | 370 | .alloc_pmd = paravirt_nop, |
371 | .alloc_pd_clone = paravirt_nop, | 371 | .alloc_pmd_clone = paravirt_nop, |
372 | .release_pt = paravirt_nop, | 372 | .alloc_pud = paravirt_nop, |
373 | .release_pd = paravirt_nop, | 373 | .release_pte = paravirt_nop, |
374 | .release_pmd = paravirt_nop, | ||
375 | .release_pud = paravirt_nop, | ||
374 | 376 | ||
375 | .set_pte = native_set_pte, | 377 | .set_pte = native_set_pte, |
376 | .set_pte_at = native_set_pte_at, | 378 | .set_pte_at = native_set_pte_at, |
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 19c9386ac118..1791a751a772 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <asm/apic.h> | 8 | #include <asm/apic.h> |
9 | #include <asm/desc.h> | 9 | #include <asm/desc.h> |
10 | #include <asm/hpet.h> | 10 | #include <asm/hpet.h> |
11 | #include <asm/pgtable.h> | ||
11 | #include <asm/reboot_fixups.h> | 12 | #include <asm/reboot_fixups.h> |
12 | #include <asm/reboot.h> | 13 | #include <asm/reboot.h> |
13 | 14 | ||
@@ -15,7 +16,6 @@ | |||
15 | # include <linux/dmi.h> | 16 | # include <linux/dmi.h> |
16 | # include <linux/ctype.h> | 17 | # include <linux/ctype.h> |
17 | # include <linux/mc146818rtc.h> | 18 | # include <linux/mc146818rtc.h> |
18 | # include <asm/pgtable.h> | ||
19 | #else | 19 | #else |
20 | # include <asm/iommu.h> | 20 | # include <asm/iommu.h> |
21 | #endif | 21 | #endif |
@@ -275,7 +275,7 @@ void machine_real_restart(unsigned char *code, int length) | |||
275 | /* Remap the kernel at virtual address zero, as well as offset zero | 275 | /* Remap the kernel at virtual address zero, as well as offset zero |
276 | from the kernel segment. This assumes the kernel segment starts at | 276 | from the kernel segment. This assumes the kernel segment starts at |
277 | virtual address PAGE_OFFSET. */ | 277 | virtual address PAGE_OFFSET. */ |
278 | memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, | 278 | memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, |
279 | sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); | 279 | sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); |
280 | 280 | ||
281 | /* | 281 | /* |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ade371f9663a..eef79e84145f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -1039,8 +1039,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) | |||
1039 | 1039 | ||
1040 | #ifdef CONFIG_X86_32 | 1040 | #ifdef CONFIG_X86_32 |
1041 | /* init low mem mapping */ | 1041 | /* init low mem mapping */ |
1042 | clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, | 1042 | clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, |
1043 | min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); | 1043 | min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); |
1044 | flush_tlb_all(); | 1044 | flush_tlb_all(); |
1045 | #endif | 1045 | #endif |
1046 | 1046 | ||
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 12affe1f9bce..956f38927aa7 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c | |||
@@ -320,7 +320,7 @@ static void check_zeroed_page(u32 pfn, int type, struct page *page) | |||
320 | * pdes need to be zeroed. | 320 | * pdes need to be zeroed. |
321 | */ | 321 | */ |
322 | if (type & VMI_PAGE_CLONE) | 322 | if (type & VMI_PAGE_CLONE) |
323 | limit = USER_PTRS_PER_PGD; | 323 | limit = KERNEL_PGD_BOUNDARY; |
324 | for (i = 0; i < limit; i++) | 324 | for (i = 0; i < limit; i++) |
325 | BUG_ON(ptr[i]); | 325 | BUG_ON(ptr[i]); |
326 | } | 326 | } |
@@ -392,13 +392,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) | |||
392 | } | 392 | } |
393 | #endif | 393 | #endif |
394 | 394 | ||
395 | static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) | 395 | static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) |
396 | { | 396 | { |
397 | vmi_set_page_type(pfn, VMI_PAGE_L1); | 397 | vmi_set_page_type(pfn, VMI_PAGE_L1); |
398 | vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); | 398 | vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); |
399 | } | 399 | } |
400 | 400 | ||
401 | static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) | 401 | static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) |
402 | { | 402 | { |
403 | /* | 403 | /* |
404 | * This call comes in very early, before mem_map is setup. | 404 | * This call comes in very early, before mem_map is setup. |
@@ -409,20 +409,20 @@ static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) | |||
409 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); | 409 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); |
410 | } | 410 | } |
411 | 411 | ||
412 | static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) | 412 | static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) |
413 | { | 413 | { |
414 | vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); | 414 | vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); |
415 | vmi_check_page_type(clonepfn, VMI_PAGE_L2); | 415 | vmi_check_page_type(clonepfn, VMI_PAGE_L2); |
416 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); | 416 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); |
417 | } | 417 | } |
418 | 418 | ||
419 | static void vmi_release_pt(u32 pfn) | 419 | static void vmi_release_pte(u32 pfn) |
420 | { | 420 | { |
421 | vmi_ops.release_page(pfn, VMI_PAGE_L1); | 421 | vmi_ops.release_page(pfn, VMI_PAGE_L1); |
422 | vmi_set_page_type(pfn, VMI_PAGE_NORMAL); | 422 | vmi_set_page_type(pfn, VMI_PAGE_NORMAL); |
423 | } | 423 | } |
424 | 424 | ||
425 | static void vmi_release_pd(u32 pfn) | 425 | static void vmi_release_pmd(u32 pfn) |
426 | { | 426 | { |
427 | vmi_ops.release_page(pfn, VMI_PAGE_L2); | 427 | vmi_ops.release_page(pfn, VMI_PAGE_L2); |
428 | vmi_set_page_type(pfn, VMI_PAGE_NORMAL); | 428 | vmi_set_page_type(pfn, VMI_PAGE_NORMAL); |
@@ -871,15 +871,15 @@ static inline int __init activate_vmi(void) | |||
871 | 871 | ||
872 | vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); | 872 | vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); |
873 | if (vmi_ops.allocate_page) { | 873 | if (vmi_ops.allocate_page) { |
874 | pv_mmu_ops.alloc_pt = vmi_allocate_pt; | 874 | pv_mmu_ops.alloc_pte = vmi_allocate_pte; |
875 | pv_mmu_ops.alloc_pd = vmi_allocate_pd; | 875 | pv_mmu_ops.alloc_pmd = vmi_allocate_pmd; |
876 | pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone; | 876 | pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone; |
877 | } | 877 | } |
878 | 878 | ||
879 | vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); | 879 | vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); |
880 | if (vmi_ops.release_page) { | 880 | if (vmi_ops.release_page) { |
881 | pv_mmu_ops.release_pt = vmi_release_pt; | 881 | pv_mmu_ops.release_pte = vmi_release_pte; |
882 | pv_mmu_ops.release_pd = vmi_release_pd; | 882 | pv_mmu_ops.release_pmd = vmi_release_pmd; |
883 | } | 883 | } |
884 | 884 | ||
885 | /* Set linear is needed in all cases */ | 885 | /* Set linear is needed in all cases */ |
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index d05722121d24..6e2c4efce0ef 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c | |||
@@ -543,8 +543,8 @@ static void __init do_boot_cpu(__u8 cpu) | |||
543 | hijack_source.idt.Offset, stack_start.sp)); | 543 | hijack_source.idt.Offset, stack_start.sp)); |
544 | 544 | ||
545 | /* init lowmem identity mapping */ | 545 | /* init lowmem identity mapping */ |
546 | clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, | 546 | clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, |
547 | min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); | 547 | min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); |
548 | flush_tlb_all(); | 548 | flush_tlb_all(); |
549 | 549 | ||
550 | if (quad_boot) { | 550 | if (quad_boot) { |
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 20941d2954e2..b7b3e4c7cfc9 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ | 1 | obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ |
2 | pat.o | 2 | pat.o pgtable.o |
3 | 3 | ||
4 | obj-$(CONFIG_X86_32) += pgtable_32.o | 4 | obj-$(CONFIG_X86_32) += pgtable_32.o |
5 | 5 | ||
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9ec62da85fd7..baf7c4f643c8 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -71,7 +71,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) | |||
71 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { | 71 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { |
72 | pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); | 72 | pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); |
73 | 73 | ||
74 | paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); | 74 | paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); |
75 | set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | 75 | set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); |
76 | pud = pud_offset(pgd, 0); | 76 | pud = pud_offset(pgd, 0); |
77 | BUG_ON(pmd_table != pmd_offset(pud, 0)); | 77 | BUG_ON(pmd_table != pmd_offset(pud, 0)); |
@@ -100,7 +100,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) | |||
100 | (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); | 100 | (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); |
101 | } | 101 | } |
102 | 102 | ||
103 | paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); | 103 | paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); |
104 | set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); | 104 | set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); |
105 | BUG_ON(page_table != pte_offset_kernel(pmd, 0)); | 105 | BUG_ON(page_table != pte_offset_kernel(pmd, 0)); |
106 | } | 106 | } |
@@ -227,6 +227,25 @@ static inline int page_kills_ppro(unsigned long pagenr) | |||
227 | return 0; | 227 | return 0; |
228 | } | 228 | } |
229 | 229 | ||
230 | /* | ||
231 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address | ||
232 | * is valid. The argument is a physical page number. | ||
233 | * | ||
234 | * | ||
235 | * On x86, access has to be given to the first megabyte of ram because that area | ||
236 | * contains bios code and data regions used by X and dosemu and similar apps. | ||
237 | * Access has to be given to non-kernel-ram areas as well, these contain the PCI | ||
238 | * mmio resources as well as potential bios/acpi data regions. | ||
239 | */ | ||
240 | int devmem_is_allowed(unsigned long pagenr) | ||
241 | { | ||
242 | if (pagenr <= 256) | ||
243 | return 1; | ||
244 | if (!page_is_ram(pagenr)) | ||
245 | return 1; | ||
246 | return 0; | ||
247 | } | ||
248 | |||
230 | #ifdef CONFIG_HIGHMEM | 249 | #ifdef CONFIG_HIGHMEM |
231 | pte_t *kmap_pte; | 250 | pte_t *kmap_pte; |
232 | pgprot_t kmap_prot; | 251 | pgprot_t kmap_prot; |
@@ -365,7 +384,7 @@ void __init native_pagetable_setup_start(pgd_t *base) | |||
365 | 384 | ||
366 | pte_clear(NULL, va, pte); | 385 | pte_clear(NULL, va, pte); |
367 | } | 386 | } |
368 | paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); | 387 | paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); |
369 | } | 388 | } |
370 | 389 | ||
371 | void __init native_pagetable_setup_done(pgd_t *base) | 390 | void __init native_pagetable_setup_done(pgd_t *base) |
@@ -457,7 +476,7 @@ void zap_low_mappings(void) | |||
457 | * Note that "pgd_clear()" doesn't do it for | 476 | * Note that "pgd_clear()" doesn't do it for |
458 | * us, because pgd_clear() is a no-op on i386. | 477 | * us, because pgd_clear() is a no-op on i386. |
459 | */ | 478 | */ |
460 | for (i = 0; i < USER_PTRS_PER_PGD; i++) { | 479 | for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { |
461 | #ifdef CONFIG_X86_PAE | 480 | #ifdef CONFIG_X86_PAE |
462 | set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); | 481 | set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); |
463 | #else | 482 | #else |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ff7906a9a4d..0cca62663037 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -135,7 +135,7 @@ static __init void *spp_getpage(void) | |||
135 | return ptr; | 135 | return ptr; |
136 | } | 136 | } |
137 | 137 | ||
138 | static __init void | 138 | static void |
139 | set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) | 139 | set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) |
140 | { | 140 | { |
141 | pgd_t *pgd; | 141 | pgd_t *pgd; |
@@ -173,7 +173,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) | |||
173 | new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); | 173 | new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); |
174 | 174 | ||
175 | pte = pte_offset_kernel(pmd, vaddr); | 175 | pte = pte_offset_kernel(pmd, vaddr); |
176 | if (!pte_none(*pte) && | 176 | if (!pte_none(*pte) && pte_val(new_pte) && |
177 | pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) | 177 | pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) |
178 | pte_ERROR(*pte); | 178 | pte_ERROR(*pte); |
179 | set_pte(pte, new_pte); | 179 | set_pte(pte, new_pte); |
@@ -214,8 +214,7 @@ void __init cleanup_highmap(void) | |||
214 | } | 214 | } |
215 | 215 | ||
216 | /* NOTE: this is meant to be run only at boot */ | 216 | /* NOTE: this is meant to be run only at boot */ |
217 | void __init | 217 | void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) |
218 | __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) | ||
219 | { | 218 | { |
220 | unsigned long address = __fix_to_virt(idx); | 219 | unsigned long address = __fix_to_virt(idx); |
221 | 220 | ||
@@ -664,6 +663,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | |||
664 | 663 | ||
665 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 664 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
666 | 665 | ||
666 | /* | ||
667 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address | ||
668 | * is valid. The argument is a physical page number. | ||
669 | * | ||
670 | * | ||
671 | * On x86, access has to be given to the first megabyte of ram because that area | ||
672 | * contains bios code and data regions used by X and dosemu and similar apps. | ||
673 | * Access has to be given to non-kernel-ram areas as well, these contain the PCI | ||
674 | * mmio resources as well as potential bios/acpi data regions. | ||
675 | */ | ||
676 | int devmem_is_allowed(unsigned long pagenr) | ||
677 | { | ||
678 | if (pagenr <= 256) | ||
679 | return 1; | ||
680 | if (!page_is_ram(pagenr)) | ||
681 | return 1; | ||
682 | return 0; | ||
683 | } | ||
684 | |||
685 | |||
667 | static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, | 686 | static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, |
668 | kcore_modules, kcore_vsyscall; | 687 | kcore_modules, kcore_vsyscall; |
669 | 688 | ||
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3a4baf95e24d..d176b23110cc 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -336,6 +336,35 @@ void iounmap(volatile void __iomem *addr) | |||
336 | } | 336 | } |
337 | EXPORT_SYMBOL(iounmap); | 337 | EXPORT_SYMBOL(iounmap); |
338 | 338 | ||
339 | /* | ||
340 | * Convert a physical pointer to a virtual kernel pointer for /dev/mem | ||
341 | * access | ||
342 | */ | ||
343 | void *xlate_dev_mem_ptr(unsigned long phys) | ||
344 | { | ||
345 | void *addr; | ||
346 | unsigned long start = phys & PAGE_MASK; | ||
347 | |||
348 | /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ | ||
349 | if (page_is_ram(start >> PAGE_SHIFT)) | ||
350 | return __va(phys); | ||
351 | |||
352 | addr = (void *)ioremap(start, PAGE_SIZE); | ||
353 | if (addr) | ||
354 | addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); | ||
355 | |||
356 | return addr; | ||
357 | } | ||
358 | |||
359 | void unxlate_dev_mem_ptr(unsigned long phys, void *addr) | ||
360 | { | ||
361 | if (page_is_ram(phys >> PAGE_SHIFT)) | ||
362 | return; | ||
363 | |||
364 | iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); | ||
365 | return; | ||
366 | } | ||
367 | |||
339 | #ifdef CONFIG_X86_32 | 368 | #ifdef CONFIG_X86_32 |
340 | 369 | ||
341 | int __initdata early_ioremap_debug; | 370 | int __initdata early_ioremap_debug; |
@@ -407,7 +436,7 @@ void __init early_ioremap_clear(void) | |||
407 | 436 | ||
408 | pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); | 437 | pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); |
409 | pmd_clear(pmd); | 438 | pmd_clear(pmd); |
410 | paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); | 439 | paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); |
411 | __flush_tlb_all(); | 440 | __flush_tlb_all(); |
412 | } | 441 | } |
413 | 442 | ||
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c29ebd037254..bd5e05c654dc 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -483,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) | |||
483 | goto out_unlock; | 483 | goto out_unlock; |
484 | 484 | ||
485 | pbase = (pte_t *)page_address(base); | 485 | pbase = (pte_t *)page_address(base); |
486 | #ifdef CONFIG_X86_32 | 486 | paravirt_alloc_pte(&init_mm, page_to_pfn(base)); |
487 | paravirt_alloc_pt(&init_mm, page_to_pfn(base)); | ||
488 | #endif | ||
489 | ref_prot = pte_pgprot(pte_clrhuge(*kpte)); | 487 | ref_prot = pte_pgprot(pte_clrhuge(*kpte)); |
490 | 488 | ||
491 | #ifdef CONFIG_X86_64 | 489 | #ifdef CONFIG_X86_64 |
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 72c0f6097402..ef8b64b89c7d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | #include <linux/gfp.h> | 12 | #include <linux/gfp.h> |
13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
14 | #include <linux/bootmem.h> | ||
14 | 15 | ||
15 | #include <asm/msr.h> | 16 | #include <asm/msr.h> |
16 | #include <asm/tlbflush.h> | 17 | #include <asm/tlbflush.h> |
@@ -21,6 +22,7 @@ | |||
21 | #include <asm/cacheflush.h> | 22 | #include <asm/cacheflush.h> |
22 | #include <asm/fcntl.h> | 23 | #include <asm/fcntl.h> |
23 | #include <asm/mtrr.h> | 24 | #include <asm/mtrr.h> |
25 | #include <asm/io.h> | ||
24 | 26 | ||
25 | int pat_wc_enabled = 1; | 27 | int pat_wc_enabled = 1; |
26 | 28 | ||
@@ -190,6 +192,21 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, | |||
190 | return 0; | 192 | return 0; |
191 | } | 193 | } |
192 | 194 | ||
195 | /* | ||
196 | * req_type typically has one of the following values: | ||
197 | * - _PAGE_CACHE_WB | ||
198 | * - _PAGE_CACHE_WC | ||
199 | * - _PAGE_CACHE_UC_MINUS | ||
200 | * - _PAGE_CACHE_UC | ||
201 | * | ||
202 | * req_type will have a special case value '-1', when the requester wants to inherit | ||
203 | * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. | ||
204 | * | ||
205 | * If ret_type is NULL, function will return an error if it cannot reserve the | ||
206 | * region with req_type. If ret_type is non-null, function will return | ||
207 | * available type in ret_type in case of no error. In case of any error | ||
208 | * it will return a negative return value. | ||
209 | */ | ||
193 | int reserve_memtype(u64 start, u64 end, unsigned long req_type, | 210 | int reserve_memtype(u64 start, u64 end, unsigned long req_type, |
194 | unsigned long *ret_type) | 211 | unsigned long *ret_type) |
195 | { | 212 | { |
@@ -200,9 +217,14 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
200 | 217 | ||
201 | /* Only track when pat_wc_enabled */ | 218 | /* Only track when pat_wc_enabled */ |
202 | if (!pat_wc_enabled) { | 219 | if (!pat_wc_enabled) { |
203 | if (ret_type) | 220 | /* This is identical to page table setting without PAT */ |
204 | *ret_type = req_type; | 221 | if (ret_type) { |
205 | 222 | if (req_type == -1) { | |
223 | *ret_type = _PAGE_CACHE_WB; | ||
224 | } else { | ||
225 | *ret_type = req_type; | ||
226 | } | ||
227 | } | ||
206 | return 0; | 228 | return 0; |
207 | } | 229 | } |
208 | 230 | ||
@@ -214,8 +236,29 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
214 | return 0; | 236 | return 0; |
215 | } | 237 | } |
216 | 238 | ||
217 | req_type &= _PAGE_CACHE_MASK; | 239 | if (req_type == -1) { |
218 | err = pat_x_mtrr_type(start, end, req_type, &actual_type); | 240 | /* |
241 | * Special case where caller wants to inherit from mtrr or | ||
242 | * existing pat mapping, defaulting to UC_MINUS in case of | ||
243 | * no match. | ||
244 | */ | ||
245 | u8 mtrr_type = mtrr_type_lookup(start, end); | ||
246 | if (mtrr_type == 0xFE) { /* MTRR match error */ | ||
247 | err = -1; | ||
248 | } | ||
249 | |||
250 | if (mtrr_type == MTRR_TYPE_WRBACK) { | ||
251 | req_type = _PAGE_CACHE_WB; | ||
252 | actual_type = _PAGE_CACHE_WB; | ||
253 | } else { | ||
254 | req_type = _PAGE_CACHE_UC_MINUS; | ||
255 | actual_type = _PAGE_CACHE_UC_MINUS; | ||
256 | } | ||
257 | } else { | ||
258 | req_type &= _PAGE_CACHE_MASK; | ||
259 | err = pat_x_mtrr_type(start, end, req_type, &actual_type); | ||
260 | } | ||
261 | |||
219 | if (err) { | 262 | if (err) { |
220 | if (ret_type) | 263 | if (ret_type) |
221 | *ret_type = actual_type; | 264 | *ret_type = actual_type; |
@@ -241,7 +284,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
241 | struct memtype *saved_ptr; | 284 | struct memtype *saved_ptr; |
242 | 285 | ||
243 | if (parse->start >= end) { | 286 | if (parse->start >= end) { |
244 | printk("New Entry\n"); | 287 | pr_debug("New Entry\n"); |
245 | list_add(&new_entry->nd, parse->nd.prev); | 288 | list_add(&new_entry->nd, parse->nd.prev); |
246 | new_entry = NULL; | 289 | new_entry = NULL; |
247 | break; | 290 | break; |
@@ -343,7 +386,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
343 | break; | 386 | break; |
344 | } | 387 | } |
345 | 388 | ||
346 | printk("Overlap at 0x%Lx-0x%Lx\n", | 389 | printk(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n", |
347 | saved_ptr->start, saved_ptr->end); | 390 | saved_ptr->start, saved_ptr->end); |
348 | /* No conflict. Go ahead and add this new entry */ | 391 | /* No conflict. Go ahead and add this new entry */ |
349 | list_add(&new_entry->nd, &saved_ptr->nd); | 392 | list_add(&new_entry->nd, &saved_ptr->nd); |
@@ -353,7 +396,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
353 | } | 396 | } |
354 | 397 | ||
355 | if (err) { | 398 | if (err) { |
356 | printk( | 399 | printk(KERN_INFO |
357 | "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", | 400 | "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", |
358 | start, end, cattr_name(new_entry->type), | 401 | start, end, cattr_name(new_entry->type), |
359 | cattr_name(req_type)); | 402 | cattr_name(req_type)); |
@@ -365,16 +408,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
365 | if (new_entry) { | 408 | if (new_entry) { |
366 | /* No conflict. Not yet added to the list. Add to the tail */ | 409 | /* No conflict. Not yet added to the list. Add to the tail */ |
367 | list_add_tail(&new_entry->nd, &memtype_list); | 410 | list_add_tail(&new_entry->nd, &memtype_list); |
368 | printk("New Entry\n"); | 411 | pr_debug("New Entry\n"); |
369 | } | 412 | } |
370 | 413 | ||
371 | if (ret_type) { | 414 | if (ret_type) { |
372 | printk( | 415 | pr_debug( |
373 | "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", | 416 | "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", |
374 | start, end, cattr_name(actual_type), | 417 | start, end, cattr_name(actual_type), |
375 | cattr_name(req_type), cattr_name(*ret_type)); | 418 | cattr_name(req_type), cattr_name(*ret_type)); |
376 | } else { | 419 | } else { |
377 | printk( | 420 | pr_debug( |
378 | "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", | 421 | "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", |
379 | start, end, cattr_name(actual_type), | 422 | start, end, cattr_name(actual_type), |
380 | cattr_name(req_type)); | 423 | cattr_name(req_type)); |
@@ -411,11 +454,115 @@ int free_memtype(u64 start, u64 end) | |||
411 | spin_unlock(&memtype_lock); | 454 | spin_unlock(&memtype_lock); |
412 | 455 | ||
413 | if (err) { | 456 | if (err) { |
414 | printk(KERN_DEBUG "%s:%d freeing invalid memtype %Lx-%Lx\n", | 457 | printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", |
415 | current->comm, current->pid, start, end); | 458 | current->comm, current->pid, start, end); |
416 | } | 459 | } |
417 | 460 | ||
418 | printk( "free_memtype request 0x%Lx-0x%Lx\n", start, end); | 461 | pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); |
419 | return err; | 462 | return err; |
420 | } | 463 | } |
421 | 464 | ||
465 | |||
466 | /* | ||
467 | * /dev/mem mmap interface. The memtype used for mapping varies: | ||
468 | * - Use UC for mappings with O_SYNC flag | ||
469 | * - Without O_SYNC flag, if there is any conflict in reserve_memtype, | ||
470 | * inherit the memtype from existing mapping. | ||
471 | * - Else use UC_MINUS memtype (for backward compatibility with existing | ||
472 | * X drivers). | ||
473 | */ | ||
474 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, | ||
475 | unsigned long size, pgprot_t vma_prot) | ||
476 | { | ||
477 | return vma_prot; | ||
478 | } | ||
479 | |||
480 | int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, | ||
481 | unsigned long size, pgprot_t *vma_prot) | ||
482 | { | ||
483 | u64 offset = ((u64) pfn) << PAGE_SHIFT; | ||
484 | unsigned long flags = _PAGE_CACHE_UC_MINUS; | ||
485 | unsigned long ret_flags; | ||
486 | int retval; | ||
487 | |||
488 | if (file->f_flags & O_SYNC) { | ||
489 | flags = _PAGE_CACHE_UC; | ||
490 | } | ||
491 | |||
492 | #ifdef CONFIG_X86_32 | ||
493 | /* | ||
494 | * On the PPro and successors, the MTRRs are used to set | ||
495 | * memory types for physical addresses outside main memory, | ||
496 | * so blindly setting UC or PWT on those pages is wrong. | ||
497 | * For Pentiums and earlier, the surround logic should disable | ||
498 | * caching for the high addresses through the KEN pin, but | ||
499 | * we maintain the tradition of paranoia in this code. | ||
500 | */ | ||
501 | if (!pat_wc_enabled && | ||
502 | ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || | ||
503 | test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || | ||
504 | test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || | ||
505 | test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && | ||
506 | (pfn << PAGE_SHIFT) >= __pa(high_memory)) { | ||
507 | flags = _PAGE_CACHE_UC; | ||
508 | } | ||
509 | #endif | ||
510 | |||
511 | /* | ||
512 | * With O_SYNC, we can only take UC mapping. Fail if we cannot. | ||
513 | * Without O_SYNC, we want to get | ||
514 | * - WB for WB-able memory and no other conflicting mappings | ||
515 | * - UC_MINUS for non-WB-able memory with no other conflicting mappings | ||
516 | * - Inherit from conflicting mappings otherwise | ||
517 | */ | ||
518 | if (flags != _PAGE_CACHE_UC_MINUS) { | ||
519 | retval = reserve_memtype(offset, offset + size, flags, NULL); | ||
520 | } else { | ||
521 | retval = reserve_memtype(offset, offset + size, -1, &ret_flags); | ||
522 | } | ||
523 | |||
524 | if (retval < 0) | ||
525 | return 0; | ||
526 | |||
527 | flags = ret_flags; | ||
528 | |||
529 | if (pfn <= max_pfn_mapped && | ||
530 | ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { | ||
531 | free_memtype(offset, offset + size); | ||
532 | printk(KERN_INFO | ||
533 | "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", | ||
534 | current->comm, current->pid, | ||
535 | cattr_name(flags), | ||
536 | offset, offset + size); | ||
537 | return 0; | ||
538 | } | ||
539 | |||
540 | *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | | ||
541 | flags); | ||
542 | return 1; | ||
543 | } | ||
544 | |||
545 | void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) | ||
546 | { | ||
547 | u64 addr = (u64)pfn << PAGE_SHIFT; | ||
548 | unsigned long flags; | ||
549 | unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); | ||
550 | |||
551 | reserve_memtype(addr, addr + size, want_flags, &flags); | ||
552 | if (flags != want_flags) { | ||
553 | printk(KERN_INFO | ||
554 | "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", | ||
555 | current->comm, current->pid, | ||
556 | cattr_name(want_flags), | ||
557 | addr, addr + size, | ||
558 | cattr_name(flags)); | ||
559 | } | ||
560 | } | ||
561 | |||
562 | void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) | ||
563 | { | ||
564 | u64 addr = (u64)pfn << PAGE_SHIFT; | ||
565 | |||
566 | free_memtype(addr, addr + size); | ||
567 | } | ||
568 | |||
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c new file mode 100644 index 000000000000..50159764f694 --- /dev/null +++ b/arch/x86/mm/pgtable.c | |||
@@ -0,0 +1,276 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <asm/pgalloc.h> | ||
3 | #include <asm/pgtable.h> | ||
4 | #include <asm/tlb.h> | ||
5 | |||
6 | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) | ||
7 | { | ||
8 | return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); | ||
9 | } | ||
10 | |||
11 | pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) | ||
12 | { | ||
13 | struct page *pte; | ||
14 | |||
15 | #ifdef CONFIG_HIGHPTE | ||
16 | pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); | ||
17 | #else | ||
18 | pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); | ||
19 | #endif | ||
20 | if (pte) | ||
21 | pgtable_page_ctor(pte); | ||
22 | return pte; | ||
23 | } | ||
24 | |||
25 | void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) | ||
26 | { | ||
27 | pgtable_page_dtor(pte); | ||
28 | paravirt_release_pte(page_to_pfn(pte)); | ||
29 | tlb_remove_page(tlb, pte); | ||
30 | } | ||
31 | |||
32 | #if PAGETABLE_LEVELS > 2 | ||
33 | void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) | ||
34 | { | ||
35 | paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); | ||
36 | tlb_remove_page(tlb, virt_to_page(pmd)); | ||
37 | } | ||
38 | |||
39 | #if PAGETABLE_LEVELS > 3 | ||
40 | void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) | ||
41 | { | ||
42 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); | ||
43 | tlb_remove_page(tlb, virt_to_page(pud)); | ||
44 | } | ||
45 | #endif /* PAGETABLE_LEVELS > 3 */ | ||
46 | #endif /* PAGETABLE_LEVELS > 2 */ | ||
47 | |||
48 | static inline void pgd_list_add(pgd_t *pgd) | ||
49 | { | ||
50 | struct page *page = virt_to_page(pgd); | ||
51 | |||
52 | list_add(&page->lru, &pgd_list); | ||
53 | } | ||
54 | |||
55 | static inline void pgd_list_del(pgd_t *pgd) | ||
56 | { | ||
57 | struct page *page = virt_to_page(pgd); | ||
58 | |||
59 | list_del(&page->lru); | ||
60 | } | ||
61 | |||
62 | #define UNSHARED_PTRS_PER_PGD \ | ||
63 | (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) | ||
64 | |||
65 | static void pgd_ctor(void *p) | ||
66 | { | ||
67 | pgd_t *pgd = p; | ||
68 | unsigned long flags; | ||
69 | |||
70 | /* Clear usermode parts of PGD */ | ||
71 | memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t)); | ||
72 | |||
73 | spin_lock_irqsave(&pgd_lock, flags); | ||
74 | |||
75 | /* If the pgd points to a shared pagetable level (either the | ||
76 | ptes in non-PAE, or shared PMD in PAE), then just copy the | ||
77 | references from swapper_pg_dir. */ | ||
78 | if (PAGETABLE_LEVELS == 2 || | ||
79 | (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || | ||
80 | PAGETABLE_LEVELS == 4) { | ||
81 | clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, | ||
82 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
83 | KERNEL_PGD_PTRS); | ||
84 | paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, | ||
85 | __pa(swapper_pg_dir) >> PAGE_SHIFT, | ||
86 | KERNEL_PGD_BOUNDARY, | ||
87 | KERNEL_PGD_PTRS); | ||
88 | } | ||
89 | |||
90 | /* list required to sync kernel mapping updates */ | ||
91 | if (!SHARED_KERNEL_PMD) | ||
92 | pgd_list_add(pgd); | ||
93 | |||
94 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
95 | } | ||
96 | |||
97 | static void pgd_dtor(void *pgd) | ||
98 | { | ||
99 | unsigned long flags; /* can be called from interrupt context */ | ||
100 | |||
101 | if (SHARED_KERNEL_PMD) | ||
102 | return; | ||
103 | |||
104 | spin_lock_irqsave(&pgd_lock, flags); | ||
105 | pgd_list_del(pgd); | ||
106 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
107 | } | ||
108 | |||
109 | /* | ||
110 | * List of all pgd's needed for non-PAE so it can invalidate entries | ||
111 | * in both cached and uncached pgd's; not needed for PAE since the | ||
112 | * kernel pmd is shared. If PAE were not to share the pmd a similar | ||
113 | * tactic would be needed. This is essentially codepath-based locking | ||
114 | * against pageattr.c; it is the unique case in which a valid change | ||
115 | * of kernel pagetables can't be lazily synchronized by vmalloc faults. | ||
116 | * vmalloc faults work because attached pagetables are never freed. | ||
117 | * -- wli | ||
118 | */ | ||
119 | |||
120 | #ifdef CONFIG_X86_PAE | ||
121 | /* | ||
122 | * Mop up any pmd pages which may still be attached to the pgd. | ||
123 | * Normally they will be freed by munmap/exit_mmap, but any pmd we | ||
124 | * preallocate which never got a corresponding vma will need to be | ||
125 | * freed manually. | ||
126 | */ | ||
127 | static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) | ||
128 | { | ||
129 | int i; | ||
130 | |||
131 | for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { | ||
132 | pgd_t pgd = pgdp[i]; | ||
133 | |||
134 | if (pgd_val(pgd) != 0) { | ||
135 | pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); | ||
136 | |||
137 | pgdp[i] = native_make_pgd(0); | ||
138 | |||
139 | paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); | ||
140 | pmd_free(mm, pmd); | ||
141 | } | ||
142 | } | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * In PAE mode, we need to do a cr3 reload (=tlb flush) when | ||
147 | * updating the top-level pagetable entries to guarantee the | ||
148 | * processor notices the update. Since this is expensive, and | ||
149 | * all 4 top-level entries are used almost immediately in a | ||
150 | * new process's life, we just pre-populate them here. | ||
151 | * | ||
152 | * Also, if we're in a paravirt environment where the kernel pmd is | ||
153 | * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate | ||
154 | * and initialize the kernel pmds here. | ||
155 | */ | ||
156 | static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) | ||
157 | { | ||
158 | pud_t *pud; | ||
159 | unsigned long addr; | ||
160 | int i; | ||
161 | |||
162 | pud = pud_offset(pgd, 0); | ||
163 | for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; | ||
164 | i++, pud++, addr += PUD_SIZE) { | ||
165 | pmd_t *pmd = pmd_alloc_one(mm, addr); | ||
166 | |||
167 | if (!pmd) { | ||
168 | pgd_mop_up_pmds(mm, pgd); | ||
169 | return 0; | ||
170 | } | ||
171 | |||
172 | if (i >= KERNEL_PGD_BOUNDARY) | ||
173 | memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), | ||
174 | sizeof(pmd_t) * PTRS_PER_PMD); | ||
175 | |||
176 | pud_populate(mm, pud, pmd); | ||
177 | } | ||
178 | |||
179 | return 1; | ||
180 | } | ||
181 | |||
182 | void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) | ||
183 | { | ||
184 | paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); | ||
185 | |||
186 | /* Note: almost everything apart from _PAGE_PRESENT is | ||
187 | reserved at the pmd (PDPT) level. */ | ||
188 | set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); | ||
189 | |||
190 | /* | ||
191 | * According to Intel App note "TLBs, Paging-Structure Caches, | ||
192 | * and Their Invalidation", April 2007, document 317080-001, | ||
193 | * section 8.1: in PAE mode we explicitly have to flush the | ||
194 | * TLB via cr3 if the top-level pgd is changed... | ||
195 | */ | ||
196 | if (mm == current->active_mm) | ||
197 | write_cr3(read_cr3()); | ||
198 | } | ||
199 | #else /* !CONFIG_X86_PAE */ | ||
200 | /* No need to prepopulate any pagetable entries in non-PAE modes. */ | ||
201 | static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) | ||
202 | { | ||
203 | return 1; | ||
204 | } | ||
205 | |||
206 | static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) | ||
207 | { | ||
208 | } | ||
209 | #endif /* CONFIG_X86_PAE */ | ||
210 | |||
211 | pgd_t *pgd_alloc(struct mm_struct *mm) | ||
212 | { | ||
213 | pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); | ||
214 | |||
215 | /* so that alloc_pmd can use it */ | ||
216 | mm->pgd = pgd; | ||
217 | if (pgd) | ||
218 | pgd_ctor(pgd); | ||
219 | |||
220 | if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { | ||
221 | pgd_dtor(pgd); | ||
222 | free_page((unsigned long)pgd); | ||
223 | pgd = NULL; | ||
224 | } | ||
225 | |||
226 | return pgd; | ||
227 | } | ||
228 | |||
229 | void pgd_free(struct mm_struct *mm, pgd_t *pgd) | ||
230 | { | ||
231 | pgd_mop_up_pmds(mm, pgd); | ||
232 | pgd_dtor(pgd); | ||
233 | free_page((unsigned long)pgd); | ||
234 | } | ||
235 | |||
236 | int ptep_set_access_flags(struct vm_area_struct *vma, | ||
237 | unsigned long address, pte_t *ptep, | ||
238 | pte_t entry, int dirty) | ||
239 | { | ||
240 | int changed = !pte_same(*ptep, entry); | ||
241 | |||
242 | if (changed && dirty) { | ||
243 | *ptep = entry; | ||
244 | pte_update_defer(vma->vm_mm, address, ptep); | ||
245 | flush_tlb_page(vma, address); | ||
246 | } | ||
247 | |||
248 | return changed; | ||
249 | } | ||
250 | |||
251 | int ptep_test_and_clear_young(struct vm_area_struct *vma, | ||
252 | unsigned long addr, pte_t *ptep) | ||
253 | { | ||
254 | int ret = 0; | ||
255 | |||
256 | if (pte_young(*ptep)) | ||
257 | ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, | ||
258 | &ptep->pte); | ||
259 | |||
260 | if (ret) | ||
261 | pte_update(vma->vm_mm, addr, ptep); | ||
262 | |||
263 | return ret; | ||
264 | } | ||
265 | |||
266 | int ptep_clear_flush_young(struct vm_area_struct *vma, | ||
267 | unsigned long address, pte_t *ptep) | ||
268 | { | ||
269 | int young; | ||
270 | |||
271 | young = ptep_test_and_clear_young(vma, address, ptep); | ||
272 | if (young) | ||
273 | flush_tlb_page(vma, address); | ||
274 | |||
275 | return young; | ||
276 | } | ||
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 6fb9e7c6893f..9ee007be9142 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c | |||
@@ -173,210 +173,6 @@ void reserve_top_address(unsigned long reserve) | |||
173 | __VMALLOC_RESERVE += reserve; | 173 | __VMALLOC_RESERVE += reserve; |
174 | } | 174 | } |
175 | 175 | ||
176 | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) | ||
177 | { | ||
178 | return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); | ||
179 | } | ||
180 | |||
181 | pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) | ||
182 | { | ||
183 | struct page *pte; | ||
184 | |||
185 | #ifdef CONFIG_HIGHPTE | ||
186 | pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); | ||
187 | #else | ||
188 | pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); | ||
189 | #endif | ||
190 | if (pte) | ||
191 | pgtable_page_ctor(pte); | ||
192 | return pte; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * List of all pgd's needed for non-PAE so it can invalidate entries | ||
197 | * in both cached and uncached pgd's; not needed for PAE since the | ||
198 | * kernel pmd is shared. If PAE were not to share the pmd a similar | ||
199 | * tactic would be needed. This is essentially codepath-based locking | ||
200 | * against pageattr.c; it is the unique case in which a valid change | ||
201 | * of kernel pagetables can't be lazily synchronized by vmalloc faults. | ||
202 | * vmalloc faults work because attached pagetables are never freed. | ||
203 | * -- wli | ||
204 | */ | ||
205 | static inline void pgd_list_add(pgd_t *pgd) | ||
206 | { | ||
207 | struct page *page = virt_to_page(pgd); | ||
208 | |||
209 | list_add(&page->lru, &pgd_list); | ||
210 | } | ||
211 | |||
212 | static inline void pgd_list_del(pgd_t *pgd) | ||
213 | { | ||
214 | struct page *page = virt_to_page(pgd); | ||
215 | |||
216 | list_del(&page->lru); | ||
217 | } | ||
218 | |||
219 | #define UNSHARED_PTRS_PER_PGD \ | ||
220 | (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) | ||
221 | |||
222 | static void pgd_ctor(void *p) | ||
223 | { | ||
224 | pgd_t *pgd = p; | ||
225 | unsigned long flags; | ||
226 | |||
227 | /* Clear usermode parts of PGD */ | ||
228 | memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); | ||
229 | |||
230 | spin_lock_irqsave(&pgd_lock, flags); | ||
231 | |||
232 | /* If the pgd points to a shared pagetable level (either the | ||
233 | ptes in non-PAE, or shared PMD in PAE), then just copy the | ||
234 | references from swapper_pg_dir. */ | ||
235 | if (PAGETABLE_LEVELS == 2 || | ||
236 | (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { | ||
237 | clone_pgd_range(pgd + USER_PTRS_PER_PGD, | ||
238 | swapper_pg_dir + USER_PTRS_PER_PGD, | ||
239 | KERNEL_PGD_PTRS); | ||
240 | paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, | ||
241 | __pa(swapper_pg_dir) >> PAGE_SHIFT, | ||
242 | USER_PTRS_PER_PGD, | ||
243 | KERNEL_PGD_PTRS); | ||
244 | } | ||
245 | |||
246 | /* list required to sync kernel mapping updates */ | ||
247 | if (!SHARED_KERNEL_PMD) | ||
248 | pgd_list_add(pgd); | ||
249 | |||
250 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
251 | } | ||
252 | |||
253 | static void pgd_dtor(void *pgd) | ||
254 | { | ||
255 | unsigned long flags; /* can be called from interrupt context */ | ||
256 | |||
257 | if (SHARED_KERNEL_PMD) | ||
258 | return; | ||
259 | |||
260 | spin_lock_irqsave(&pgd_lock, flags); | ||
261 | pgd_list_del(pgd); | ||
262 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
263 | } | ||
264 | |||
265 | #ifdef CONFIG_X86_PAE | ||
266 | /* | ||
267 | * Mop up any pmd pages which may still be attached to the pgd. | ||
268 | * Normally they will be freed by munmap/exit_mmap, but any pmd we | ||
269 | * preallocate which never got a corresponding vma will need to be | ||
270 | * freed manually. | ||
271 | */ | ||
272 | static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) | ||
273 | { | ||
274 | int i; | ||
275 | |||
276 | for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { | ||
277 | pgd_t pgd = pgdp[i]; | ||
278 | |||
279 | if (pgd_val(pgd) != 0) { | ||
280 | pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); | ||
281 | |||
282 | pgdp[i] = native_make_pgd(0); | ||
283 | |||
284 | paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); | ||
285 | pmd_free(mm, pmd); | ||
286 | } | ||
287 | } | ||
288 | } | ||
289 | |||
290 | /* | ||
291 | * In PAE mode, we need to do a cr3 reload (=tlb flush) when | ||
292 | * updating the top-level pagetable entries to guarantee the | ||
293 | * processor notices the update. Since this is expensive, and | ||
294 | * all 4 top-level entries are used almost immediately in a | ||
295 | * new process's life, we just pre-populate them here. | ||
296 | * | ||
297 | * Also, if we're in a paravirt environment where the kernel pmd is | ||
298 | * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate | ||
299 | * and initialize the kernel pmds here. | ||
300 | */ | ||
301 | static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) | ||
302 | { | ||
303 | pud_t *pud; | ||
304 | unsigned long addr; | ||
305 | int i; | ||
306 | |||
307 | pud = pud_offset(pgd, 0); | ||
308 | for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; | ||
309 | i++, pud++, addr += PUD_SIZE) { | ||
310 | pmd_t *pmd = pmd_alloc_one(mm, addr); | ||
311 | |||
312 | if (!pmd) { | ||
313 | pgd_mop_up_pmds(mm, pgd); | ||
314 | return 0; | ||
315 | } | ||
316 | |||
317 | if (i >= USER_PTRS_PER_PGD) | ||
318 | memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), | ||
319 | sizeof(pmd_t) * PTRS_PER_PMD); | ||
320 | |||
321 | pud_populate(mm, pud, pmd); | ||
322 | } | ||
323 | |||
324 | return 1; | ||
325 | } | ||
326 | #else /* !CONFIG_X86_PAE */ | ||
327 | /* No need to prepopulate any pagetable entries in non-PAE modes. */ | ||
328 | static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) | ||
329 | { | ||
330 | return 1; | ||
331 | } | ||
332 | |||
333 | static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) | ||
334 | { | ||
335 | } | ||
336 | #endif /* CONFIG_X86_PAE */ | ||
337 | |||
338 | pgd_t *pgd_alloc(struct mm_struct *mm) | ||
339 | { | ||
340 | pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); | ||
341 | |||
342 | /* so that alloc_pd can use it */ | ||
343 | mm->pgd = pgd; | ||
344 | if (pgd) | ||
345 | pgd_ctor(pgd); | ||
346 | |||
347 | if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { | ||
348 | pgd_dtor(pgd); | ||
349 | free_page((unsigned long)pgd); | ||
350 | pgd = NULL; | ||
351 | } | ||
352 | |||
353 | return pgd; | ||
354 | } | ||
355 | |||
356 | void pgd_free(struct mm_struct *mm, pgd_t *pgd) | ||
357 | { | ||
358 | pgd_mop_up_pmds(mm, pgd); | ||
359 | pgd_dtor(pgd); | ||
360 | free_page((unsigned long)pgd); | ||
361 | } | ||
362 | |||
363 | void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) | ||
364 | { | ||
365 | pgtable_page_dtor(pte); | ||
366 | paravirt_release_pt(page_to_pfn(pte)); | ||
367 | tlb_remove_page(tlb, pte); | ||
368 | } | ||
369 | |||
370 | #ifdef CONFIG_X86_PAE | ||
371 | |||
372 | void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) | ||
373 | { | ||
374 | paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); | ||
375 | tlb_remove_page(tlb, virt_to_page(pmd)); | ||
376 | } | ||
377 | |||
378 | #endif | ||
379 | |||
380 | int pmd_bad(pmd_t pmd) | 176 | int pmd_bad(pmd_t pmd) |
381 | { | 177 | { |
382 | WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); | 178 | WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); |
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 4d5f2649bee4..2e641be2737e 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig | |||
@@ -6,7 +6,7 @@ config XEN | |||
6 | bool "Xen guest support" | 6 | bool "Xen guest support" |
7 | select PARAVIRT | 7 | select PARAVIRT |
8 | depends on X86_32 | 8 | depends on X86_32 |
9 | depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) | 9 | depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) |
10 | help | 10 | help |
11 | This is the Linux Xen port. Enabling this will allow the | 11 | This is the Linux Xen port. Enabling this will allow the |
12 | kernel to boot in a paravirtualized environment under the | 12 | kernel to boot in a paravirtualized environment under the |
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 343df246bd3e..3d8df981d5fd 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile | |||
@@ -1,4 +1,4 @@ | |||
1 | obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ | 1 | obj-y := enlighten.o setup.o multicalls.o mmu.o \ |
2 | events.o time.o manage.o xen-asm.o | 2 | time.o manage.o xen-asm.o grant-table.o |
3 | 3 | ||
4 | obj-$(CONFIG_SMP) += smp.o | 4 | obj-$(CONFIG_SMP) += smp.o |
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c0388220cf97..c8a56e457d61 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -155,7 +155,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, | |||
155 | if (*ax == 1) | 155 | if (*ax == 1) |
156 | maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ | 156 | maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ |
157 | (1 << X86_FEATURE_ACPI) | /* disable ACPI */ | 157 | (1 << X86_FEATURE_ACPI) | /* disable ACPI */ |
158 | (1 << X86_FEATURE_SEP) | /* disable SEP */ | 158 | (1 << X86_FEATURE_MCE) | /* disable MCE */ |
159 | (1 << X86_FEATURE_MCA) | /* disable MCA */ | ||
159 | (1 << X86_FEATURE_ACC)); /* thermal monitoring */ | 160 | (1 << X86_FEATURE_ACC)); /* thermal monitoring */ |
160 | 161 | ||
161 | asm(XEN_EMULATE_PREFIX "cpuid" | 162 | asm(XEN_EMULATE_PREFIX "cpuid" |
@@ -531,26 +532,37 @@ static void xen_apic_write(unsigned long reg, u32 val) | |||
531 | static void xen_flush_tlb(void) | 532 | static void xen_flush_tlb(void) |
532 | { | 533 | { |
533 | struct mmuext_op *op; | 534 | struct mmuext_op *op; |
534 | struct multicall_space mcs = xen_mc_entry(sizeof(*op)); | 535 | struct multicall_space mcs; |
536 | |||
537 | preempt_disable(); | ||
538 | |||
539 | mcs = xen_mc_entry(sizeof(*op)); | ||
535 | 540 | ||
536 | op = mcs.args; | 541 | op = mcs.args; |
537 | op->cmd = MMUEXT_TLB_FLUSH_LOCAL; | 542 | op->cmd = MMUEXT_TLB_FLUSH_LOCAL; |
538 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 543 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
539 | 544 | ||
540 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 545 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
546 | |||
547 | preempt_enable(); | ||
541 | } | 548 | } |
542 | 549 | ||
543 | static void xen_flush_tlb_single(unsigned long addr) | 550 | static void xen_flush_tlb_single(unsigned long addr) |
544 | { | 551 | { |
545 | struct mmuext_op *op; | 552 | struct mmuext_op *op; |
546 | struct multicall_space mcs = xen_mc_entry(sizeof(*op)); | 553 | struct multicall_space mcs; |
554 | |||
555 | preempt_disable(); | ||
547 | 556 | ||
557 | mcs = xen_mc_entry(sizeof(*op)); | ||
548 | op = mcs.args; | 558 | op = mcs.args; |
549 | op->cmd = MMUEXT_INVLPG_LOCAL; | 559 | op->cmd = MMUEXT_INVLPG_LOCAL; |
550 | op->arg1.linear_addr = addr & PAGE_MASK; | 560 | op->arg1.linear_addr = addr & PAGE_MASK; |
551 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 561 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
552 | 562 | ||
553 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 563 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
564 | |||
565 | preempt_enable(); | ||
554 | } | 566 | } |
555 | 567 | ||
556 | static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, | 568 | static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, |
@@ -655,15 +667,17 @@ static void xen_write_cr3(unsigned long cr3) | |||
655 | 667 | ||
656 | /* Early in boot, while setting up the initial pagetable, assume | 668 | /* Early in boot, while setting up the initial pagetable, assume |
657 | everything is pinned. */ | 669 | everything is pinned. */ |
658 | static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) | 670 | static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) |
659 | { | 671 | { |
672 | #ifdef CONFIG_FLATMEM | ||
660 | BUG_ON(mem_map); /* should only be used early */ | 673 | BUG_ON(mem_map); /* should only be used early */ |
674 | #endif | ||
661 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | 675 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); |
662 | } | 676 | } |
663 | 677 | ||
664 | /* Early release_pt assumes that all pts are pinned, since there's | 678 | /* Early release_pte assumes that all pts are pinned, since there's |
665 | only init_mm and anything attached to that is pinned. */ | 679 | only init_mm and anything attached to that is pinned. */ |
666 | static void xen_release_pt_init(u32 pfn) | 680 | static void xen_release_pte_init(u32 pfn) |
667 | { | 681 | { |
668 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | 682 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); |
669 | } | 683 | } |
@@ -697,12 +711,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) | |||
697 | } | 711 | } |
698 | } | 712 | } |
699 | 713 | ||
700 | static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) | 714 | static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) |
701 | { | 715 | { |
702 | xen_alloc_ptpage(mm, pfn, PT_PTE); | 716 | xen_alloc_ptpage(mm, pfn, PT_PTE); |
703 | } | 717 | } |
704 | 718 | ||
705 | static void xen_alloc_pd(struct mm_struct *mm, u32 pfn) | 719 | static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) |
706 | { | 720 | { |
707 | xen_alloc_ptpage(mm, pfn, PT_PMD); | 721 | xen_alloc_ptpage(mm, pfn, PT_PMD); |
708 | } | 722 | } |
@@ -722,12 +736,12 @@ static void xen_release_ptpage(u32 pfn, unsigned level) | |||
722 | } | 736 | } |
723 | } | 737 | } |
724 | 738 | ||
725 | static void xen_release_pt(u32 pfn) | 739 | static void xen_release_pte(u32 pfn) |
726 | { | 740 | { |
727 | xen_release_ptpage(pfn, PT_PTE); | 741 | xen_release_ptpage(pfn, PT_PTE); |
728 | } | 742 | } |
729 | 743 | ||
730 | static void xen_release_pd(u32 pfn) | 744 | static void xen_release_pmd(u32 pfn) |
731 | { | 745 | { |
732 | xen_release_ptpage(pfn, PT_PMD); | 746 | xen_release_ptpage(pfn, PT_PMD); |
733 | } | 747 | } |
@@ -849,10 +863,10 @@ static __init void xen_pagetable_setup_done(pgd_t *base) | |||
849 | { | 863 | { |
850 | /* This will work as long as patching hasn't happened yet | 864 | /* This will work as long as patching hasn't happened yet |
851 | (which it hasn't) */ | 865 | (which it hasn't) */ |
852 | pv_mmu_ops.alloc_pt = xen_alloc_pt; | 866 | pv_mmu_ops.alloc_pte = xen_alloc_pte; |
853 | pv_mmu_ops.alloc_pd = xen_alloc_pd; | 867 | pv_mmu_ops.alloc_pmd = xen_alloc_pmd; |
854 | pv_mmu_ops.release_pt = xen_release_pt; | 868 | pv_mmu_ops.release_pte = xen_release_pte; |
855 | pv_mmu_ops.release_pd = xen_release_pd; | 869 | pv_mmu_ops.release_pmd = xen_release_pmd; |
856 | pv_mmu_ops.set_pte = xen_set_pte; | 870 | pv_mmu_ops.set_pte = xen_set_pte; |
857 | 871 | ||
858 | setup_shared_info(); | 872 | setup_shared_info(); |
@@ -994,7 +1008,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { | |||
994 | .read_pmc = native_read_pmc, | 1008 | .read_pmc = native_read_pmc, |
995 | 1009 | ||
996 | .iret = xen_iret, | 1010 | .iret = xen_iret, |
997 | .irq_enable_syscall_ret = NULL, /* never called */ | 1011 | .irq_enable_syscall_ret = xen_sysexit, |
998 | 1012 | ||
999 | .load_tr_desc = paravirt_nop, | 1013 | .load_tr_desc = paravirt_nop, |
1000 | .set_ldt = xen_set_ldt, | 1014 | .set_ldt = xen_set_ldt, |
@@ -1059,11 +1073,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { | |||
1059 | .pte_update = paravirt_nop, | 1073 | .pte_update = paravirt_nop, |
1060 | .pte_update_defer = paravirt_nop, | 1074 | .pte_update_defer = paravirt_nop, |
1061 | 1075 | ||
1062 | .alloc_pt = xen_alloc_pt_init, | 1076 | .alloc_pte = xen_alloc_pte_init, |
1063 | .release_pt = xen_release_pt_init, | 1077 | .release_pte = xen_release_pte_init, |
1064 | .alloc_pd = xen_alloc_pt_init, | 1078 | .alloc_pmd = xen_alloc_pte_init, |
1065 | .alloc_pd_clone = paravirt_nop, | 1079 | .alloc_pmd_clone = paravirt_nop, |
1066 | .release_pd = xen_release_pt_init, | 1080 | .release_pmd = xen_release_pte_init, |
1067 | 1081 | ||
1068 | #ifdef CONFIG_HIGHPTE | 1082 | #ifdef CONFIG_HIGHPTE |
1069 | .kmap_atomic_pte = xen_kmap_atomic_pte, | 1083 | .kmap_atomic_pte = xen_kmap_atomic_pte, |
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c deleted file mode 100644 index dcf613e17581..000000000000 --- a/arch/x86/xen/events.c +++ /dev/null | |||
@@ -1,591 +0,0 @@ | |||
1 | /* | ||
2 | * Xen event channels | ||
3 | * | ||
4 | * Xen models interrupts with abstract event channels. Because each | ||
5 | * domain gets 1024 event channels, but NR_IRQ is not that large, we | ||
6 | * must dynamically map irqs<->event channels. The event channels | ||
7 | * interface with the rest of the kernel by defining a xen interrupt | ||
8 | * chip. When an event is received, it is mapped to an irq and sent | ||
9 | * through the normal interrupt processing path. | ||
10 | * | ||
11 | * There are four kinds of events which can be mapped to an event | ||
12 | * channel: | ||
13 | * | ||
14 | * 1. Inter-domain notifications. This includes all the virtual | ||
15 | * device events, since they're driven by front-ends in another domain | ||
16 | * (typically dom0). | ||
17 | * 2. VIRQs, typically used for timers. These are per-cpu events. | ||
18 | * 3. IPIs. | ||
19 | * 4. Hardware interrupts. Not supported at present. | ||
20 | * | ||
21 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | ||
22 | */ | ||
23 | |||
24 | #include <linux/linkage.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/irq.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/string.h> | ||
29 | |||
30 | #include <asm/ptrace.h> | ||
31 | #include <asm/irq.h> | ||
32 | #include <asm/sync_bitops.h> | ||
33 | #include <asm/xen/hypercall.h> | ||
34 | #include <asm/xen/hypervisor.h> | ||
35 | |||
36 | #include <xen/events.h> | ||
37 | #include <xen/interface/xen.h> | ||
38 | #include <xen/interface/event_channel.h> | ||
39 | |||
40 | #include "xen-ops.h" | ||
41 | |||
42 | /* | ||
43 | * This lock protects updates to the following mapping and reference-count | ||
44 | * arrays. The lock does not need to be acquired to read the mapping tables. | ||
45 | */ | ||
46 | static DEFINE_SPINLOCK(irq_mapping_update_lock); | ||
47 | |||
48 | /* IRQ <-> VIRQ mapping. */ | ||
49 | static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; | ||
50 | |||
51 | /* IRQ <-> IPI mapping */ | ||
52 | static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; | ||
53 | |||
54 | /* Packed IRQ information: binding type, sub-type index, and event channel. */ | ||
55 | struct packed_irq | ||
56 | { | ||
57 | unsigned short evtchn; | ||
58 | unsigned char index; | ||
59 | unsigned char type; | ||
60 | }; | ||
61 | |||
62 | static struct packed_irq irq_info[NR_IRQS]; | ||
63 | |||
64 | /* Binding types. */ | ||
65 | enum { | ||
66 | IRQT_UNBOUND, | ||
67 | IRQT_PIRQ, | ||
68 | IRQT_VIRQ, | ||
69 | IRQT_IPI, | ||
70 | IRQT_EVTCHN | ||
71 | }; | ||
72 | |||
73 | /* Convenient shorthand for packed representation of an unbound IRQ. */ | ||
74 | #define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) | ||
75 | |||
76 | static int evtchn_to_irq[NR_EVENT_CHANNELS] = { | ||
77 | [0 ... NR_EVENT_CHANNELS-1] = -1 | ||
78 | }; | ||
79 | static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; | ||
80 | static u8 cpu_evtchn[NR_EVENT_CHANNELS]; | ||
81 | |||
82 | /* Reference counts for bindings to IRQs. */ | ||
83 | static int irq_bindcount[NR_IRQS]; | ||
84 | |||
85 | /* Xen will never allocate port zero for any purpose. */ | ||
86 | #define VALID_EVTCHN(chn) ((chn) != 0) | ||
87 | |||
88 | /* | ||
89 | * Force a proper event-channel callback from Xen after clearing the | ||
90 | * callback mask. We do this in a very simple manner, by making a call | ||
91 | * down into Xen. The pending flag will be checked by Xen on return. | ||
92 | */ | ||
93 | void force_evtchn_callback(void) | ||
94 | { | ||
95 | (void)HYPERVISOR_xen_version(0, NULL); | ||
96 | } | ||
97 | EXPORT_SYMBOL_GPL(force_evtchn_callback); | ||
98 | |||
99 | static struct irq_chip xen_dynamic_chip; | ||
100 | |||
101 | /* Constructor for packed IRQ information. */ | ||
102 | static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) | ||
103 | { | ||
104 | return (struct packed_irq) { evtchn, index, type }; | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * Accessors for packed IRQ information. | ||
109 | */ | ||
110 | static inline unsigned int evtchn_from_irq(int irq) | ||
111 | { | ||
112 | return irq_info[irq].evtchn; | ||
113 | } | ||
114 | |||
115 | static inline unsigned int index_from_irq(int irq) | ||
116 | { | ||
117 | return irq_info[irq].index; | ||
118 | } | ||
119 | |||
120 | static inline unsigned int type_from_irq(int irq) | ||
121 | { | ||
122 | return irq_info[irq].type; | ||
123 | } | ||
124 | |||
125 | static inline unsigned long active_evtchns(unsigned int cpu, | ||
126 | struct shared_info *sh, | ||
127 | unsigned int idx) | ||
128 | { | ||
129 | return (sh->evtchn_pending[idx] & | ||
130 | cpu_evtchn_mask[cpu][idx] & | ||
131 | ~sh->evtchn_mask[idx]); | ||
132 | } | ||
133 | |||
134 | static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) | ||
135 | { | ||
136 | int irq = evtchn_to_irq[chn]; | ||
137 | |||
138 | BUG_ON(irq == -1); | ||
139 | #ifdef CONFIG_SMP | ||
140 | irq_desc[irq].affinity = cpumask_of_cpu(cpu); | ||
141 | #endif | ||
142 | |||
143 | __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); | ||
144 | __set_bit(chn, cpu_evtchn_mask[cpu]); | ||
145 | |||
146 | cpu_evtchn[chn] = cpu; | ||
147 | } | ||
148 | |||
149 | static void init_evtchn_cpu_bindings(void) | ||
150 | { | ||
151 | #ifdef CONFIG_SMP | ||
152 | int i; | ||
153 | /* By default all event channels notify CPU#0. */ | ||
154 | for (i = 0; i < NR_IRQS; i++) | ||
155 | irq_desc[i].affinity = cpumask_of_cpu(0); | ||
156 | #endif | ||
157 | |||
158 | memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); | ||
159 | memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); | ||
160 | } | ||
161 | |||
162 | static inline unsigned int cpu_from_evtchn(unsigned int evtchn) | ||
163 | { | ||
164 | return cpu_evtchn[evtchn]; | ||
165 | } | ||
166 | |||
167 | static inline void clear_evtchn(int port) | ||
168 | { | ||
169 | struct shared_info *s = HYPERVISOR_shared_info; | ||
170 | sync_clear_bit(port, &s->evtchn_pending[0]); | ||
171 | } | ||
172 | |||
173 | static inline void set_evtchn(int port) | ||
174 | { | ||
175 | struct shared_info *s = HYPERVISOR_shared_info; | ||
176 | sync_set_bit(port, &s->evtchn_pending[0]); | ||
177 | } | ||
178 | |||
179 | |||
180 | /** | ||
181 | * notify_remote_via_irq - send event to remote end of event channel via irq | ||
182 | * @irq: irq of event channel to send event to | ||
183 | * | ||
184 | * Unlike notify_remote_via_evtchn(), this is safe to use across | ||
185 | * save/restore. Notifications on a broken connection are silently | ||
186 | * dropped. | ||
187 | */ | ||
188 | void notify_remote_via_irq(int irq) | ||
189 | { | ||
190 | int evtchn = evtchn_from_irq(irq); | ||
191 | |||
192 | if (VALID_EVTCHN(evtchn)) | ||
193 | notify_remote_via_evtchn(evtchn); | ||
194 | } | ||
195 | EXPORT_SYMBOL_GPL(notify_remote_via_irq); | ||
196 | |||
197 | static void mask_evtchn(int port) | ||
198 | { | ||
199 | struct shared_info *s = HYPERVISOR_shared_info; | ||
200 | sync_set_bit(port, &s->evtchn_mask[0]); | ||
201 | } | ||
202 | |||
203 | static void unmask_evtchn(int port) | ||
204 | { | ||
205 | struct shared_info *s = HYPERVISOR_shared_info; | ||
206 | unsigned int cpu = get_cpu(); | ||
207 | |||
208 | BUG_ON(!irqs_disabled()); | ||
209 | |||
210 | /* Slow path (hypercall) if this is a non-local port. */ | ||
211 | if (unlikely(cpu != cpu_from_evtchn(port))) { | ||
212 | struct evtchn_unmask unmask = { .port = port }; | ||
213 | (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); | ||
214 | } else { | ||
215 | struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); | ||
216 | |||
217 | sync_clear_bit(port, &s->evtchn_mask[0]); | ||
218 | |||
219 | /* | ||
220 | * The following is basically the equivalent of | ||
221 | * 'hw_resend_irq'. Just like a real IO-APIC we 'lose | ||
222 | * the interrupt edge' if the channel is masked. | ||
223 | */ | ||
224 | if (sync_test_bit(port, &s->evtchn_pending[0]) && | ||
225 | !sync_test_and_set_bit(port / BITS_PER_LONG, | ||
226 | &vcpu_info->evtchn_pending_sel)) | ||
227 | vcpu_info->evtchn_upcall_pending = 1; | ||
228 | } | ||
229 | |||
230 | put_cpu(); | ||
231 | } | ||
232 | |||
233 | static int find_unbound_irq(void) | ||
234 | { | ||
235 | int irq; | ||
236 | |||
237 | /* Only allocate from dynirq range */ | ||
238 | for (irq = 0; irq < NR_IRQS; irq++) | ||
239 | if (irq_bindcount[irq] == 0) | ||
240 | break; | ||
241 | |||
242 | if (irq == NR_IRQS) | ||
243 | panic("No available IRQ to bind to: increase NR_IRQS!\n"); | ||
244 | |||
245 | return irq; | ||
246 | } | ||
247 | |||
248 | int bind_evtchn_to_irq(unsigned int evtchn) | ||
249 | { | ||
250 | int irq; | ||
251 | |||
252 | spin_lock(&irq_mapping_update_lock); | ||
253 | |||
254 | irq = evtchn_to_irq[evtchn]; | ||
255 | |||
256 | if (irq == -1) { | ||
257 | irq = find_unbound_irq(); | ||
258 | |||
259 | dynamic_irq_init(irq); | ||
260 | set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, | ||
261 | handle_level_irq, "event"); | ||
262 | |||
263 | evtchn_to_irq[evtchn] = irq; | ||
264 | irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); | ||
265 | } | ||
266 | |||
267 | irq_bindcount[irq]++; | ||
268 | |||
269 | spin_unlock(&irq_mapping_update_lock); | ||
270 | |||
271 | return irq; | ||
272 | } | ||
273 | EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); | ||
274 | |||
275 | static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) | ||
276 | { | ||
277 | struct evtchn_bind_ipi bind_ipi; | ||
278 | int evtchn, irq; | ||
279 | |||
280 | spin_lock(&irq_mapping_update_lock); | ||
281 | |||
282 | irq = per_cpu(ipi_to_irq, cpu)[ipi]; | ||
283 | if (irq == -1) { | ||
284 | irq = find_unbound_irq(); | ||
285 | if (irq < 0) | ||
286 | goto out; | ||
287 | |||
288 | dynamic_irq_init(irq); | ||
289 | set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, | ||
290 | handle_level_irq, "ipi"); | ||
291 | |||
292 | bind_ipi.vcpu = cpu; | ||
293 | if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, | ||
294 | &bind_ipi) != 0) | ||
295 | BUG(); | ||
296 | evtchn = bind_ipi.port; | ||
297 | |||
298 | evtchn_to_irq[evtchn] = irq; | ||
299 | irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); | ||
300 | |||
301 | per_cpu(ipi_to_irq, cpu)[ipi] = irq; | ||
302 | |||
303 | bind_evtchn_to_cpu(evtchn, cpu); | ||
304 | } | ||
305 | |||
306 | irq_bindcount[irq]++; | ||
307 | |||
308 | out: | ||
309 | spin_unlock(&irq_mapping_update_lock); | ||
310 | return irq; | ||
311 | } | ||
312 | |||
313 | |||
314 | static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) | ||
315 | { | ||
316 | struct evtchn_bind_virq bind_virq; | ||
317 | int evtchn, irq; | ||
318 | |||
319 | spin_lock(&irq_mapping_update_lock); | ||
320 | |||
321 | irq = per_cpu(virq_to_irq, cpu)[virq]; | ||
322 | |||
323 | if (irq == -1) { | ||
324 | bind_virq.virq = virq; | ||
325 | bind_virq.vcpu = cpu; | ||
326 | if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, | ||
327 | &bind_virq) != 0) | ||
328 | BUG(); | ||
329 | evtchn = bind_virq.port; | ||
330 | |||
331 | irq = find_unbound_irq(); | ||
332 | |||
333 | dynamic_irq_init(irq); | ||
334 | set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, | ||
335 | handle_level_irq, "virq"); | ||
336 | |||
337 | evtchn_to_irq[evtchn] = irq; | ||
338 | irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); | ||
339 | |||
340 | per_cpu(virq_to_irq, cpu)[virq] = irq; | ||
341 | |||
342 | bind_evtchn_to_cpu(evtchn, cpu); | ||
343 | } | ||
344 | |||
345 | irq_bindcount[irq]++; | ||
346 | |||
347 | spin_unlock(&irq_mapping_update_lock); | ||
348 | |||
349 | return irq; | ||
350 | } | ||
351 | |||
352 | static void unbind_from_irq(unsigned int irq) | ||
353 | { | ||
354 | struct evtchn_close close; | ||
355 | int evtchn = evtchn_from_irq(irq); | ||
356 | |||
357 | spin_lock(&irq_mapping_update_lock); | ||
358 | |||
359 | if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { | ||
360 | close.port = evtchn; | ||
361 | if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) | ||
362 | BUG(); | ||
363 | |||
364 | switch (type_from_irq(irq)) { | ||
365 | case IRQT_VIRQ: | ||
366 | per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) | ||
367 | [index_from_irq(irq)] = -1; | ||
368 | break; | ||
369 | default: | ||
370 | break; | ||
371 | } | ||
372 | |||
373 | /* Closed ports are implicitly re-bound to VCPU0. */ | ||
374 | bind_evtchn_to_cpu(evtchn, 0); | ||
375 | |||
376 | evtchn_to_irq[evtchn] = -1; | ||
377 | irq_info[irq] = IRQ_UNBOUND; | ||
378 | |||
379 | dynamic_irq_init(irq); | ||
380 | } | ||
381 | |||
382 | spin_unlock(&irq_mapping_update_lock); | ||
383 | } | ||
384 | |||
385 | int bind_evtchn_to_irqhandler(unsigned int evtchn, | ||
386 | irq_handler_t handler, | ||
387 | unsigned long irqflags, | ||
388 | const char *devname, void *dev_id) | ||
389 | { | ||
390 | unsigned int irq; | ||
391 | int retval; | ||
392 | |||
393 | irq = bind_evtchn_to_irq(evtchn); | ||
394 | retval = request_irq(irq, handler, irqflags, devname, dev_id); | ||
395 | if (retval != 0) { | ||
396 | unbind_from_irq(irq); | ||
397 | return retval; | ||
398 | } | ||
399 | |||
400 | return irq; | ||
401 | } | ||
402 | EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); | ||
403 | |||
404 | int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, | ||
405 | irq_handler_t handler, | ||
406 | unsigned long irqflags, const char *devname, void *dev_id) | ||
407 | { | ||
408 | unsigned int irq; | ||
409 | int retval; | ||
410 | |||
411 | irq = bind_virq_to_irq(virq, cpu); | ||
412 | retval = request_irq(irq, handler, irqflags, devname, dev_id); | ||
413 | if (retval != 0) { | ||
414 | unbind_from_irq(irq); | ||
415 | return retval; | ||
416 | } | ||
417 | |||
418 | return irq; | ||
419 | } | ||
420 | EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); | ||
421 | |||
422 | int bind_ipi_to_irqhandler(enum ipi_vector ipi, | ||
423 | unsigned int cpu, | ||
424 | irq_handler_t handler, | ||
425 | unsigned long irqflags, | ||
426 | const char *devname, | ||
427 | void *dev_id) | ||
428 | { | ||
429 | int irq, retval; | ||
430 | |||
431 | irq = bind_ipi_to_irq(ipi, cpu); | ||
432 | if (irq < 0) | ||
433 | return irq; | ||
434 | |||
435 | retval = request_irq(irq, handler, irqflags, devname, dev_id); | ||
436 | if (retval != 0) { | ||
437 | unbind_from_irq(irq); | ||
438 | return retval; | ||
439 | } | ||
440 | |||
441 | return irq; | ||
442 | } | ||
443 | |||
444 | void unbind_from_irqhandler(unsigned int irq, void *dev_id) | ||
445 | { | ||
446 | free_irq(irq, dev_id); | ||
447 | unbind_from_irq(irq); | ||
448 | } | ||
449 | EXPORT_SYMBOL_GPL(unbind_from_irqhandler); | ||
450 | |||
451 | void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) | ||
452 | { | ||
453 | int irq = per_cpu(ipi_to_irq, cpu)[vector]; | ||
454 | BUG_ON(irq < 0); | ||
455 | notify_remote_via_irq(irq); | ||
456 | } | ||
457 | |||
458 | |||
459 | /* | ||
460 | * Search the CPU's pending event bitmasks. For each one found, map | ||
461 | * the event number to an irq, and feed it into do_IRQ() for | ||
462 | * handling. | ||
463 | * | ||
464 | * Xen uses a two-level bitmap to speed searching. The first level is | ||
465 | * a bitset of words which contain pending event bits. The second | ||
466 | * level is a bitset of pending events themselves. | ||
467 | */ | ||
468 | void xen_evtchn_do_upcall(struct pt_regs *regs) | ||
469 | { | ||
470 | int cpu = get_cpu(); | ||
471 | struct shared_info *s = HYPERVISOR_shared_info; | ||
472 | struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); | ||
473 | unsigned long pending_words; | ||
474 | |||
475 | vcpu_info->evtchn_upcall_pending = 0; | ||
476 | |||
477 | /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ | ||
478 | pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); | ||
479 | while (pending_words != 0) { | ||
480 | unsigned long pending_bits; | ||
481 | int word_idx = __ffs(pending_words); | ||
482 | pending_words &= ~(1UL << word_idx); | ||
483 | |||
484 | while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { | ||
485 | int bit_idx = __ffs(pending_bits); | ||
486 | int port = (word_idx * BITS_PER_LONG) + bit_idx; | ||
487 | int irq = evtchn_to_irq[port]; | ||
488 | |||
489 | if (irq != -1) { | ||
490 | regs->orig_ax = ~irq; | ||
491 | do_IRQ(regs); | ||
492 | } | ||
493 | } | ||
494 | } | ||
495 | |||
496 | put_cpu(); | ||
497 | } | ||
498 | |||
499 | /* Rebind an evtchn so that it gets delivered to a specific cpu */ | ||
500 | static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) | ||
501 | { | ||
502 | struct evtchn_bind_vcpu bind_vcpu; | ||
503 | int evtchn = evtchn_from_irq(irq); | ||
504 | |||
505 | if (!VALID_EVTCHN(evtchn)) | ||
506 | return; | ||
507 | |||
508 | /* Send future instances of this interrupt to other vcpu. */ | ||
509 | bind_vcpu.port = evtchn; | ||
510 | bind_vcpu.vcpu = tcpu; | ||
511 | |||
512 | /* | ||
513 | * If this fails, it usually just indicates that we're dealing with a | ||
514 | * virq or IPI channel, which don't actually need to be rebound. Ignore | ||
515 | * it, but don't do the xenlinux-level rebind in that case. | ||
516 | */ | ||
517 | if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) | ||
518 | bind_evtchn_to_cpu(evtchn, tcpu); | ||
519 | } | ||
520 | |||
521 | |||
522 | static void set_affinity_irq(unsigned irq, cpumask_t dest) | ||
523 | { | ||
524 | unsigned tcpu = first_cpu(dest); | ||
525 | rebind_irq_to_cpu(irq, tcpu); | ||
526 | } | ||
527 | |||
528 | static void enable_dynirq(unsigned int irq) | ||
529 | { | ||
530 | int evtchn = evtchn_from_irq(irq); | ||
531 | |||
532 | if (VALID_EVTCHN(evtchn)) | ||
533 | unmask_evtchn(evtchn); | ||
534 | } | ||
535 | |||
536 | static void disable_dynirq(unsigned int irq) | ||
537 | { | ||
538 | int evtchn = evtchn_from_irq(irq); | ||
539 | |||
540 | if (VALID_EVTCHN(evtchn)) | ||
541 | mask_evtchn(evtchn); | ||
542 | } | ||
543 | |||
544 | static void ack_dynirq(unsigned int irq) | ||
545 | { | ||
546 | int evtchn = evtchn_from_irq(irq); | ||
547 | |||
548 | move_native_irq(irq); | ||
549 | |||
550 | if (VALID_EVTCHN(evtchn)) | ||
551 | clear_evtchn(evtchn); | ||
552 | } | ||
553 | |||
554 | static int retrigger_dynirq(unsigned int irq) | ||
555 | { | ||
556 | int evtchn = evtchn_from_irq(irq); | ||
557 | int ret = 0; | ||
558 | |||
559 | if (VALID_EVTCHN(evtchn)) { | ||
560 | set_evtchn(evtchn); | ||
561 | ret = 1; | ||
562 | } | ||
563 | |||
564 | return ret; | ||
565 | } | ||
566 | |||
567 | static struct irq_chip xen_dynamic_chip __read_mostly = { | ||
568 | .name = "xen-dyn", | ||
569 | .mask = disable_dynirq, | ||
570 | .unmask = enable_dynirq, | ||
571 | .ack = ack_dynirq, | ||
572 | .set_affinity = set_affinity_irq, | ||
573 | .retrigger = retrigger_dynirq, | ||
574 | }; | ||
575 | |||
576 | void __init xen_init_IRQ(void) | ||
577 | { | ||
578 | int i; | ||
579 | |||
580 | init_evtchn_cpu_bindings(); | ||
581 | |||
582 | /* No event channels are 'live' right now. */ | ||
583 | for (i = 0; i < NR_EVENT_CHANNELS; i++) | ||
584 | mask_evtchn(i); | ||
585 | |||
586 | /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ | ||
587 | for (i = 0; i < NR_IRQS; i++) | ||
588 | irq_bindcount[i] = 0; | ||
589 | |||
590 | irq_ctx_init(smp_processor_id()); | ||
591 | } | ||
diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c deleted file mode 100644 index 0707714e40d6..000000000000 --- a/arch/x86/xen/features.c +++ /dev/null | |||
@@ -1,29 +0,0 @@ | |||
1 | /****************************************************************************** | ||
2 | * features.c | ||
3 | * | ||
4 | * Xen feature flags. | ||
5 | * | ||
6 | * Copyright (c) 2006, Ian Campbell, XenSource Inc. | ||
7 | */ | ||
8 | #include <linux/types.h> | ||
9 | #include <linux/cache.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <asm/xen/hypervisor.h> | ||
12 | #include <xen/features.h> | ||
13 | |||
14 | u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; | ||
15 | EXPORT_SYMBOL_GPL(xen_features); | ||
16 | |||
17 | void xen_setup_features(void) | ||
18 | { | ||
19 | struct xen_feature_info fi; | ||
20 | int i, j; | ||
21 | |||
22 | for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { | ||
23 | fi.submap_idx = i; | ||
24 | if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) | ||
25 | break; | ||
26 | for (j = 0; j < 32; j++) | ||
27 | xen_features[i * 32 + j] = !!(fi.submap & 1<<j); | ||
28 | } | ||
29 | } | ||
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c new file mode 100644 index 000000000000..49ba9b5224d1 --- /dev/null +++ b/arch/x86/xen/grant-table.c | |||
@@ -0,0 +1,91 @@ | |||
1 | /****************************************************************************** | ||
2 | * grant_table.c | ||
3 | * x86 specific part | ||
4 | * | ||
5 | * Granting foreign access to our memory reservation. | ||
6 | * | ||
7 | * Copyright (c) 2005-2006, Christopher Clark | ||
8 | * Copyright (c) 2004-2005, K A Fraser | ||
9 | * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> | ||
10 | * VA Linux Systems Japan. Split out x86 specific part. | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License version 2 | ||
14 | * as published by the Free Software Foundation; or, when distributed | ||
15 | * separately from the Linux kernel or incorporated into other | ||
16 | * software packages, subject to the following license: | ||
17 | * | ||
18 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
19 | * of this source file (the "Software"), to deal in the Software without | ||
20 | * restriction, including without limitation the rights to use, copy, modify, | ||
21 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
22 | * and to permit persons to whom the Software is furnished to do so, subject to | ||
23 | * the following conditions: | ||
24 | * | ||
25 | * The above copyright notice and this permission notice shall be included in | ||
26 | * all copies or substantial portions of the Software. | ||
27 | * | ||
28 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
29 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
30 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
31 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
32 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
33 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
34 | * IN THE SOFTWARE. | ||
35 | */ | ||
36 | |||
37 | #include <linux/sched.h> | ||
38 | #include <linux/mm.h> | ||
39 | #include <linux/vmalloc.h> | ||
40 | |||
41 | #include <xen/interface/xen.h> | ||
42 | #include <xen/page.h> | ||
43 | #include <xen/grant_table.h> | ||
44 | |||
45 | #include <asm/pgtable.h> | ||
46 | |||
47 | static int map_pte_fn(pte_t *pte, struct page *pmd_page, | ||
48 | unsigned long addr, void *data) | ||
49 | { | ||
50 | unsigned long **frames = (unsigned long **)data; | ||
51 | |||
52 | set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL)); | ||
53 | (*frames)++; | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, | ||
58 | unsigned long addr, void *data) | ||
59 | { | ||
60 | |||
61 | set_pte_at(&init_mm, addr, pte, __pte(0)); | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, | ||
66 | unsigned long max_nr_gframes, | ||
67 | struct grant_entry **__shared) | ||
68 | { | ||
69 | int rc; | ||
70 | struct grant_entry *shared = *__shared; | ||
71 | |||
72 | if (shared == NULL) { | ||
73 | struct vm_struct *area = | ||
74 | xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes); | ||
75 | BUG_ON(area == NULL); | ||
76 | shared = area->addr; | ||
77 | *__shared = shared; | ||
78 | } | ||
79 | |||
80 | rc = apply_to_page_range(&init_mm, (unsigned long)shared, | ||
81 | PAGE_SIZE * nr_gframes, | ||
82 | map_pte_fn, &frames); | ||
83 | return rc; | ||
84 | } | ||
85 | |||
86 | void arch_gnttab_unmap_shared(struct grant_entry *shared, | ||
87 | unsigned long nr_gframes) | ||
88 | { | ||
89 | apply_to_page_range(&init_mm, (unsigned long)shared, | ||
90 | PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); | ||
91 | } | ||
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 2a054ef2a3da..6cbcf65609ad 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -156,6 +156,10 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) | |||
156 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | 156 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, |
157 | pte_t *ptep, pte_t pteval) | 157 | pte_t *ptep, pte_t pteval) |
158 | { | 158 | { |
159 | /* updates to init_mm may be done without lock */ | ||
160 | if (mm == &init_mm) | ||
161 | preempt_disable(); | ||
162 | |||
159 | if (mm == current->mm || mm == &init_mm) { | 163 | if (mm == current->mm || mm == &init_mm) { |
160 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { | 164 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { |
161 | struct multicall_space mcs; | 165 | struct multicall_space mcs; |
@@ -163,14 +167,61 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
163 | 167 | ||
164 | MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); | 168 | MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); |
165 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 169 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
166 | return; | 170 | goto out; |
167 | } else | 171 | } else |
168 | if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) | 172 | if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) |
169 | return; | 173 | goto out; |
170 | } | 174 | } |
171 | xen_set_pte(ptep, pteval); | 175 | xen_set_pte(ptep, pteval); |
176 | |||
177 | out: | ||
178 | if (mm == &init_mm) | ||
179 | preempt_enable(); | ||
180 | } | ||
181 | |||
182 | pteval_t xen_pte_val(pte_t pte) | ||
183 | { | ||
184 | pteval_t ret = pte.pte; | ||
185 | |||
186 | if (ret & _PAGE_PRESENT) | ||
187 | ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; | ||
188 | |||
189 | return ret; | ||
190 | } | ||
191 | |||
192 | pgdval_t xen_pgd_val(pgd_t pgd) | ||
193 | { | ||
194 | pgdval_t ret = pgd.pgd; | ||
195 | if (ret & _PAGE_PRESENT) | ||
196 | ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; | ||
197 | return ret; | ||
198 | } | ||
199 | |||
200 | pte_t xen_make_pte(pteval_t pte) | ||
201 | { | ||
202 | if (pte & _PAGE_PRESENT) { | ||
203 | pte = phys_to_machine(XPADDR(pte)).maddr; | ||
204 | pte &= ~(_PAGE_PCD | _PAGE_PWT); | ||
205 | } | ||
206 | |||
207 | return (pte_t){ .pte = pte }; | ||
172 | } | 208 | } |
173 | 209 | ||
210 | pgd_t xen_make_pgd(pgdval_t pgd) | ||
211 | { | ||
212 | if (pgd & _PAGE_PRESENT) | ||
213 | pgd = phys_to_machine(XPADDR(pgd)).maddr; | ||
214 | |||
215 | return (pgd_t){ pgd }; | ||
216 | } | ||
217 | |||
218 | pmdval_t xen_pmd_val(pmd_t pmd) | ||
219 | { | ||
220 | pmdval_t ret = native_pmd_val(pmd); | ||
221 | if (ret & _PAGE_PRESENT) | ||
222 | ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; | ||
223 | return ret; | ||
224 | } | ||
174 | #ifdef CONFIG_X86_PAE | 225 | #ifdef CONFIG_X86_PAE |
175 | void xen_set_pud(pud_t *ptr, pud_t val) | 226 | void xen_set_pud(pud_t *ptr, pud_t val) |
176 | { | 227 | { |
@@ -214,100 +265,18 @@ void xen_pmd_clear(pmd_t *pmdp) | |||
214 | xen_set_pmd(pmdp, __pmd(0)); | 265 | xen_set_pmd(pmdp, __pmd(0)); |
215 | } | 266 | } |
216 | 267 | ||
217 | unsigned long long xen_pte_val(pte_t pte) | 268 | pmd_t xen_make_pmd(pmdval_t pmd) |
218 | { | 269 | { |
219 | unsigned long long ret = 0; | 270 | if (pmd & _PAGE_PRESENT) |
220 | |||
221 | if (pte.pte_low) { | ||
222 | ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; | ||
223 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
224 | } | ||
225 | |||
226 | return ret; | ||
227 | } | ||
228 | |||
229 | unsigned long long xen_pmd_val(pmd_t pmd) | ||
230 | { | ||
231 | unsigned long long ret = pmd.pmd; | ||
232 | if (ret) | ||
233 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
234 | return ret; | ||
235 | } | ||
236 | |||
237 | unsigned long long xen_pgd_val(pgd_t pgd) | ||
238 | { | ||
239 | unsigned long long ret = pgd.pgd; | ||
240 | if (ret) | ||
241 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
242 | return ret; | ||
243 | } | ||
244 | |||
245 | pte_t xen_make_pte(unsigned long long pte) | ||
246 | { | ||
247 | if (pte & _PAGE_PRESENT) { | ||
248 | pte = phys_to_machine(XPADDR(pte)).maddr; | ||
249 | pte &= ~(_PAGE_PCD | _PAGE_PWT); | ||
250 | } | ||
251 | |||
252 | return (pte_t){ .pte = pte }; | ||
253 | } | ||
254 | |||
255 | pmd_t xen_make_pmd(unsigned long long pmd) | ||
256 | { | ||
257 | if (pmd & 1) | ||
258 | pmd = phys_to_machine(XPADDR(pmd)).maddr; | 271 | pmd = phys_to_machine(XPADDR(pmd)).maddr; |
259 | 272 | ||
260 | return (pmd_t){ pmd }; | 273 | return native_make_pmd(pmd); |
261 | } | ||
262 | |||
263 | pgd_t xen_make_pgd(unsigned long long pgd) | ||
264 | { | ||
265 | if (pgd & _PAGE_PRESENT) | ||
266 | pgd = phys_to_machine(XPADDR(pgd)).maddr; | ||
267 | |||
268 | return (pgd_t){ pgd }; | ||
269 | } | 274 | } |
270 | #else /* !PAE */ | 275 | #else /* !PAE */ |
271 | void xen_set_pte(pte_t *ptep, pte_t pte) | 276 | void xen_set_pte(pte_t *ptep, pte_t pte) |
272 | { | 277 | { |
273 | *ptep = pte; | 278 | *ptep = pte; |
274 | } | 279 | } |
275 | |||
276 | unsigned long xen_pte_val(pte_t pte) | ||
277 | { | ||
278 | unsigned long ret = pte.pte_low; | ||
279 | |||
280 | if (ret & _PAGE_PRESENT) | ||
281 | ret = machine_to_phys(XMADDR(ret)).paddr; | ||
282 | |||
283 | return ret; | ||
284 | } | ||
285 | |||
286 | unsigned long xen_pgd_val(pgd_t pgd) | ||
287 | { | ||
288 | unsigned long ret = pgd.pgd; | ||
289 | if (ret) | ||
290 | ret = machine_to_phys(XMADDR(ret)).paddr | 1; | ||
291 | return ret; | ||
292 | } | ||
293 | |||
294 | pte_t xen_make_pte(unsigned long pte) | ||
295 | { | ||
296 | if (pte & _PAGE_PRESENT) { | ||
297 | pte = phys_to_machine(XPADDR(pte)).maddr; | ||
298 | pte &= ~(_PAGE_PCD | _PAGE_PWT); | ||
299 | } | ||
300 | |||
301 | return (pte_t){ pte }; | ||
302 | } | ||
303 | |||
304 | pgd_t xen_make_pgd(unsigned long pgd) | ||
305 | { | ||
306 | if (pgd & _PAGE_PRESENT) | ||
307 | pgd = phys_to_machine(XPADDR(pgd)).maddr; | ||
308 | |||
309 | return (pgd_t){ pgd }; | ||
310 | } | ||
311 | #endif /* CONFIG_X86_PAE */ | 280 | #endif /* CONFIG_X86_PAE */ |
312 | 281 | ||
313 | /* | 282 | /* |
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2341492bf7a0..82517e4a752a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <asm/xen/hypervisor.h> | 16 | #include <asm/xen/hypervisor.h> |
17 | #include <asm/xen/hypercall.h> | 17 | #include <asm/xen/hypercall.h> |
18 | 18 | ||
19 | #include <xen/interface/callback.h> | ||
19 | #include <xen/interface/physdev.h> | 20 | #include <xen/interface/physdev.h> |
20 | #include <xen/features.h> | 21 | #include <xen/features.h> |
21 | 22 | ||
@@ -68,6 +69,24 @@ static void __init fiddle_vdso(void) | |||
68 | *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; | 69 | *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; |
69 | } | 70 | } |
70 | 71 | ||
72 | void xen_enable_sysenter(void) | ||
73 | { | ||
74 | int cpu = smp_processor_id(); | ||
75 | extern void xen_sysenter_target(void); | ||
76 | /* Mask events on entry, even though they get enabled immediately */ | ||
77 | static struct callback_register sysenter = { | ||
78 | .type = CALLBACKTYPE_sysenter, | ||
79 | .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target }, | ||
80 | .flags = CALLBACKF_mask_events, | ||
81 | }; | ||
82 | |||
83 | if (!boot_cpu_has(X86_FEATURE_SEP) || | ||
84 | HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { | ||
85 | clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); | ||
86 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); | ||
87 | } | ||
88 | } | ||
89 | |||
71 | void __init xen_arch_setup(void) | 90 | void __init xen_arch_setup(void) |
72 | { | 91 | { |
73 | struct physdev_set_iopl set_iopl; | 92 | struct physdev_set_iopl set_iopl; |
@@ -82,6 +101,8 @@ void __init xen_arch_setup(void) | |||
82 | HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, | 101 | HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, |
83 | __KERNEL_CS, (unsigned long)xen_failsafe_callback); | 102 | __KERNEL_CS, (unsigned long)xen_failsafe_callback); |
84 | 103 | ||
104 | xen_enable_sysenter(); | ||
105 | |||
85 | set_iopl.iopl = 1; | 106 | set_iopl.iopl = 1; |
86 | rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); | 107 | rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); |
87 | if (rc != 0) | 108 | if (rc != 0) |
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index e340ff92f6b6..92dd3dbf3ffb 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
@@ -36,8 +36,9 @@ | |||
36 | #include "mmu.h" | 36 | #include "mmu.h" |
37 | 37 | ||
38 | static cpumask_t xen_cpu_initialized_map; | 38 | static cpumask_t xen_cpu_initialized_map; |
39 | static DEFINE_PER_CPU(int, resched_irq); | 39 | static DEFINE_PER_CPU(int, resched_irq) = -1; |
40 | static DEFINE_PER_CPU(int, callfunc_irq); | 40 | static DEFINE_PER_CPU(int, callfunc_irq) = -1; |
41 | static DEFINE_PER_CPU(int, debug_irq) = -1; | ||
41 | 42 | ||
42 | /* | 43 | /* |
43 | * Structure and data for smp_call_function(). This is designed to minimise | 44 | * Structure and data for smp_call_function(). This is designed to minimise |
@@ -72,6 +73,7 @@ static __cpuinit void cpu_bringup_and_idle(void) | |||
72 | int cpu = smp_processor_id(); | 73 | int cpu = smp_processor_id(); |
73 | 74 | ||
74 | cpu_init(); | 75 | cpu_init(); |
76 | xen_enable_sysenter(); | ||
75 | 77 | ||
76 | preempt_disable(); | 78 | preempt_disable(); |
77 | per_cpu(cpu_state, cpu) = CPU_ONLINE; | 79 | per_cpu(cpu_state, cpu) = CPU_ONLINE; |
@@ -88,9 +90,7 @@ static __cpuinit void cpu_bringup_and_idle(void) | |||
88 | static int xen_smp_intr_init(unsigned int cpu) | 90 | static int xen_smp_intr_init(unsigned int cpu) |
89 | { | 91 | { |
90 | int rc; | 92 | int rc; |
91 | const char *resched_name, *callfunc_name; | 93 | const char *resched_name, *callfunc_name, *debug_name; |
92 | |||
93 | per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; | ||
94 | 94 | ||
95 | resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); | 95 | resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); |
96 | rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, | 96 | rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, |
@@ -114,6 +114,14 @@ static int xen_smp_intr_init(unsigned int cpu) | |||
114 | goto fail; | 114 | goto fail; |
115 | per_cpu(callfunc_irq, cpu) = rc; | 115 | per_cpu(callfunc_irq, cpu) = rc; |
116 | 116 | ||
117 | debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); | ||
118 | rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, | ||
119 | IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING, | ||
120 | debug_name, NULL); | ||
121 | if (rc < 0) | ||
122 | goto fail; | ||
123 | per_cpu(debug_irq, cpu) = rc; | ||
124 | |||
117 | return 0; | 125 | return 0; |
118 | 126 | ||
119 | fail: | 127 | fail: |
@@ -121,6 +129,8 @@ static int xen_smp_intr_init(unsigned int cpu) | |||
121 | unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); | 129 | unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); |
122 | if (per_cpu(callfunc_irq, cpu) >= 0) | 130 | if (per_cpu(callfunc_irq, cpu) >= 0) |
123 | unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); | 131 | unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); |
132 | if (per_cpu(debug_irq, cpu) >= 0) | ||
133 | unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); | ||
124 | return rc; | 134 | return rc; |
125 | } | 135 | } |
126 | 136 | ||
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index fe161ed4b01e..2497a30f41de 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S | |||
@@ -108,6 +108,20 @@ ENDPATCH(xen_restore_fl_direct) | |||
108 | RELOC(xen_restore_fl_direct, 2b+1) | 108 | RELOC(xen_restore_fl_direct, 2b+1) |
109 | 109 | ||
110 | /* | 110 | /* |
111 | We can't use sysexit directly, because we're not running in ring0. | ||
112 | But we can easily fake it up using iret. Assuming xen_sysexit | ||
113 | is jumped to with a standard stack frame, we can just strip it | ||
114 | back to a standard iret frame and use iret. | ||
115 | */ | ||
116 | ENTRY(xen_sysexit) | ||
117 | movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ | ||
118 | orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) | ||
119 | lea PT_EIP(%esp), %esp | ||
120 | |||
121 | jmp xen_iret | ||
122 | ENDPROC(xen_sysexit) | ||
123 | |||
124 | /* | ||
111 | This is run where a normal iret would be run, with the same stack setup: | 125 | This is run where a normal iret would be run, with the same stack setup: |
112 | 8: eflags | 126 | 8: eflags |
113 | 4: cs | 127 | 4: cs |
@@ -184,8 +198,12 @@ iret_restore_end: | |||
184 | region is OK. */ | 198 | region is OK. */ |
185 | je xen_hypervisor_callback | 199 | je xen_hypervisor_callback |
186 | 200 | ||
187 | iret | 201 | 1: iret |
188 | xen_iret_end_crit: | 202 | xen_iret_end_crit: |
203 | .section __ex_table,"a" | ||
204 | .align 4 | ||
205 | .long 1b,iret_exc | ||
206 | .previous | ||
189 | 207 | ||
190 | hyper_iret: | 208 | hyper_iret: |
191 | /* put this out of line since it's very rarely used */ | 209 | /* put this out of line since it's very rarely used */ |
@@ -219,9 +237,7 @@ hyper_iret: | |||
219 | ds } SAVE_ALL state | 237 | ds } SAVE_ALL state |
220 | eax } | 238 | eax } |
221 | : : | 239 | : : |
222 | ebx } | 240 | ebx }<- esp |
223 | ---------------- | ||
224 | return addr <- esp | ||
225 | ---------------- | 241 | ---------------- |
226 | 242 | ||
227 | In order to deliver the nested exception properly, we need to shift | 243 | In order to deliver the nested exception properly, we need to shift |
@@ -236,10 +252,8 @@ hyper_iret: | |||
236 | it's usermode state which we eventually need to restore. | 252 | it's usermode state which we eventually need to restore. |
237 | */ | 253 | */ |
238 | ENTRY(xen_iret_crit_fixup) | 254 | ENTRY(xen_iret_crit_fixup) |
239 | /* offsets +4 for return address */ | ||
240 | |||
241 | /* | 255 | /* |
242 | Paranoia: Make sure we're really coming from userspace. | 256 | Paranoia: Make sure we're really coming from kernel space. |
243 | One could imagine a case where userspace jumps into the | 257 | One could imagine a case where userspace jumps into the |
244 | critical range address, but just before the CPU delivers a GP, | 258 | critical range address, but just before the CPU delivers a GP, |
245 | it decides to deliver an interrupt instead. Unlikely? | 259 | it decides to deliver an interrupt instead. Unlikely? |
@@ -248,32 +262,32 @@ ENTRY(xen_iret_crit_fixup) | |||
248 | jump instruction itself, not the destination, but some virtual | 262 | jump instruction itself, not the destination, but some virtual |
249 | environments get this wrong. | 263 | environments get this wrong. |
250 | */ | 264 | */ |
251 | movl PT_CS+4(%esp), %ecx | 265 | movl PT_CS(%esp), %ecx |
252 | andl $SEGMENT_RPL_MASK, %ecx | 266 | andl $SEGMENT_RPL_MASK, %ecx |
253 | cmpl $USER_RPL, %ecx | 267 | cmpl $USER_RPL, %ecx |
254 | je 2f | 268 | je 2f |
255 | 269 | ||
256 | lea PT_ORIG_EAX+4(%esp), %esi | 270 | lea PT_ORIG_EAX(%esp), %esi |
257 | lea PT_EFLAGS+4(%esp), %edi | 271 | lea PT_EFLAGS(%esp), %edi |
258 | 272 | ||
259 | /* If eip is before iret_restore_end then stack | 273 | /* If eip is before iret_restore_end then stack |
260 | hasn't been restored yet. */ | 274 | hasn't been restored yet. */ |
261 | cmp $iret_restore_end, %eax | 275 | cmp $iret_restore_end, %eax |
262 | jae 1f | 276 | jae 1f |
263 | 277 | ||
264 | movl 0+4(%edi),%eax /* copy EAX */ | 278 | movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ |
265 | movl %eax, PT_EAX+4(%esp) | 279 | movl %eax, PT_EAX(%esp) |
266 | 280 | ||
267 | lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ | 281 | lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ |
268 | 282 | ||
269 | /* set up the copy */ | 283 | /* set up the copy */ |
270 | 1: std | 284 | 1: std |
271 | mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ | 285 | mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ |
272 | rep movsl | 286 | rep movsl |
273 | cld | 287 | cld |
274 | 288 | ||
275 | lea 4(%edi),%esp /* point esp to new frame */ | 289 | lea 4(%edi),%esp /* point esp to new frame */ |
276 | 2: ret | 290 | 2: jmp xen_do_upcall |
277 | 291 | ||
278 | 292 | ||
279 | /* | 293 | /* |
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 956a491ea998..f1063ae08037 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h | |||
@@ -2,6 +2,8 @@ | |||
2 | #define XEN_OPS_H | 2 | #define XEN_OPS_H |
3 | 3 | ||
4 | #include <linux/init.h> | 4 | #include <linux/init.h> |
5 | #include <linux/irqreturn.h> | ||
6 | #include <xen/xen-ops.h> | ||
5 | 7 | ||
6 | /* These are code, but not functions. Defined in entry.S */ | 8 | /* These are code, but not functions. Defined in entry.S */ |
7 | extern const char xen_hypervisor_callback[]; | 9 | extern const char xen_hypervisor_callback[]; |
@@ -9,7 +11,6 @@ extern const char xen_failsafe_callback[]; | |||
9 | 11 | ||
10 | void xen_copy_trap_info(struct trap_info *traps); | 12 | void xen_copy_trap_info(struct trap_info *traps); |
11 | 13 | ||
12 | DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); | ||
13 | DECLARE_PER_CPU(unsigned long, xen_cr3); | 14 | DECLARE_PER_CPU(unsigned long, xen_cr3); |
14 | DECLARE_PER_CPU(unsigned long, xen_current_cr3); | 15 | DECLARE_PER_CPU(unsigned long, xen_current_cr3); |
15 | 16 | ||
@@ -19,6 +20,7 @@ extern struct shared_info *HYPERVISOR_shared_info; | |||
19 | char * __init xen_memory_setup(void); | 20 | char * __init xen_memory_setup(void); |
20 | void __init xen_arch_setup(void); | 21 | void __init xen_arch_setup(void); |
21 | void __init xen_init_IRQ(void); | 22 | void __init xen_init_IRQ(void); |
23 | void xen_enable_sysenter(void); | ||
22 | 24 | ||
23 | void xen_setup_timer(int cpu); | 25 | void xen_setup_timer(int cpu); |
24 | void xen_setup_cpu_clockevents(void); | 26 | void xen_setup_cpu_clockevents(void); |
@@ -28,6 +30,8 @@ unsigned long xen_get_wallclock(void); | |||
28 | int xen_set_wallclock(unsigned long time); | 30 | int xen_set_wallclock(unsigned long time); |
29 | unsigned long long xen_sched_clock(void); | 31 | unsigned long long xen_sched_clock(void); |
30 | 32 | ||
33 | irqreturn_t xen_debug_interrupt(int irq, void *dev_id); | ||
34 | |||
31 | bool xen_vcpu_stolen(int vcpu); | 35 | bool xen_vcpu_stolen(int vcpu); |
32 | 36 | ||
33 | void xen_mark_init_mm_pinned(void); | 37 | void xen_mark_init_mm_pinned(void); |
@@ -64,4 +68,6 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void); | |||
64 | DECL_ASM(void, xen_restore_fl_direct, unsigned long); | 68 | DECL_ASM(void, xen_restore_fl_direct, unsigned long); |
65 | 69 | ||
66 | void xen_iret(void); | 70 | void xen_iret(void); |
71 | void xen_sysexit(void); | ||
72 | |||
67 | #endif /* XEN_OPS_H */ | 73 | #endif /* XEN_OPS_H */ |