aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-04-25 15:32:10 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-04-25 15:32:10 -0400
commit4b7227ca321ccf447cdc04538687c895db8b77f5 (patch)
tree72712127fc56aa2579e8a1508998bcabf6bd6c60 /arch/x86
parent5dae61b80564a5583ff4b56e357bdbc733fddb76 (diff)
parent1775826ceec51187aa868406585799b7e76ffa7d (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-xen-next
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-xen-next: (52 commits) xen: add balloon driver xen: allow compilation with non-flat memory xen: fold xen_sysexit into xen_iret xen: allow set_pte_at on init_mm to be lockless xen: disable preemption during tlb flush xen pvfb: Para-virtual framebuffer, keyboard and pointer driver xen: Add compatibility aliases for frontend drivers xen: Module autoprobing support for frontend drivers xen blkfront: Delay wait for block devices until after the disk is added xen/blkfront: use bdget_disk xen: Make xen-blkfront write its protocol ABI to xenstore xen: import arch generic part of xencomm xen: make grant table arch portable xen: replace callers of alloc_vm_area()/free_vm_area() with xen_ prefixed one xen: make include/xen/page.h portable moving those definitions under asm dir xen: add resend_irq_on_evtchn() definition into events.c Xen: make events.c portable for ia64/xen support xen: move events.c to drivers/xen for IA64/Xen support xen: move features.c from arch/x86/xen/features.c to drivers/xen xen: add missing definitions in include/xen/interface/vcpu.h which ia64/xen needs ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/kernel/entry_32.S12
-rw-r--r--arch/x86/kernel/paravirt.c12
-rw-r--r--arch/x86/kernel/reboot.c4
-rw-r--r--arch/x86/kernel/smpboot.c4
-rw-r--r--arch/x86/kernel/vmi_32.c22
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c4
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/init_32.c8
-rw-r--r--arch/x86/mm/ioremap.c2
-rw-r--r--arch/x86/mm/pageattr.c4
-rw-r--r--arch/x86/mm/pgtable.c276
-rw-r--r--arch/x86/mm/pgtable_32.c204
-rw-r--r--arch/x86/xen/Kconfig2
-rw-r--r--arch/x86/xen/Makefile4
-rw-r--r--arch/x86/xen/enlighten.c54
-rw-r--r--arch/x86/xen/events.c591
-rw-r--r--arch/x86/xen/features.c29
-rw-r--r--arch/x86/xen/grant-table.c91
-rw-r--r--arch/x86/xen/mmu.c143
-rw-r--r--arch/x86/xen/setup.c21
-rw-r--r--arch/x86/xen/smp.c20
-rw-r--r--arch/x86/xen/xen-asm.S42
-rw-r--r--arch/x86/xen/xen-ops.h8
23 files changed, 572 insertions, 987 deletions
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f0f8934fc303..2a609dc3271c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -409,7 +409,7 @@ restore_nocheck_notrace:
409irq_return: 409irq_return:
410 INTERRUPT_RETURN 410 INTERRUPT_RETURN
411.section .fixup,"ax" 411.section .fixup,"ax"
412iret_exc: 412ENTRY(iret_exc)
413 pushl $0 # no error code 413 pushl $0 # no error code
414 pushl $do_iret_error 414 pushl $do_iret_error
415 jmp error_code 415 jmp error_code
@@ -1017,6 +1017,13 @@ ENTRY(kernel_thread_helper)
1017ENDPROC(kernel_thread_helper) 1017ENDPROC(kernel_thread_helper)
1018 1018
1019#ifdef CONFIG_XEN 1019#ifdef CONFIG_XEN
1020/* Xen doesn't set %esp to be precisely what the normal sysenter
1021 entrypoint expects, so fix it up before using the normal path. */
1022ENTRY(xen_sysenter_target)
1023 RING0_INT_FRAME
1024 addl $5*4, %esp /* remove xen-provided frame */
1025 jmp sysenter_past_esp
1026
1020ENTRY(xen_hypervisor_callback) 1027ENTRY(xen_hypervisor_callback)
1021 CFI_STARTPROC 1028 CFI_STARTPROC
1022 pushl $0 1029 pushl $0
@@ -1035,8 +1042,9 @@ ENTRY(xen_hypervisor_callback)
1035 cmpl $xen_iret_end_crit,%eax 1042 cmpl $xen_iret_end_crit,%eax
1036 jae 1f 1043 jae 1f
1037 1044
1038 call xen_iret_crit_fixup 1045 jmp xen_iret_crit_fixup
1039 1046
1047ENTRY(xen_do_upcall)
10401: mov %esp, %eax 10481: mov %esp, %eax
1041 call xen_evtchn_do_upcall 1049 call xen_evtchn_do_upcall
1042 jmp ret_from_intr 1050 jmp ret_from_intr
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3733412d1357..74f0c5ea2a03 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -366,11 +366,13 @@ struct pv_mmu_ops pv_mmu_ops = {
366 .flush_tlb_single = native_flush_tlb_single, 366 .flush_tlb_single = native_flush_tlb_single,
367 .flush_tlb_others = native_flush_tlb_others, 367 .flush_tlb_others = native_flush_tlb_others,
368 368
369 .alloc_pt = paravirt_nop, 369 .alloc_pte = paravirt_nop,
370 .alloc_pd = paravirt_nop, 370 .alloc_pmd = paravirt_nop,
371 .alloc_pd_clone = paravirt_nop, 371 .alloc_pmd_clone = paravirt_nop,
372 .release_pt = paravirt_nop, 372 .alloc_pud = paravirt_nop,
373 .release_pd = paravirt_nop, 373 .release_pte = paravirt_nop,
374 .release_pmd = paravirt_nop,
375 .release_pud = paravirt_nop,
374 376
375 .set_pte = native_set_pte, 377 .set_pte = native_set_pte,
376 .set_pte_at = native_set_pte_at, 378 .set_pte_at = native_set_pte_at,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 19c9386ac118..1791a751a772 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -8,6 +8,7 @@
8#include <asm/apic.h> 8#include <asm/apic.h>
9#include <asm/desc.h> 9#include <asm/desc.h>
10#include <asm/hpet.h> 10#include <asm/hpet.h>
11#include <asm/pgtable.h>
11#include <asm/reboot_fixups.h> 12#include <asm/reboot_fixups.h>
12#include <asm/reboot.h> 13#include <asm/reboot.h>
13 14
@@ -15,7 +16,6 @@
15# include <linux/dmi.h> 16# include <linux/dmi.h>
16# include <linux/ctype.h> 17# include <linux/ctype.h>
17# include <linux/mc146818rtc.h> 18# include <linux/mc146818rtc.h>
18# include <asm/pgtable.h>
19#else 19#else
20# include <asm/iommu.h> 20# include <asm/iommu.h>
21#endif 21#endif
@@ -275,7 +275,7 @@ void machine_real_restart(unsigned char *code, int length)
275 /* Remap the kernel at virtual address zero, as well as offset zero 275 /* Remap the kernel at virtual address zero, as well as offset zero
276 from the kernel segment. This assumes the kernel segment starts at 276 from the kernel segment. This assumes the kernel segment starts at
277 virtual address PAGE_OFFSET. */ 277 virtual address PAGE_OFFSET. */
278 memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 278 memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
279 sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); 279 sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
280 280
281 /* 281 /*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ade371f9663a..eef79e84145f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1039,8 +1039,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1039 1039
1040#ifdef CONFIG_X86_32 1040#ifdef CONFIG_X86_32
1041 /* init low mem mapping */ 1041 /* init low mem mapping */
1042 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 1042 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
1043 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); 1043 min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
1044 flush_tlb_all(); 1044 flush_tlb_all();
1045#endif 1045#endif
1046 1046
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 12affe1f9bce..956f38927aa7 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -320,7 +320,7 @@ static void check_zeroed_page(u32 pfn, int type, struct page *page)
320 * pdes need to be zeroed. 320 * pdes need to be zeroed.
321 */ 321 */
322 if (type & VMI_PAGE_CLONE) 322 if (type & VMI_PAGE_CLONE)
323 limit = USER_PTRS_PER_PGD; 323 limit = KERNEL_PGD_BOUNDARY;
324 for (i = 0; i < limit; i++) 324 for (i = 0; i < limit; i++)
325 BUG_ON(ptr[i]); 325 BUG_ON(ptr[i]);
326} 326}
@@ -392,13 +392,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
392} 392}
393#endif 393#endif
394 394
395static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) 395static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
396{ 396{
397 vmi_set_page_type(pfn, VMI_PAGE_L1); 397 vmi_set_page_type(pfn, VMI_PAGE_L1);
398 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 398 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
399} 399}
400 400
401static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) 401static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
402{ 402{
403 /* 403 /*
404 * This call comes in very early, before mem_map is setup. 404 * This call comes in very early, before mem_map is setup.
@@ -409,20 +409,20 @@ static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
409 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 409 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
410} 410}
411 411
412static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) 412static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
413{ 413{
414 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); 414 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
415 vmi_check_page_type(clonepfn, VMI_PAGE_L2); 415 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
416 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 416 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
417} 417}
418 418
419static void vmi_release_pt(u32 pfn) 419static void vmi_release_pte(u32 pfn)
420{ 420{
421 vmi_ops.release_page(pfn, VMI_PAGE_L1); 421 vmi_ops.release_page(pfn, VMI_PAGE_L1);
422 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 422 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
423} 423}
424 424
425static void vmi_release_pd(u32 pfn) 425static void vmi_release_pmd(u32 pfn)
426{ 426{
427 vmi_ops.release_page(pfn, VMI_PAGE_L2); 427 vmi_ops.release_page(pfn, VMI_PAGE_L2);
428 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 428 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -871,15 +871,15 @@ static inline int __init activate_vmi(void)
871 871
872 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); 872 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
873 if (vmi_ops.allocate_page) { 873 if (vmi_ops.allocate_page) {
874 pv_mmu_ops.alloc_pt = vmi_allocate_pt; 874 pv_mmu_ops.alloc_pte = vmi_allocate_pte;
875 pv_mmu_ops.alloc_pd = vmi_allocate_pd; 875 pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
876 pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone; 876 pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
877 } 877 }
878 878
879 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); 879 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
880 if (vmi_ops.release_page) { 880 if (vmi_ops.release_page) {
881 pv_mmu_ops.release_pt = vmi_release_pt; 881 pv_mmu_ops.release_pte = vmi_release_pte;
882 pv_mmu_ops.release_pd = vmi_release_pd; 882 pv_mmu_ops.release_pmd = vmi_release_pmd;
883 } 883 }
884 884
885 /* Set linear is needed in all cases */ 885 /* Set linear is needed in all cases */
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index d05722121d24..6e2c4efce0ef 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -543,8 +543,8 @@ static void __init do_boot_cpu(__u8 cpu)
543 hijack_source.idt.Offset, stack_start.sp)); 543 hijack_source.idt.Offset, stack_start.sp));
544 544
545 /* init lowmem identity mapping */ 545 /* init lowmem identity mapping */
546 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 546 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
547 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); 547 min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
548 flush_tlb_all(); 548 flush_tlb_all();
549 549
550 if (quad_boot) { 550 if (quad_boot) {
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 20941d2954e2..b7b3e4c7cfc9 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o 2 pat.o pgtable.o
3 3
4obj-$(CONFIG_X86_32) += pgtable_32.o 4obj-$(CONFIG_X86_32) += pgtable_32.o
5 5
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9ec62da85fd7..08aa1878fad4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -71,7 +71,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
71 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 71 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
72 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 72 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
73 73
74 paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 74 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
75 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 75 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
76 pud = pud_offset(pgd, 0); 76 pud = pud_offset(pgd, 0);
77 BUG_ON(pmd_table != pmd_offset(pud, 0)); 77 BUG_ON(pmd_table != pmd_offset(pud, 0));
@@ -100,7 +100,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
100 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 100 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
101 } 101 }
102 102
103 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); 103 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
104 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 104 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
105 BUG_ON(page_table != pte_offset_kernel(pmd, 0)); 105 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
106 } 106 }
@@ -365,7 +365,7 @@ void __init native_pagetable_setup_start(pgd_t *base)
365 365
366 pte_clear(NULL, va, pte); 366 pte_clear(NULL, va, pte);
367 } 367 }
368 paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); 368 paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
369} 369}
370 370
371void __init native_pagetable_setup_done(pgd_t *base) 371void __init native_pagetable_setup_done(pgd_t *base)
@@ -457,7 +457,7 @@ void zap_low_mappings(void)
457 * Note that "pgd_clear()" doesn't do it for 457 * Note that "pgd_clear()" doesn't do it for
458 * us, because pgd_clear() is a no-op on i386. 458 * us, because pgd_clear() is a no-op on i386.
459 */ 459 */
460 for (i = 0; i < USER_PTRS_PER_PGD; i++) { 460 for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
461#ifdef CONFIG_X86_PAE 461#ifdef CONFIG_X86_PAE
462 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); 462 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
463#else 463#else
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 3a4baf95e24d..36a3f7ded626 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -407,7 +407,7 @@ void __init early_ioremap_clear(void)
407 407
408 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); 408 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
409 pmd_clear(pmd); 409 pmd_clear(pmd);
410 paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); 410 paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
411 __flush_tlb_all(); 411 __flush_tlb_all();
412} 412}
413 413
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index c29ebd037254..bd5e05c654dc 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -483,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
483 goto out_unlock; 483 goto out_unlock;
484 484
485 pbase = (pte_t *)page_address(base); 485 pbase = (pte_t *)page_address(base);
486#ifdef CONFIG_X86_32 486 paravirt_alloc_pte(&init_mm, page_to_pfn(base));
487 paravirt_alloc_pt(&init_mm, page_to_pfn(base));
488#endif
489 ref_prot = pte_pgprot(pte_clrhuge(*kpte)); 487 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
490 488
491#ifdef CONFIG_X86_64 489#ifdef CONFIG_X86_64
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
new file mode 100644
index 000000000000..50159764f694
--- /dev/null
+++ b/arch/x86/mm/pgtable.c
@@ -0,0 +1,276 @@
1#include <linux/mm.h>
2#include <asm/pgalloc.h>
3#include <asm/pgtable.h>
4#include <asm/tlb.h>
5
6pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
7{
8 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
9}
10
11pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
12{
13 struct page *pte;
14
15#ifdef CONFIG_HIGHPTE
16 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
17#else
18 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
19#endif
20 if (pte)
21 pgtable_page_ctor(pte);
22 return pte;
23}
24
25void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
26{
27 pgtable_page_dtor(pte);
28 paravirt_release_pte(page_to_pfn(pte));
29 tlb_remove_page(tlb, pte);
30}
31
32#if PAGETABLE_LEVELS > 2
33void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
34{
35 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
36 tlb_remove_page(tlb, virt_to_page(pmd));
37}
38
39#if PAGETABLE_LEVELS > 3
40void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
41{
42 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
43 tlb_remove_page(tlb, virt_to_page(pud));
44}
45#endif /* PAGETABLE_LEVELS > 3 */
46#endif /* PAGETABLE_LEVELS > 2 */
47
48static inline void pgd_list_add(pgd_t *pgd)
49{
50 struct page *page = virt_to_page(pgd);
51
52 list_add(&page->lru, &pgd_list);
53}
54
55static inline void pgd_list_del(pgd_t *pgd)
56{
57 struct page *page = virt_to_page(pgd);
58
59 list_del(&page->lru);
60}
61
62#define UNSHARED_PTRS_PER_PGD \
63 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
64
65static void pgd_ctor(void *p)
66{
67 pgd_t *pgd = p;
68 unsigned long flags;
69
70 /* Clear usermode parts of PGD */
71 memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
72
73 spin_lock_irqsave(&pgd_lock, flags);
74
75 /* If the pgd points to a shared pagetable level (either the
76 ptes in non-PAE, or shared PMD in PAE), then just copy the
77 references from swapper_pg_dir. */
78 if (PAGETABLE_LEVELS == 2 ||
79 (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
80 PAGETABLE_LEVELS == 4) {
81 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
82 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
83 KERNEL_PGD_PTRS);
84 paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
85 __pa(swapper_pg_dir) >> PAGE_SHIFT,
86 KERNEL_PGD_BOUNDARY,
87 KERNEL_PGD_PTRS);
88 }
89
90 /* list required to sync kernel mapping updates */
91 if (!SHARED_KERNEL_PMD)
92 pgd_list_add(pgd);
93
94 spin_unlock_irqrestore(&pgd_lock, flags);
95}
96
97static void pgd_dtor(void *pgd)
98{
99 unsigned long flags; /* can be called from interrupt context */
100
101 if (SHARED_KERNEL_PMD)
102 return;
103
104 spin_lock_irqsave(&pgd_lock, flags);
105 pgd_list_del(pgd);
106 spin_unlock_irqrestore(&pgd_lock, flags);
107}
108
109/*
110 * List of all pgd's needed for non-PAE so it can invalidate entries
111 * in both cached and uncached pgd's; not needed for PAE since the
112 * kernel pmd is shared. If PAE were not to share the pmd a similar
113 * tactic would be needed. This is essentially codepath-based locking
114 * against pageattr.c; it is the unique case in which a valid change
115 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
116 * vmalloc faults work because attached pagetables are never freed.
117 * -- wli
118 */
119
120#ifdef CONFIG_X86_PAE
121/*
122 * Mop up any pmd pages which may still be attached to the pgd.
123 * Normally they will be freed by munmap/exit_mmap, but any pmd we
124 * preallocate which never got a corresponding vma will need to be
125 * freed manually.
126 */
127static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
128{
129 int i;
130
131 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
132 pgd_t pgd = pgdp[i];
133
134 if (pgd_val(pgd) != 0) {
135 pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
136
137 pgdp[i] = native_make_pgd(0);
138
139 paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
140 pmd_free(mm, pmd);
141 }
142 }
143}
144
145/*
146 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
147 * updating the top-level pagetable entries to guarantee the
148 * processor notices the update. Since this is expensive, and
149 * all 4 top-level entries are used almost immediately in a
150 * new process's life, we just pre-populate them here.
151 *
152 * Also, if we're in a paravirt environment where the kernel pmd is
153 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
154 * and initialize the kernel pmds here.
155 */
156static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
157{
158 pud_t *pud;
159 unsigned long addr;
160 int i;
161
162 pud = pud_offset(pgd, 0);
163 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
164 i++, pud++, addr += PUD_SIZE) {
165 pmd_t *pmd = pmd_alloc_one(mm, addr);
166
167 if (!pmd) {
168 pgd_mop_up_pmds(mm, pgd);
169 return 0;
170 }
171
172 if (i >= KERNEL_PGD_BOUNDARY)
173 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
174 sizeof(pmd_t) * PTRS_PER_PMD);
175
176 pud_populate(mm, pud, pmd);
177 }
178
179 return 1;
180}
181
182void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
183{
184 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
185
186 /* Note: almost everything apart from _PAGE_PRESENT is
187 reserved at the pmd (PDPT) level. */
188 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
189
190 /*
191 * According to Intel App note "TLBs, Paging-Structure Caches,
192 * and Their Invalidation", April 2007, document 317080-001,
193 * section 8.1: in PAE mode we explicitly have to flush the
194 * TLB via cr3 if the top-level pgd is changed...
195 */
196 if (mm == current->active_mm)
197 write_cr3(read_cr3());
198}
199#else /* !CONFIG_X86_PAE */
200/* No need to prepopulate any pagetable entries in non-PAE modes. */
201static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
202{
203 return 1;
204}
205
206static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
207{
208}
209#endif /* CONFIG_X86_PAE */
210
211pgd_t *pgd_alloc(struct mm_struct *mm)
212{
213 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
214
215 /* so that alloc_pmd can use it */
216 mm->pgd = pgd;
217 if (pgd)
218 pgd_ctor(pgd);
219
220 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
221 pgd_dtor(pgd);
222 free_page((unsigned long)pgd);
223 pgd = NULL;
224 }
225
226 return pgd;
227}
228
229void pgd_free(struct mm_struct *mm, pgd_t *pgd)
230{
231 pgd_mop_up_pmds(mm, pgd);
232 pgd_dtor(pgd);
233 free_page((unsigned long)pgd);
234}
235
236int ptep_set_access_flags(struct vm_area_struct *vma,
237 unsigned long address, pte_t *ptep,
238 pte_t entry, int dirty)
239{
240 int changed = !pte_same(*ptep, entry);
241
242 if (changed && dirty) {
243 *ptep = entry;
244 pte_update_defer(vma->vm_mm, address, ptep);
245 flush_tlb_page(vma, address);
246 }
247
248 return changed;
249}
250
251int ptep_test_and_clear_young(struct vm_area_struct *vma,
252 unsigned long addr, pte_t *ptep)
253{
254 int ret = 0;
255
256 if (pte_young(*ptep))
257 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
258 &ptep->pte);
259
260 if (ret)
261 pte_update(vma->vm_mm, addr, ptep);
262
263 return ret;
264}
265
266int ptep_clear_flush_young(struct vm_area_struct *vma,
267 unsigned long address, pte_t *ptep)
268{
269 int young;
270
271 young = ptep_test_and_clear_young(vma, address, ptep);
272 if (young)
273 flush_tlb_page(vma, address);
274
275 return young;
276}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 6fb9e7c6893f..9ee007be9142 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -173,210 +173,6 @@ void reserve_top_address(unsigned long reserve)
173 __VMALLOC_RESERVE += reserve; 173 __VMALLOC_RESERVE += reserve;
174} 174}
175 175
176pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
177{
178 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
179}
180
181pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
182{
183 struct page *pte;
184
185#ifdef CONFIG_HIGHPTE
186 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
187#else
188 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
189#endif
190 if (pte)
191 pgtable_page_ctor(pte);
192 return pte;
193}
194
195/*
196 * List of all pgd's needed for non-PAE so it can invalidate entries
197 * in both cached and uncached pgd's; not needed for PAE since the
198 * kernel pmd is shared. If PAE were not to share the pmd a similar
199 * tactic would be needed. This is essentially codepath-based locking
200 * against pageattr.c; it is the unique case in which a valid change
201 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
202 * vmalloc faults work because attached pagetables are never freed.
203 * -- wli
204 */
205static inline void pgd_list_add(pgd_t *pgd)
206{
207 struct page *page = virt_to_page(pgd);
208
209 list_add(&page->lru, &pgd_list);
210}
211
212static inline void pgd_list_del(pgd_t *pgd)
213{
214 struct page *page = virt_to_page(pgd);
215
216 list_del(&page->lru);
217}
218
219#define UNSHARED_PTRS_PER_PGD \
220 (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
221
222static void pgd_ctor(void *p)
223{
224 pgd_t *pgd = p;
225 unsigned long flags;
226
227 /* Clear usermode parts of PGD */
228 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
229
230 spin_lock_irqsave(&pgd_lock, flags);
231
232 /* If the pgd points to a shared pagetable level (either the
233 ptes in non-PAE, or shared PMD in PAE), then just copy the
234 references from swapper_pg_dir. */
235 if (PAGETABLE_LEVELS == 2 ||
236 (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
237 clone_pgd_range(pgd + USER_PTRS_PER_PGD,
238 swapper_pg_dir + USER_PTRS_PER_PGD,
239 KERNEL_PGD_PTRS);
240 paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
241 __pa(swapper_pg_dir) >> PAGE_SHIFT,
242 USER_PTRS_PER_PGD,
243 KERNEL_PGD_PTRS);
244 }
245
246 /* list required to sync kernel mapping updates */
247 if (!SHARED_KERNEL_PMD)
248 pgd_list_add(pgd);
249
250 spin_unlock_irqrestore(&pgd_lock, flags);
251}
252
253static void pgd_dtor(void *pgd)
254{
255 unsigned long flags; /* can be called from interrupt context */
256
257 if (SHARED_KERNEL_PMD)
258 return;
259
260 spin_lock_irqsave(&pgd_lock, flags);
261 pgd_list_del(pgd);
262 spin_unlock_irqrestore(&pgd_lock, flags);
263}
264
265#ifdef CONFIG_X86_PAE
266/*
267 * Mop up any pmd pages which may still be attached to the pgd.
268 * Normally they will be freed by munmap/exit_mmap, but any pmd we
269 * preallocate which never got a corresponding vma will need to be
270 * freed manually.
271 */
272static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
273{
274 int i;
275
276 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
277 pgd_t pgd = pgdp[i];
278
279 if (pgd_val(pgd) != 0) {
280 pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
281
282 pgdp[i] = native_make_pgd(0);
283
284 paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
285 pmd_free(mm, pmd);
286 }
287 }
288}
289
290/*
291 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
292 * updating the top-level pagetable entries to guarantee the
293 * processor notices the update. Since this is expensive, and
294 * all 4 top-level entries are used almost immediately in a
295 * new process's life, we just pre-populate them here.
296 *
297 * Also, if we're in a paravirt environment where the kernel pmd is
298 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
299 * and initialize the kernel pmds here.
300 */
301static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
302{
303 pud_t *pud;
304 unsigned long addr;
305 int i;
306
307 pud = pud_offset(pgd, 0);
308 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
309 i++, pud++, addr += PUD_SIZE) {
310 pmd_t *pmd = pmd_alloc_one(mm, addr);
311
312 if (!pmd) {
313 pgd_mop_up_pmds(mm, pgd);
314 return 0;
315 }
316
317 if (i >= USER_PTRS_PER_PGD)
318 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
319 sizeof(pmd_t) * PTRS_PER_PMD);
320
321 pud_populate(mm, pud, pmd);
322 }
323
324 return 1;
325}
326#else /* !CONFIG_X86_PAE */
327/* No need to prepopulate any pagetable entries in non-PAE modes. */
328static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
329{
330 return 1;
331}
332
333static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
334{
335}
336#endif /* CONFIG_X86_PAE */
337
338pgd_t *pgd_alloc(struct mm_struct *mm)
339{
340 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
341
342 /* so that alloc_pd can use it */
343 mm->pgd = pgd;
344 if (pgd)
345 pgd_ctor(pgd);
346
347 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
348 pgd_dtor(pgd);
349 free_page((unsigned long)pgd);
350 pgd = NULL;
351 }
352
353 return pgd;
354}
355
356void pgd_free(struct mm_struct *mm, pgd_t *pgd)
357{
358 pgd_mop_up_pmds(mm, pgd);
359 pgd_dtor(pgd);
360 free_page((unsigned long)pgd);
361}
362
363void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
364{
365 pgtable_page_dtor(pte);
366 paravirt_release_pt(page_to_pfn(pte));
367 tlb_remove_page(tlb, pte);
368}
369
370#ifdef CONFIG_X86_PAE
371
372void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
373{
374 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
375 tlb_remove_page(tlb, virt_to_page(pmd));
376}
377
378#endif
379
380int pmd_bad(pmd_t pmd) 176int pmd_bad(pmd_t pmd)
381{ 177{
382 WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); 178 WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd));
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 4d5f2649bee4..2e641be2737e 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,7 +6,7 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 depends on X86_32 8 depends on X86_32
9 depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) 9 depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER)
10 help 10 help
11 This is the Linux Xen port. Enabling this will allow the 11 This is the Linux Xen port. Enabling this will allow the
12 kernel to boot in a paravirtualized environment under the 12 kernel to boot in a paravirtualized environment under the
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 343df246bd3e..3d8df981d5fd 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
1obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ 1obj-y := enlighten.o setup.o multicalls.o mmu.o \
2 events.o time.o manage.o xen-asm.o 2 time.o manage.o xen-asm.o grant-table.o
3 3
4obj-$(CONFIG_SMP) += smp.o 4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c0388220cf97..c8a56e457d61 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -155,7 +155,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
155 if (*ax == 1) 155 if (*ax == 1)
156 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ 156 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
157 (1 << X86_FEATURE_ACPI) | /* disable ACPI */ 157 (1 << X86_FEATURE_ACPI) | /* disable ACPI */
158 (1 << X86_FEATURE_SEP) | /* disable SEP */ 158 (1 << X86_FEATURE_MCE) | /* disable MCE */
159 (1 << X86_FEATURE_MCA) | /* disable MCA */
159 (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 160 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
160 161
161 asm(XEN_EMULATE_PREFIX "cpuid" 162 asm(XEN_EMULATE_PREFIX "cpuid"
@@ -531,26 +532,37 @@ static void xen_apic_write(unsigned long reg, u32 val)
531static void xen_flush_tlb(void) 532static void xen_flush_tlb(void)
532{ 533{
533 struct mmuext_op *op; 534 struct mmuext_op *op;
534 struct multicall_space mcs = xen_mc_entry(sizeof(*op)); 535 struct multicall_space mcs;
536
537 preempt_disable();
538
539 mcs = xen_mc_entry(sizeof(*op));
535 540
536 op = mcs.args; 541 op = mcs.args;
537 op->cmd = MMUEXT_TLB_FLUSH_LOCAL; 542 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
538 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 543 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
539 544
540 xen_mc_issue(PARAVIRT_LAZY_MMU); 545 xen_mc_issue(PARAVIRT_LAZY_MMU);
546
547 preempt_enable();
541} 548}
542 549
543static void xen_flush_tlb_single(unsigned long addr) 550static void xen_flush_tlb_single(unsigned long addr)
544{ 551{
545 struct mmuext_op *op; 552 struct mmuext_op *op;
546 struct multicall_space mcs = xen_mc_entry(sizeof(*op)); 553 struct multicall_space mcs;
554
555 preempt_disable();
547 556
557 mcs = xen_mc_entry(sizeof(*op));
548 op = mcs.args; 558 op = mcs.args;
549 op->cmd = MMUEXT_INVLPG_LOCAL; 559 op->cmd = MMUEXT_INVLPG_LOCAL;
550 op->arg1.linear_addr = addr & PAGE_MASK; 560 op->arg1.linear_addr = addr & PAGE_MASK;
551 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 561 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
552 562
553 xen_mc_issue(PARAVIRT_LAZY_MMU); 563 xen_mc_issue(PARAVIRT_LAZY_MMU);
564
565 preempt_enable();
554} 566}
555 567
556static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, 568static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
@@ -655,15 +667,17 @@ static void xen_write_cr3(unsigned long cr3)
655 667
656/* Early in boot, while setting up the initial pagetable, assume 668/* Early in boot, while setting up the initial pagetable, assume
657 everything is pinned. */ 669 everything is pinned. */
658static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) 670static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
659{ 671{
672#ifdef CONFIG_FLATMEM
660 BUG_ON(mem_map); /* should only be used early */ 673 BUG_ON(mem_map); /* should only be used early */
674#endif
661 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 675 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
662} 676}
663 677
664/* Early release_pt assumes that all pts are pinned, since there's 678/* Early release_pte assumes that all pts are pinned, since there's
665 only init_mm and anything attached to that is pinned. */ 679 only init_mm and anything attached to that is pinned. */
666static void xen_release_pt_init(u32 pfn) 680static void xen_release_pte_init(u32 pfn)
667{ 681{
668 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 682 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
669} 683}
@@ -697,12 +711,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
697 } 711 }
698} 712}
699 713
700static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) 714static void xen_alloc_pte(struct mm_struct *mm, u32 pfn)
701{ 715{
702 xen_alloc_ptpage(mm, pfn, PT_PTE); 716 xen_alloc_ptpage(mm, pfn, PT_PTE);
703} 717}
704 718
705static void xen_alloc_pd(struct mm_struct *mm, u32 pfn) 719static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
706{ 720{
707 xen_alloc_ptpage(mm, pfn, PT_PMD); 721 xen_alloc_ptpage(mm, pfn, PT_PMD);
708} 722}
@@ -722,12 +736,12 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
722 } 736 }
723} 737}
724 738
725static void xen_release_pt(u32 pfn) 739static void xen_release_pte(u32 pfn)
726{ 740{
727 xen_release_ptpage(pfn, PT_PTE); 741 xen_release_ptpage(pfn, PT_PTE);
728} 742}
729 743
730static void xen_release_pd(u32 pfn) 744static void xen_release_pmd(u32 pfn)
731{ 745{
732 xen_release_ptpage(pfn, PT_PMD); 746 xen_release_ptpage(pfn, PT_PMD);
733} 747}
@@ -849,10 +863,10 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
849{ 863{
850 /* This will work as long as patching hasn't happened yet 864 /* This will work as long as patching hasn't happened yet
851 (which it hasn't) */ 865 (which it hasn't) */
852 pv_mmu_ops.alloc_pt = xen_alloc_pt; 866 pv_mmu_ops.alloc_pte = xen_alloc_pte;
853 pv_mmu_ops.alloc_pd = xen_alloc_pd; 867 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
854 pv_mmu_ops.release_pt = xen_release_pt; 868 pv_mmu_ops.release_pte = xen_release_pte;
855 pv_mmu_ops.release_pd = xen_release_pd; 869 pv_mmu_ops.release_pmd = xen_release_pmd;
856 pv_mmu_ops.set_pte = xen_set_pte; 870 pv_mmu_ops.set_pte = xen_set_pte;
857 871
858 setup_shared_info(); 872 setup_shared_info();
@@ -994,7 +1008,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
994 .read_pmc = native_read_pmc, 1008 .read_pmc = native_read_pmc,
995 1009
996 .iret = xen_iret, 1010 .iret = xen_iret,
997 .irq_enable_syscall_ret = NULL, /* never called */ 1011 .irq_enable_syscall_ret = xen_sysexit,
998 1012
999 .load_tr_desc = paravirt_nop, 1013 .load_tr_desc = paravirt_nop,
1000 .set_ldt = xen_set_ldt, 1014 .set_ldt = xen_set_ldt,
@@ -1059,11 +1073,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1059 .pte_update = paravirt_nop, 1073 .pte_update = paravirt_nop,
1060 .pte_update_defer = paravirt_nop, 1074 .pte_update_defer = paravirt_nop,
1061 1075
1062 .alloc_pt = xen_alloc_pt_init, 1076 .alloc_pte = xen_alloc_pte_init,
1063 .release_pt = xen_release_pt_init, 1077 .release_pte = xen_release_pte_init,
1064 .alloc_pd = xen_alloc_pt_init, 1078 .alloc_pmd = xen_alloc_pte_init,
1065 .alloc_pd_clone = paravirt_nop, 1079 .alloc_pmd_clone = paravirt_nop,
1066 .release_pd = xen_release_pt_init, 1080 .release_pmd = xen_release_pte_init,
1067 1081
1068#ifdef CONFIG_HIGHPTE 1082#ifdef CONFIG_HIGHPTE
1069 .kmap_atomic_pte = xen_kmap_atomic_pte, 1083 .kmap_atomic_pte = xen_kmap_atomic_pte,
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
deleted file mode 100644
index dcf613e17581..000000000000
--- a/arch/x86/xen/events.c
+++ /dev/null
@@ -1,591 +0,0 @@
1/*
2 * Xen event channels
3 *
4 * Xen models interrupts with abstract event channels. Because each
5 * domain gets 1024 event channels, but NR_IRQ is not that large, we
6 * must dynamically map irqs<->event channels. The event channels
7 * interface with the rest of the kernel by defining a xen interrupt
8 * chip. When an event is recieved, it is mapped to an irq and sent
9 * through the normal interrupt processing path.
10 *
11 * There are four kinds of events which can be mapped to an event
12 * channel:
13 *
14 * 1. Inter-domain notifications. This includes all the virtual
15 * device events, since they're driven by front-ends in another domain
16 * (typically dom0).
17 * 2. VIRQs, typically used for timers. These are per-cpu events.
18 * 3. IPIs.
19 * 4. Hardware interrupts. Not supported at present.
20 *
21 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
22 */
23
24#include <linux/linkage.h>
25#include <linux/interrupt.h>
26#include <linux/irq.h>
27#include <linux/module.h>
28#include <linux/string.h>
29
30#include <asm/ptrace.h>
31#include <asm/irq.h>
32#include <asm/sync_bitops.h>
33#include <asm/xen/hypercall.h>
34#include <asm/xen/hypervisor.h>
35
36#include <xen/events.h>
37#include <xen/interface/xen.h>
38#include <xen/interface/event_channel.h>
39
40#include "xen-ops.h"
41
42/*
43 * This lock protects updates to the following mapping and reference-count
44 * arrays. The lock does not need to be acquired to read the mapping tables.
45 */
46static DEFINE_SPINLOCK(irq_mapping_update_lock);
47
48/* IRQ <-> VIRQ mapping. */
49static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
50
51/* IRQ <-> IPI mapping */
52static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
53
54/* Packed IRQ information: binding type, sub-type index, and event channel. */
55struct packed_irq
56{
57 unsigned short evtchn;
58 unsigned char index;
59 unsigned char type;
60};
61
62static struct packed_irq irq_info[NR_IRQS];
63
64/* Binding types. */
65enum {
66 IRQT_UNBOUND,
67 IRQT_PIRQ,
68 IRQT_VIRQ,
69 IRQT_IPI,
70 IRQT_EVTCHN
71};
72
73/* Convenient shorthand for packed representation of an unbound IRQ. */
74#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
75
76static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
77 [0 ... NR_EVENT_CHANNELS-1] = -1
78};
79static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
80static u8 cpu_evtchn[NR_EVENT_CHANNELS];
81
82/* Reference counts for bindings to IRQs. */
83static int irq_bindcount[NR_IRQS];
84
85/* Xen will never allocate port zero for any purpose. */
86#define VALID_EVTCHN(chn) ((chn) != 0)
87
88/*
89 * Force a proper event-channel callback from Xen after clearing the
90 * callback mask. We do this in a very simple manner, by making a call
91 * down into Xen. The pending flag will be checked by Xen on return.
92 */
93void force_evtchn_callback(void)
94{
95 (void)HYPERVISOR_xen_version(0, NULL);
96}
97EXPORT_SYMBOL_GPL(force_evtchn_callback);
98
99static struct irq_chip xen_dynamic_chip;
100
101/* Constructor for packed IRQ information. */
102static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
103{
104 return (struct packed_irq) { evtchn, index, type };
105}
106
107/*
108 * Accessors for packed IRQ information.
109 */
110static inline unsigned int evtchn_from_irq(int irq)
111{
112 return irq_info[irq].evtchn;
113}
114
115static inline unsigned int index_from_irq(int irq)
116{
117 return irq_info[irq].index;
118}
119
120static inline unsigned int type_from_irq(int irq)
121{
122 return irq_info[irq].type;
123}
124
125static inline unsigned long active_evtchns(unsigned int cpu,
126 struct shared_info *sh,
127 unsigned int idx)
128{
129 return (sh->evtchn_pending[idx] &
130 cpu_evtchn_mask[cpu][idx] &
131 ~sh->evtchn_mask[idx]);
132}
133
134static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
135{
136 int irq = evtchn_to_irq[chn];
137
138 BUG_ON(irq == -1);
139#ifdef CONFIG_SMP
140 irq_desc[irq].affinity = cpumask_of_cpu(cpu);
141#endif
142
143 __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
144 __set_bit(chn, cpu_evtchn_mask[cpu]);
145
146 cpu_evtchn[chn] = cpu;
147}
148
149static void init_evtchn_cpu_bindings(void)
150{
151#ifdef CONFIG_SMP
152 int i;
153 /* By default all event channels notify CPU#0. */
154 for (i = 0; i < NR_IRQS; i++)
155 irq_desc[i].affinity = cpumask_of_cpu(0);
156#endif
157
158 memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
159 memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
160}
161
162static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
163{
164 return cpu_evtchn[evtchn];
165}
166
167static inline void clear_evtchn(int port)
168{
169 struct shared_info *s = HYPERVISOR_shared_info;
170 sync_clear_bit(port, &s->evtchn_pending[0]);
171}
172
173static inline void set_evtchn(int port)
174{
175 struct shared_info *s = HYPERVISOR_shared_info;
176 sync_set_bit(port, &s->evtchn_pending[0]);
177}
178
179
180/**
181 * notify_remote_via_irq - send event to remote end of event channel via irq
182 * @irq: irq of event channel to send event to
183 *
184 * Unlike notify_remote_via_evtchn(), this is safe to use across
185 * save/restore. Notifications on a broken connection are silently
186 * dropped.
187 */
188void notify_remote_via_irq(int irq)
189{
190 int evtchn = evtchn_from_irq(irq);
191
192 if (VALID_EVTCHN(evtchn))
193 notify_remote_via_evtchn(evtchn);
194}
195EXPORT_SYMBOL_GPL(notify_remote_via_irq);
196
197static void mask_evtchn(int port)
198{
199 struct shared_info *s = HYPERVISOR_shared_info;
200 sync_set_bit(port, &s->evtchn_mask[0]);
201}
202
203static void unmask_evtchn(int port)
204{
205 struct shared_info *s = HYPERVISOR_shared_info;
206 unsigned int cpu = get_cpu();
207
208 BUG_ON(!irqs_disabled());
209
210 /* Slow path (hypercall) if this is a non-local port. */
211 if (unlikely(cpu != cpu_from_evtchn(port))) {
212 struct evtchn_unmask unmask = { .port = port };
213 (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
214 } else {
215 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
216
217 sync_clear_bit(port, &s->evtchn_mask[0]);
218
219 /*
220 * The following is basically the equivalent of
221 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
222 * the interrupt edge' if the channel is masked.
223 */
224 if (sync_test_bit(port, &s->evtchn_pending[0]) &&
225 !sync_test_and_set_bit(port / BITS_PER_LONG,
226 &vcpu_info->evtchn_pending_sel))
227 vcpu_info->evtchn_upcall_pending = 1;
228 }
229
230 put_cpu();
231}
232
233static int find_unbound_irq(void)
234{
235 int irq;
236
237 /* Only allocate from dynirq range */
238 for (irq = 0; irq < NR_IRQS; irq++)
239 if (irq_bindcount[irq] == 0)
240 break;
241
242 if (irq == NR_IRQS)
243 panic("No available IRQ to bind to: increase NR_IRQS!\n");
244
245 return irq;
246}
247
248int bind_evtchn_to_irq(unsigned int evtchn)
249{
250 int irq;
251
252 spin_lock(&irq_mapping_update_lock);
253
254 irq = evtchn_to_irq[evtchn];
255
256 if (irq == -1) {
257 irq = find_unbound_irq();
258
259 dynamic_irq_init(irq);
260 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
261 handle_level_irq, "event");
262
263 evtchn_to_irq[evtchn] = irq;
264 irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
265 }
266
267 irq_bindcount[irq]++;
268
269 spin_unlock(&irq_mapping_update_lock);
270
271 return irq;
272}
273EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
274
275static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
276{
277 struct evtchn_bind_ipi bind_ipi;
278 int evtchn, irq;
279
280 spin_lock(&irq_mapping_update_lock);
281
282 irq = per_cpu(ipi_to_irq, cpu)[ipi];
283 if (irq == -1) {
284 irq = find_unbound_irq();
285 if (irq < 0)
286 goto out;
287
288 dynamic_irq_init(irq);
289 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
290 handle_level_irq, "ipi");
291
292 bind_ipi.vcpu = cpu;
293 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
294 &bind_ipi) != 0)
295 BUG();
296 evtchn = bind_ipi.port;
297
298 evtchn_to_irq[evtchn] = irq;
299 irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
300
301 per_cpu(ipi_to_irq, cpu)[ipi] = irq;
302
303 bind_evtchn_to_cpu(evtchn, cpu);
304 }
305
306 irq_bindcount[irq]++;
307
308 out:
309 spin_unlock(&irq_mapping_update_lock);
310 return irq;
311}
312
313
314static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
315{
316 struct evtchn_bind_virq bind_virq;
317 int evtchn, irq;
318
319 spin_lock(&irq_mapping_update_lock);
320
321 irq = per_cpu(virq_to_irq, cpu)[virq];
322
323 if (irq == -1) {
324 bind_virq.virq = virq;
325 bind_virq.vcpu = cpu;
326 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
327 &bind_virq) != 0)
328 BUG();
329 evtchn = bind_virq.port;
330
331 irq = find_unbound_irq();
332
333 dynamic_irq_init(irq);
334 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
335 handle_level_irq, "virq");
336
337 evtchn_to_irq[evtchn] = irq;
338 irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
339
340 per_cpu(virq_to_irq, cpu)[virq] = irq;
341
342 bind_evtchn_to_cpu(evtchn, cpu);
343 }
344
345 irq_bindcount[irq]++;
346
347 spin_unlock(&irq_mapping_update_lock);
348
349 return irq;
350}
351
352static void unbind_from_irq(unsigned int irq)
353{
354 struct evtchn_close close;
355 int evtchn = evtchn_from_irq(irq);
356
357 spin_lock(&irq_mapping_update_lock);
358
359 if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
360 close.port = evtchn;
361 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
362 BUG();
363
364 switch (type_from_irq(irq)) {
365 case IRQT_VIRQ:
366 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
367 [index_from_irq(irq)] = -1;
368 break;
369 default:
370 break;
371 }
372
373 /* Closed ports are implicitly re-bound to VCPU0. */
374 bind_evtchn_to_cpu(evtchn, 0);
375
376 evtchn_to_irq[evtchn] = -1;
377 irq_info[irq] = IRQ_UNBOUND;
378
379 dynamic_irq_init(irq);
380 }
381
382 spin_unlock(&irq_mapping_update_lock);
383}
384
385int bind_evtchn_to_irqhandler(unsigned int evtchn,
386 irq_handler_t handler,
387 unsigned long irqflags,
388 const char *devname, void *dev_id)
389{
390 unsigned int irq;
391 int retval;
392
393 irq = bind_evtchn_to_irq(evtchn);
394 retval = request_irq(irq, handler, irqflags, devname, dev_id);
395 if (retval != 0) {
396 unbind_from_irq(irq);
397 return retval;
398 }
399
400 return irq;
401}
402EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
403
404int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
405 irq_handler_t handler,
406 unsigned long irqflags, const char *devname, void *dev_id)
407{
408 unsigned int irq;
409 int retval;
410
411 irq = bind_virq_to_irq(virq, cpu);
412 retval = request_irq(irq, handler, irqflags, devname, dev_id);
413 if (retval != 0) {
414 unbind_from_irq(irq);
415 return retval;
416 }
417
418 return irq;
419}
420EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
421
422int bind_ipi_to_irqhandler(enum ipi_vector ipi,
423 unsigned int cpu,
424 irq_handler_t handler,
425 unsigned long irqflags,
426 const char *devname,
427 void *dev_id)
428{
429 int irq, retval;
430
431 irq = bind_ipi_to_irq(ipi, cpu);
432 if (irq < 0)
433 return irq;
434
435 retval = request_irq(irq, handler, irqflags, devname, dev_id);
436 if (retval != 0) {
437 unbind_from_irq(irq);
438 return retval;
439 }
440
441 return irq;
442}
443
444void unbind_from_irqhandler(unsigned int irq, void *dev_id)
445{
446 free_irq(irq, dev_id);
447 unbind_from_irq(irq);
448}
449EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
450
451void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
452{
453 int irq = per_cpu(ipi_to_irq, cpu)[vector];
454 BUG_ON(irq < 0);
455 notify_remote_via_irq(irq);
456}
457
458
459/*
460 * Search the CPUs pending events bitmasks. For each one found, map
461 * the event number to an irq, and feed it into do_IRQ() for
462 * handling.
463 *
464 * Xen uses a two-level bitmap to speed searching. The first level is
465 * a bitset of words which contain pending event bits. The second
466 * level is a bitset of pending events themselves.
467 */
468void xen_evtchn_do_upcall(struct pt_regs *regs)
469{
470 int cpu = get_cpu();
471 struct shared_info *s = HYPERVISOR_shared_info;
472 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
473 unsigned long pending_words;
474
475 vcpu_info->evtchn_upcall_pending = 0;
476
477 /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
478 pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
479 while (pending_words != 0) {
480 unsigned long pending_bits;
481 int word_idx = __ffs(pending_words);
482 pending_words &= ~(1UL << word_idx);
483
484 while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
485 int bit_idx = __ffs(pending_bits);
486 int port = (word_idx * BITS_PER_LONG) + bit_idx;
487 int irq = evtchn_to_irq[port];
488
489 if (irq != -1) {
490 regs->orig_ax = ~irq;
491 do_IRQ(regs);
492 }
493 }
494 }
495
496 put_cpu();
497}
498
499/* Rebind an evtchn so that it gets delivered to a specific cpu */
500static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
501{
502 struct evtchn_bind_vcpu bind_vcpu;
503 int evtchn = evtchn_from_irq(irq);
504
505 if (!VALID_EVTCHN(evtchn))
506 return;
507
508 /* Send future instances of this interrupt to other vcpu. */
509 bind_vcpu.port = evtchn;
510 bind_vcpu.vcpu = tcpu;
511
512 /*
513 * If this fails, it usually just indicates that we're dealing with a
514 * virq or IPI channel, which don't actually need to be rebound. Ignore
515 * it, but don't do the xenlinux-level rebind in that case.
516 */
517 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
518 bind_evtchn_to_cpu(evtchn, tcpu);
519}
520
521
522static void set_affinity_irq(unsigned irq, cpumask_t dest)
523{
524 unsigned tcpu = first_cpu(dest);
525 rebind_irq_to_cpu(irq, tcpu);
526}
527
528static void enable_dynirq(unsigned int irq)
529{
530 int evtchn = evtchn_from_irq(irq);
531
532 if (VALID_EVTCHN(evtchn))
533 unmask_evtchn(evtchn);
534}
535
536static void disable_dynirq(unsigned int irq)
537{
538 int evtchn = evtchn_from_irq(irq);
539
540 if (VALID_EVTCHN(evtchn))
541 mask_evtchn(evtchn);
542}
543
544static void ack_dynirq(unsigned int irq)
545{
546 int evtchn = evtchn_from_irq(irq);
547
548 move_native_irq(irq);
549
550 if (VALID_EVTCHN(evtchn))
551 clear_evtchn(evtchn);
552}
553
554static int retrigger_dynirq(unsigned int irq)
555{
556 int evtchn = evtchn_from_irq(irq);
557 int ret = 0;
558
559 if (VALID_EVTCHN(evtchn)) {
560 set_evtchn(evtchn);
561 ret = 1;
562 }
563
564 return ret;
565}
566
567static struct irq_chip xen_dynamic_chip __read_mostly = {
568 .name = "xen-dyn",
569 .mask = disable_dynirq,
570 .unmask = enable_dynirq,
571 .ack = ack_dynirq,
572 .set_affinity = set_affinity_irq,
573 .retrigger = retrigger_dynirq,
574};
575
576void __init xen_init_IRQ(void)
577{
578 int i;
579
580 init_evtchn_cpu_bindings();
581
582 /* No event channels are 'live' right now. */
583 for (i = 0; i < NR_EVENT_CHANNELS; i++)
584 mask_evtchn(i);
585
586 /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
587 for (i = 0; i < NR_IRQS; i++)
588 irq_bindcount[i] = 0;
589
590 irq_ctx_init(smp_processor_id());
591}
diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c
deleted file mode 100644
index 0707714e40d6..000000000000
--- a/arch/x86/xen/features.c
+++ /dev/null
@@ -1,29 +0,0 @@
1/******************************************************************************
2 * features.c
3 *
4 * Xen feature flags.
5 *
6 * Copyright (c) 2006, Ian Campbell, XenSource Inc.
7 */
8#include <linux/types.h>
9#include <linux/cache.h>
10#include <linux/module.h>
11#include <asm/xen/hypervisor.h>
12#include <xen/features.h>
13
14u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
15EXPORT_SYMBOL_GPL(xen_features);
16
17void xen_setup_features(void)
18{
19 struct xen_feature_info fi;
20 int i, j;
21
22 for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
23 fi.submap_idx = i;
24 if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
25 break;
26 for (j = 0; j < 32; j++)
27 xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
28 }
29}
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
new file mode 100644
index 000000000000..49ba9b5224d1
--- /dev/null
+++ b/arch/x86/xen/grant-table.c
@@ -0,0 +1,91 @@
1/******************************************************************************
2 * grant_table.c
3 * x86 specific part
4 *
5 * Granting foreign access to our memory reservation.
6 *
7 * Copyright (c) 2005-2006, Christopher Clark
8 * Copyright (c) 2004-2005, K A Fraser
9 * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
10 * VA Linux Systems Japan. Split out x86 specific part.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License version 2
14 * as published by the Free Software Foundation; or, when distributed
15 * separately from the Linux kernel or incorporated into other
16 * software packages, subject to the following license:
17 *
18 * Permission is hereby granted, free of charge, to any person obtaining a copy
19 * of this source file (the "Software"), to deal in the Software without
20 * restriction, including without limitation the rights to use, copy, modify,
21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
22 * and to permit persons to whom the Software is furnished to do so, subject to
23 * the following conditions:
24 *
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
27 *
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
34 * IN THE SOFTWARE.
35 */
36
37#include <linux/sched.h>
38#include <linux/mm.h>
39#include <linux/vmalloc.h>
40
41#include <xen/interface/xen.h>
42#include <xen/page.h>
43#include <xen/grant_table.h>
44
45#include <asm/pgtable.h>
46
47static int map_pte_fn(pte_t *pte, struct page *pmd_page,
48 unsigned long addr, void *data)
49{
50 unsigned long **frames = (unsigned long **)data;
51
52 set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
53 (*frames)++;
54 return 0;
55}
56
57static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
58 unsigned long addr, void *data)
59{
60
61 set_pte_at(&init_mm, addr, pte, __pte(0));
62 return 0;
63}
64
65int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
66 unsigned long max_nr_gframes,
67 struct grant_entry **__shared)
68{
69 int rc;
70 struct grant_entry *shared = *__shared;
71
72 if (shared == NULL) {
73 struct vm_struct *area =
74 xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes);
75 BUG_ON(area == NULL);
76 shared = area->addr;
77 *__shared = shared;
78 }
79
80 rc = apply_to_page_range(&init_mm, (unsigned long)shared,
81 PAGE_SIZE * nr_gframes,
82 map_pte_fn, &frames);
83 return rc;
84}
85
86void arch_gnttab_unmap_shared(struct grant_entry *shared,
87 unsigned long nr_gframes)
88{
89 apply_to_page_range(&init_mm, (unsigned long)shared,
90 PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
91}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 2a054ef2a3da..6cbcf65609ad 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -156,6 +156,10 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
156void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 156void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
157 pte_t *ptep, pte_t pteval) 157 pte_t *ptep, pte_t pteval)
158{ 158{
159 /* updates to init_mm may be done without lock */
160 if (mm == &init_mm)
161 preempt_disable();
162
159 if (mm == current->mm || mm == &init_mm) { 163 if (mm == current->mm || mm == &init_mm) {
160 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 164 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
161 struct multicall_space mcs; 165 struct multicall_space mcs;
@@ -163,14 +167,61 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
163 167
164 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 168 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
165 xen_mc_issue(PARAVIRT_LAZY_MMU); 169 xen_mc_issue(PARAVIRT_LAZY_MMU);
166 return; 170 goto out;
167 } else 171 } else
168 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) 172 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
169 return; 173 goto out;
170 } 174 }
171 xen_set_pte(ptep, pteval); 175 xen_set_pte(ptep, pteval);
176
177out:
178 if (mm == &init_mm)
179 preempt_enable();
180}
181
182pteval_t xen_pte_val(pte_t pte)
183{
184 pteval_t ret = pte.pte;
185
186 if (ret & _PAGE_PRESENT)
187 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
188
189 return ret;
190}
191
192pgdval_t xen_pgd_val(pgd_t pgd)
193{
194 pgdval_t ret = pgd.pgd;
195 if (ret & _PAGE_PRESENT)
196 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
197 return ret;
198}
199
200pte_t xen_make_pte(pteval_t pte)
201{
202 if (pte & _PAGE_PRESENT) {
203 pte = phys_to_machine(XPADDR(pte)).maddr;
204 pte &= ~(_PAGE_PCD | _PAGE_PWT);
205 }
206
207 return (pte_t){ .pte = pte };
172} 208}
173 209
210pgd_t xen_make_pgd(pgdval_t pgd)
211{
212 if (pgd & _PAGE_PRESENT)
213 pgd = phys_to_machine(XPADDR(pgd)).maddr;
214
215 return (pgd_t){ pgd };
216}
217
218pmdval_t xen_pmd_val(pmd_t pmd)
219{
220 pmdval_t ret = native_pmd_val(pmd);
221 if (ret & _PAGE_PRESENT)
222 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
223 return ret;
224}
174#ifdef CONFIG_X86_PAE 225#ifdef CONFIG_X86_PAE
175void xen_set_pud(pud_t *ptr, pud_t val) 226void xen_set_pud(pud_t *ptr, pud_t val)
176{ 227{
@@ -214,100 +265,18 @@ void xen_pmd_clear(pmd_t *pmdp)
214 xen_set_pmd(pmdp, __pmd(0)); 265 xen_set_pmd(pmdp, __pmd(0));
215} 266}
216 267
217unsigned long long xen_pte_val(pte_t pte) 268pmd_t xen_make_pmd(pmdval_t pmd)
218{ 269{
219 unsigned long long ret = 0; 270 if (pmd & _PAGE_PRESENT)
220
221 if (pte.pte_low) {
222 ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
223 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
224 }
225
226 return ret;
227}
228
229unsigned long long xen_pmd_val(pmd_t pmd)
230{
231 unsigned long long ret = pmd.pmd;
232 if (ret)
233 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
234 return ret;
235}
236
237unsigned long long xen_pgd_val(pgd_t pgd)
238{
239 unsigned long long ret = pgd.pgd;
240 if (ret)
241 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
242 return ret;
243}
244
245pte_t xen_make_pte(unsigned long long pte)
246{
247 if (pte & _PAGE_PRESENT) {
248 pte = phys_to_machine(XPADDR(pte)).maddr;
249 pte &= ~(_PAGE_PCD | _PAGE_PWT);
250 }
251
252 return (pte_t){ .pte = pte };
253}
254
255pmd_t xen_make_pmd(unsigned long long pmd)
256{
257 if (pmd & 1)
258 pmd = phys_to_machine(XPADDR(pmd)).maddr; 271 pmd = phys_to_machine(XPADDR(pmd)).maddr;
259 272
260 return (pmd_t){ pmd }; 273 return native_make_pmd(pmd);
261}
262
263pgd_t xen_make_pgd(unsigned long long pgd)
264{
265 if (pgd & _PAGE_PRESENT)
266 pgd = phys_to_machine(XPADDR(pgd)).maddr;
267
268 return (pgd_t){ pgd };
269} 274}
270#else /* !PAE */ 275#else /* !PAE */
271void xen_set_pte(pte_t *ptep, pte_t pte) 276void xen_set_pte(pte_t *ptep, pte_t pte)
272{ 277{
273 *ptep = pte; 278 *ptep = pte;
274} 279}
275
276unsigned long xen_pte_val(pte_t pte)
277{
278 unsigned long ret = pte.pte_low;
279
280 if (ret & _PAGE_PRESENT)
281 ret = machine_to_phys(XMADDR(ret)).paddr;
282
283 return ret;
284}
285
286unsigned long xen_pgd_val(pgd_t pgd)
287{
288 unsigned long ret = pgd.pgd;
289 if (ret)
290 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
291 return ret;
292}
293
294pte_t xen_make_pte(unsigned long pte)
295{
296 if (pte & _PAGE_PRESENT) {
297 pte = phys_to_machine(XPADDR(pte)).maddr;
298 pte &= ~(_PAGE_PCD | _PAGE_PWT);
299 }
300
301 return (pte_t){ pte };
302}
303
304pgd_t xen_make_pgd(unsigned long pgd)
305{
306 if (pgd & _PAGE_PRESENT)
307 pgd = phys_to_machine(XPADDR(pgd)).maddr;
308
309 return (pgd_t){ pgd };
310}
311#endif /* CONFIG_X86_PAE */ 280#endif /* CONFIG_X86_PAE */
312 281
313/* 282/*
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 2341492bf7a0..82517e4a752a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -16,6 +16,7 @@
16#include <asm/xen/hypervisor.h> 16#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h> 17#include <asm/xen/hypercall.h>
18 18
19#include <xen/interface/callback.h>
19#include <xen/interface/physdev.h> 20#include <xen/interface/physdev.h>
20#include <xen/features.h> 21#include <xen/features.h>
21 22
@@ -68,6 +69,24 @@ static void __init fiddle_vdso(void)
68 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 69 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
69} 70}
70 71
72void xen_enable_sysenter(void)
73{
74 int cpu = smp_processor_id();
75 extern void xen_sysenter_target(void);
76 /* Mask events on entry, even though they get enabled immediately */
77 static struct callback_register sysenter = {
78 .type = CALLBACKTYPE_sysenter,
79 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
80 .flags = CALLBACKF_mask_events,
81 };
82
83 if (!boot_cpu_has(X86_FEATURE_SEP) ||
84 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
85 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
86 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
87 }
88}
89
71void __init xen_arch_setup(void) 90void __init xen_arch_setup(void)
72{ 91{
73 struct physdev_set_iopl set_iopl; 92 struct physdev_set_iopl set_iopl;
@@ -82,6 +101,8 @@ void __init xen_arch_setup(void)
82 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 101 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
83 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 102 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
84 103
104 xen_enable_sysenter();
105
85 set_iopl.iopl = 1; 106 set_iopl.iopl = 1;
86 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 107 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
87 if (rc != 0) 108 if (rc != 0)
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index e340ff92f6b6..92dd3dbf3ffb 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -36,8 +36,9 @@
36#include "mmu.h" 36#include "mmu.h"
37 37
38static cpumask_t xen_cpu_initialized_map; 38static cpumask_t xen_cpu_initialized_map;
39static DEFINE_PER_CPU(int, resched_irq); 39static DEFINE_PER_CPU(int, resched_irq) = -1;
40static DEFINE_PER_CPU(int, callfunc_irq); 40static DEFINE_PER_CPU(int, callfunc_irq) = -1;
41static DEFINE_PER_CPU(int, debug_irq) = -1;
41 42
42/* 43/*
43 * Structure and data for smp_call_function(). This is designed to minimise 44 * Structure and data for smp_call_function(). This is designed to minimise
@@ -72,6 +73,7 @@ static __cpuinit void cpu_bringup_and_idle(void)
72 int cpu = smp_processor_id(); 73 int cpu = smp_processor_id();
73 74
74 cpu_init(); 75 cpu_init();
76 xen_enable_sysenter();
75 77
76 preempt_disable(); 78 preempt_disable();
77 per_cpu(cpu_state, cpu) = CPU_ONLINE; 79 per_cpu(cpu_state, cpu) = CPU_ONLINE;
@@ -88,9 +90,7 @@ static __cpuinit void cpu_bringup_and_idle(void)
88static int xen_smp_intr_init(unsigned int cpu) 90static int xen_smp_intr_init(unsigned int cpu)
89{ 91{
90 int rc; 92 int rc;
91 const char *resched_name, *callfunc_name; 93 const char *resched_name, *callfunc_name, *debug_name;
92
93 per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
94 94
95 resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); 95 resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
96 rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, 96 rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -114,6 +114,14 @@ static int xen_smp_intr_init(unsigned int cpu)
114 goto fail; 114 goto fail;
115 per_cpu(callfunc_irq, cpu) = rc; 115 per_cpu(callfunc_irq, cpu) = rc;
116 116
117 debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
118 rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
119 IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING,
120 debug_name, NULL);
121 if (rc < 0)
122 goto fail;
123 per_cpu(debug_irq, cpu) = rc;
124
117 return 0; 125 return 0;
118 126
119 fail: 127 fail:
@@ -121,6 +129,8 @@ static int xen_smp_intr_init(unsigned int cpu)
121 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); 129 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
122 if (per_cpu(callfunc_irq, cpu) >= 0) 130 if (per_cpu(callfunc_irq, cpu) >= 0)
123 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 131 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
132 if (per_cpu(debug_irq, cpu) >= 0)
133 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
124 return rc; 134 return rc;
125} 135}
126 136
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index fe161ed4b01e..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -108,6 +108,20 @@ ENDPATCH(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1) 108 RELOC(xen_restore_fl_direct, 2b+1)
109 109
110/* 110/*
111 We can't use sysexit directly, because we're not running in ring0.
112 But we can easily fake it up using iret. Assuming xen_sysexit
113 is jumped to with a standard stack frame, we can just strip it
114 back to a standard iret frame and use iret.
115 */
116ENTRY(xen_sysexit)
117 movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
118 orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
119 lea PT_EIP(%esp), %esp
120
121 jmp xen_iret
122ENDPROC(xen_sysexit)
123
124/*
111 This is run where a normal iret would be run, with the same stack setup: 125 This is run where a normal iret would be run, with the same stack setup:
112 8: eflags 126 8: eflags
113 4: cs 127 4: cs
@@ -184,8 +198,12 @@ iret_restore_end:
184 region is OK. */ 198 region is OK. */
185 je xen_hypervisor_callback 199 je xen_hypervisor_callback
186 200
187 iret 2011: iret
188xen_iret_end_crit: 202xen_iret_end_crit:
203.section __ex_table,"a"
204 .align 4
205 .long 1b,iret_exc
206.previous
189 207
190hyper_iret: 208hyper_iret:
191 /* put this out of line since its very rarely used */ 209 /* put this out of line since its very rarely used */
@@ -219,9 +237,7 @@ hyper_iret:
219 ds } SAVE_ALL state 237 ds } SAVE_ALL state
220 eax } 238 eax }
221 : : 239 : :
222 ebx } 240 ebx }<- esp
223 ----------------
224 return addr <- esp
225 ---------------- 241 ----------------
226 242
227 In order to deliver the nested exception properly, we need to shift 243 In order to deliver the nested exception properly, we need to shift
@@ -236,10 +252,8 @@ hyper_iret:
236 it's usermode state which we eventually need to restore. 252 it's usermode state which we eventually need to restore.
237 */ 253 */
238ENTRY(xen_iret_crit_fixup) 254ENTRY(xen_iret_crit_fixup)
239 /* offsets +4 for return address */
240
241 /* 255 /*
242 Paranoia: Make sure we're really coming from userspace. 256 Paranoia: Make sure we're really coming from kernel space.
243 One could imagine a case where userspace jumps into the 257 One could imagine a case where userspace jumps into the
244 critical range address, but just before the CPU delivers a GP, 258 critical range address, but just before the CPU delivers a GP,
245 it decides to deliver an interrupt instead. Unlikely? 259 it decides to deliver an interrupt instead. Unlikely?
@@ -248,32 +262,32 @@ ENTRY(xen_iret_crit_fixup)
248 jump instruction itself, not the destination, but some virtual 262 jump instruction itself, not the destination, but some virtual
249 environments get this wrong. 263 environments get this wrong.
250 */ 264 */
251 movl PT_CS+4(%esp), %ecx 265 movl PT_CS(%esp), %ecx
252 andl $SEGMENT_RPL_MASK, %ecx 266 andl $SEGMENT_RPL_MASK, %ecx
253 cmpl $USER_RPL, %ecx 267 cmpl $USER_RPL, %ecx
254 je 2f 268 je 2f
255 269
256 lea PT_ORIG_EAX+4(%esp), %esi 270 lea PT_ORIG_EAX(%esp), %esi
257 lea PT_EFLAGS+4(%esp), %edi 271 lea PT_EFLAGS(%esp), %edi
258 272
259 /* If eip is before iret_restore_end then stack 273 /* If eip is before iret_restore_end then stack
260 hasn't been restored yet. */ 274 hasn't been restored yet. */
261 cmp $iret_restore_end, %eax 275 cmp $iret_restore_end, %eax
262 jae 1f 276 jae 1f
263 277
264 movl 0+4(%edi),%eax /* copy EAX */ 278 movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */
265 movl %eax, PT_EAX+4(%esp) 279 movl %eax, PT_EAX(%esp)
266 280
267 lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ 281 lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
268 282
269 /* set up the copy */ 283 /* set up the copy */
2701: std 2841: std
271 mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ 285 mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
272 rep movsl 286 rep movsl
273 cld 287 cld
274 288
275 lea 4(%edi),%esp /* point esp to new frame */ 289 lea 4(%edi),%esp /* point esp to new frame */
2762: ret 2902: jmp xen_do_upcall
277 291
278 292
279/* 293/*
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 956a491ea998..f1063ae08037 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,8 @@
2#define XEN_OPS_H 2#define XEN_OPS_H
3 3
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/irqreturn.h>
6#include <xen/xen-ops.h>
5 7
6/* These are code, but not functions. Defined in entry.S */ 8/* These are code, but not functions. Defined in entry.S */
7extern const char xen_hypervisor_callback[]; 9extern const char xen_hypervisor_callback[];
@@ -9,7 +11,6 @@ extern const char xen_failsafe_callback[];
9 11
10void xen_copy_trap_info(struct trap_info *traps); 12void xen_copy_trap_info(struct trap_info *traps);
11 13
12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
13DECLARE_PER_CPU(unsigned long, xen_cr3); 14DECLARE_PER_CPU(unsigned long, xen_cr3);
14DECLARE_PER_CPU(unsigned long, xen_current_cr3); 15DECLARE_PER_CPU(unsigned long, xen_current_cr3);
15 16
@@ -19,6 +20,7 @@ extern struct shared_info *HYPERVISOR_shared_info;
19char * __init xen_memory_setup(void); 20char * __init xen_memory_setup(void);
20void __init xen_arch_setup(void); 21void __init xen_arch_setup(void);
21void __init xen_init_IRQ(void); 22void __init xen_init_IRQ(void);
23void xen_enable_sysenter(void);
22 24
23void xen_setup_timer(int cpu); 25void xen_setup_timer(int cpu);
24void xen_setup_cpu_clockevents(void); 26void xen_setup_cpu_clockevents(void);
@@ -28,6 +30,8 @@ unsigned long xen_get_wallclock(void);
28int xen_set_wallclock(unsigned long time); 30int xen_set_wallclock(unsigned long time);
29unsigned long long xen_sched_clock(void); 31unsigned long long xen_sched_clock(void);
30 32
33irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
34
31bool xen_vcpu_stolen(int vcpu); 35bool xen_vcpu_stolen(int vcpu);
32 36
33void xen_mark_init_mm_pinned(void); 37void xen_mark_init_mm_pinned(void);
@@ -64,4 +68,6 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void);
64DECL_ASM(void, xen_restore_fl_direct, unsigned long); 68DECL_ASM(void, xen_restore_fl_direct, unsigned long);
65 69
66void xen_iret(void); 70void xen_iret(void);
71void xen_sysexit(void);
72
67#endif /* XEN_OPS_H */ 73#endif /* XEN_OPS_H */